In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Load ../data/5_yr_data/UP5_years.csv, add a parsed `datetime` column built
# from `date`, discard the "Unnamed: 0" column and order rows oldest-first.
df_up = (
    pd.read_csv("../data/5_yr_data/UP5_years.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame['date']))
    .drop(columns=["Unnamed: 0"])
    .sort_values(by="datetime", ascending=True)
)
df_up.head()

In [3]:
# Positional 80/20 split of df_up: the first TRAIN_LEN rows become the training
# set and the remainder the test set. Both halves are re-indexed by `datetime`.
TRAIN_LEN = int(0.8 * len(df_up))

# Build each split as a new frame instead of calling set_index/sort_index with
# inplace=True on a slice of df_up: in-place edits on a slice can trigger
# SettingWithCopyWarning and make it unclear whether df_up itself is touched.
up_train = df_up.iloc[:TRAIN_LEN].set_index('datetime').sort_index()
up_test = df_up.iloc[TRAIN_LEN:].set_index('datetime').sort_index()

# Sanity check: the two splits partition df_up with no rows lost or duplicated.
assert len(up_train) + len(up_test) == len(df_up)

In [4]:
# Display the training split (the first TRAIN_LEN rows of df_up, indexed by datetime).
up_train

In [17]:
# NOTE(review): df_up_dt is not assigned in any cell above this one, so on a
# fresh kernel (Restart & Run All) this cell presumably raises NameError.
# Confirm where it is supposed to be defined, or remove this cell.
df_up_dt

In [5]:
# NOTE(review): ARIMA is not used in the cells visible here. statsmodels
# deprecated the tsa.arima_model module (0.12) in favour of
# statsmodels.tsa.arima.model.ARIMA, whose API differs; confirm which one the
# installed statsmodels version supports before relying on this import.
from statsmodels.tsa.arima_model import ARIMA

In [19]:
from statsmodels.tsa.stattools import kpss

# KPSS test (null hypothesis: the series is stationary) on the per-timestamp
# mean modal price of every commodity, using the training split only.
commodities = df_up['commodity'].unique()
for commodity in commodities:
    # Keep one commodity, then collapse rows sharing a timestamp to their mean.
    is_commodity = up_train['commodity'] == commodity
    up_train_commodity = up_train[is_commodity]
    up_train_commodity_dt = up_train_commodity.groupby("datetime").agg({"modal_rs_quintal": "mean"})

    kpss_stat, p_value, used_lag, critical_values = kpss(up_train_commodity_dt['modal_rs_quintal'])

    print(f"KPSS results for {commodity}")
    print("KPSS Statistic:", kpss_stat)
    print("p-value:", p_value)
    print("Number of lags used:", used_lag)
    # All critical values go on a single tab-separated line after the label.
    critical_line = "".join(f"\t{key}: {value}\t" for key, value in critical_values.items())
    print("Critical values:", critical_line, end="")
    print("\n---------------------------------------------")

 #### Based on the above results, we can see that
 - #### the p-value is well below 0.05
 - #### the KPSS statistic is also much higher than the critical values
 #### So we reject the KPSS null hypothesis of stationarity: the data is not stationary

In [7]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test (null hypothesis: the series has a unit root)
# for every commodity, using the training split only.
commodities = df_up['commodity'].unique()
for commodity in commodities:
    # Collapse rows that share a timestamp to their mean before testing, the
    # same aggregation the KPSS cell applies. Feeding the raw rows would treat
    # several observations from one timestamp as consecutive time steps and
    # make the ADF and KPSS results non-comparable.
    commodity_prices = (
        up_train.loc[up_train['commodity'] == commodity]
        .groupby("datetime")["modal_rs_quintal"]
        .mean()
    )

    # With the default autolag, adfuller returns a 6-tuple whose last element
    # is the best information criterion value (icbest), not a results store.
    adf_stat, p_value, used_lag, nobs, critical_values, icbest = adfuller(commodity_prices, maxlag=10)

    print("ADF results for "+ commodity)
    print("ADF Statistic:", adf_stat)
    print("p-value:", p_value)
    print("Number of lags used:", used_lag)
    print("Number of observations used:", nobs)
    print("Critical values:", end=" ")
    for key, value in critical_values.items():
        print(f"\t{key}: {value}", end="\t", )
    print("\n---------------------------------------------")

 #### Based on the above results, we can see that
 - #### the p-value is well below 0.05
 - #### the ADF statistic is also much lower (more negative) than the critical values
 #### So we reject the ADF null hypothesis of a unit root: the data is stationary

In [8]:
# Synthetic reference signal: 1000 evenly spaced samples of sin(x) on
# [0, 10*pi], i.e. five full periods. `y` is reused by the ADF cell below.
x = np.linspace(0, 10 * np.pi, 1000)
y = np.sin(x)

# Draw the curve on a wide 12x4 canvas with a title, axis labels and a grid.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(x, y)
ax.set_title('Sine Curve')
ax.set_xlabel('x values (radians)')
ax.set_ylabel('sin(x)')
ax.grid(True)

plt.show()

In [9]:
# Run the ADF test on the synthetic sine signal `y` and report every field of
# the returned 6-tuple except the last one.
res = adfuller(y)
adf_stat, p_value, used_lag, nobs, critical_values, store = res

# Label/value pairs are printed in the order they appear in the result tuple.
report = (
    ("ADF Statistic:", adf_stat),
    ("p-value:", p_value),
    ("Number of lags used:", used_lag),
    ("Number of observations used:", nobs),
)
for label, number in report:
    print(label, number)

# One tab-indented line per critical value.
print("Critical values:")
for level, threshold in critical_values.items():
    print(f"\t{level}: {threshold}")

In [15]:
from scipy.stats import boxcox

# Box-Cox transform with lambda fixed at 0 (the natural-log case) of each
# commodity's modal price over the full data set, printed and plotted per
# commodity. boxcox requires strictly positive input and raises ValueError
# otherwise.
for commodity in commodities:
    # Filter once and reuse the subset for both the values and the index.
    commodity_rows = df_up[df_up['commodity'] == commodity]
    boxcox_commodity_up = pd.Series(
        boxcox(commodity_rows['modal_rs_quintal'], lmbda=0),
        index=commodity_rows['datetime'],
    )
    print(boxcox_commodity_up ,commodity)

    # Label every figure so the per-commodity plots can be told apart, and
    # close it after showing so figures do not accumulate across the loop.
    fig, ax = plt.subplots()
    ax.plot(boxcox_commodity_up)
    ax.set_title(f"Box-Cox (lambda=0) of modal_rs_quintal: {commodity}")
    ax.set_xlabel("datetime")
    ax.set_ylabel("log(modal_rs_quintal)")
    plt.show()
    plt.close(fig)