In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from warnings import simplefilter
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from pandas_profiling import ProfileReport
from scipy import stats

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/matplotlib/seaborn.
simplefilter('ignore')
# IPython settings: disable the Jedi completer and render inline figures at retina resolution.
%config Completer.use_jedi = False
%config InlineBackend.figure_format = 'retina'

In [110]:
#!pip install -U pandas-profiling[notebook]

In [2]:
# Load the training data; the 'date' column is parsed into datetimes on read.
df= pd.read_csv('../input/store-sales-time-series-forecasting/train.csv', parse_dates=['date'])

In [3]:
# Copy of the raw frame indexed by date.
df_= df.set_index('date')
df_

In [4]:
# Convert the datetime index into daily periods.
df_= df_.to_period('D')

In [5]:
# Inspect a single index entry to check the conversion.
df_.index[1]

In [114]:
#profile = ProfileReport(df_, title="Pandas Profiling Report")

In [115]:
#profile.to_widgets()

In [6]:
# Total sales per date (summed over every row sharing that date), rounded to 2 decimals.
# 'date' stays an ordinary column at this point: later cells pick columns by position
# and move 'date' into the index themselves. A `df_days.set_index('date')` call whose
# result was thrown away used to sit here; it changed nothing and has been removed.
df_days= df.groupby('date', as_index= False)['sales'].sum().round(2)
df_days.head()

In [7]:
df_days['time']= range(0, len(df_days.index))
df_days.head()

In [8]:
x= df_days.loc[:,['time']]
y= df_days.loc[:, 'sales']

regr= LinearRegression()
regr.fit(x,y)

preds= regr.predict(x)

In [9]:
# Store the fitted values (rounded to 2 decimals) next to the observations.
df_days['predicted']= preds.round(2)

In [10]:
df_days

In [11]:
# Coefficient on `time`: change in fitted sales per one-row step.
regr.coef_

In [12]:
# Fitted value at time == 0.
regr.intercept_

In [13]:
# Score of the fit, evaluated on the same data it was trained on.
regr.score(x,y)

In [14]:
# In-sample mean squared error of the fitted values.
mean_squared_error(y, preds)

In [15]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# dropped the old names, so use whichever spelling this installation provides
# (falling back to the default style if neither exists).
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
     if name in plt.style.available),
    "default",
)
plt.style.use(whitegrid_style)
# Figure-wide defaults used by every plot below.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)

# Daily sales as a grey line with the fitted linear trend drawn on the same axes.
fig, ax = plt.subplots()
ax.plot('time', 'sales', data=df_days, color='0.75')
sns.regplot(x='time', y='sales', data=df_days, ci=None, scatter_kws=dict(color='0.25'), ax=ax)
ax.set_title('Time Plot of Sales');

In [16]:
# Lagged copies of daily sales: lag_k holds the sales value from k rows earlier.
# The loop variable keeps the name `i` because a later cell reads its final value.
for i in range(1, 11):
    df_days[f'lag_{i}']= df_days['sales'].shift(i)
df_days

In [17]:
x= df_days.loc[:, ['lag_5']]
x.dropna(inplace= True)
y= df_days.loc[:, 'sales']
y,x= y.align(x, join='inner')

In [18]:
regr_lag_5= LinearRegression()
regr_lag_5.fit(x,y)

y_pred_lag_5= pd.Series(regr_lag_5.predict(x), x.index)

In [19]:
# Sales against their lag-5 value, with the fitted regression line.
fig, ax= plt.subplots()
ax.plot(x, y,'.', color='.2')
ax.plot(x, y_pred_lag_5)
ax.set_aspect('equal')
ax.set_title('Plot of sales at lag 5')
# The horizontal axis carries the lagged sales values (x is the lag_5 column), not time.
ax.set_xlabel('lag_5')
ax.set_ylabel('Sales');

In [20]:
# The same lag-5 relationship drawn with seaborn's regression plot.
fig, ax = plt.subplots()
sns.regplot(data=df_days, x='lag_5', y='sales', ci=None, scatter_kws=dict(color='0.25', s=5), ax=ax)
ax.set_title('Time Plot of Sales at lag 5')
ax.set_aspect('equal')

In [75]:
df_days.iloc[:,[0, 2, i+3]]

In [82]:
df_loop.iloc[:,[0]]

In [104]:
# Slope of sales regressed on each of the ten lags.
# - Columns are selected by label, so the result is the same whether 'date' is a
#   column or the index when this cell runs.
# - Rows whose lag is undefined are dropped rather than zero-filled; zero-filling
#   adds made-up zero-sales observations to the fit.
# - The lag is the predictor and sales the target, matching the other lag regressions
#   in this notebook.
coefs= []
for i in range(1, 11):
    lag_col= f'lag_{i}'
    df_loop= df_days[['sales', lag_col]].dropna()
    model= LinearRegression()
    model.fit(X= df_loop[[lag_col]], y= df_loop['sales'])
    coefs.append(model.coef_)
coefs= np.concatenate(coefs, axis=0)
coefficients= pd.DataFrame({'values of coefs': coefs.round(2), 'lag': range(1, 11)})
coefficients= coefficients.set_index('lag')
print(coefficients)

In [124]:
df_loop

In [128]:

slope, intercept, r_value, p_value, std_err = stats.linregress(df_loop.iloc[:,2],df_loop.iloc[:,0])

In [129]:
slope

In [145]:
# One scatter plot with fitted line per lag, each on its own figure.
# Columns are selected by label and rows with an undefined lag are dropped, so the
# plots do not depend on column positions or on zero-filled placeholder rows.
for i in range(1, 11):
    lag_col= f'lag_{i}'
    df_loop= df_days[['sales', 'time', lag_col]].dropna()
    slope, intercept, r_value, p_value, std_err = stats.linregress(df_loop[lag_col], df_loop['sales'])
    fig, ax= plt.subplots()
    sns.regplot(
        x= lag_col, y='sales', data=df_loop, ci=None, ax= ax,
        scatter_kws=dict(color='0.25'),
        # linregress returns the correlation r; the legend reports R2, so square it.
        line_kws={'label':"y={0:.1f}x+{1:.0f} and R2= {2:.2f}".format(slope, intercept, r_value**2)},
    )
    ax.set_title('Lag %s Plot of Sales' %i)
    ax.legend()

In [23]:
# Current target series and the lag-5 fitted values on shared axes.
ax= y.plot()
ax= y_pred_lag_5.plot(ax=ax, color='.2')

In [24]:
df_rol= df_days.iloc[:,[1, 2]]
df_rol_30=df_rol.rolling(30).mean()
df_rol_30

In [25]:
ax= df_rol_30.iloc[:,0].plot()

In [26]:
df_rol_30= df_rol_30.dropna()
fig, ax = plt.subplots()
ax= sns.regplot(x= 'time', y='sales', data=df_rol_30, ci=None, scatter_kws=dict(color='0.25', s=2))
ax.set_title('Plot of Sales, 30 rolling days ')

In [27]:
# Same moving average with a 356-row window.
# NOTE(review): plot titles below call this "~1 year"; confirm whether 365 was intended.
# Other cells slice with 355 (= window - 1), so changing the window means updating those too.
df_rol_356=df_rol.rolling(356).mean()
df_rol_356

In [28]:
# First column of the rolling frame only.
df_rol_356.iloc[:,0]

In [29]:
# Raw daily sales with both moving averages drawn on top.
ax= df_days[['sales']].plot(style=".", color="k")
ax= df_rol_30[['sales']].iloc[355:].plot(ax= ax, style=".", color='g')
ax= df_rol_356['sales'].plot(ax= ax, linewidth= 3, title= 'Plot, window of ~1 year', color= 'r')
ax.legend(['sales without window','window one month','window one year'])

In [30]:
# Keep only rows with a full window, then fit a line through the smoothed series.
df_rol_356= df_rol_356.dropna()
fig, ax = plt.subplots()
sns.regplot(data=df_rol_356, x='time', y='sales', ci=None, scatter_kws={'color': '0.25', 's': 2}, ax=ax)
ax.set_title('Plot of Sales, 356 rolling days (~1 year)')

In [31]:
# Deterministic design matrix over the smoothed series' index:
# a constant column plus a first-order (linear) trend term.
dp= DeterministicProcess(
    index=df_rol_356.index,
    constant=True,
    order=1,
    drop=True,
)

x= dp.in_sample()

x

In [32]:
model_qua= LinearRegression(fit_intercept= False)
#x= df_rol_356.iloc[:,[1]]
y= df_rol_356.iloc[:,0]
x.dropna(inplace= True)
y.dropna(inplace= True)

model_qua.fit(x,y)

print(model_qua.coef_[0], model_qua.intercept_, model_qua.score(x,y))

In [33]:
y_pred= pd.Series(model_qua.predict(x), index=x.index)
y_pred

In [34]:
# Observed values from row 355 onward with the fitted trend drawn over them.
ax= df_days.iloc[355:,[1]].plot(style= '.', color= '.2', title= 'Linear trend')
# `ax=` (not `x=`) is the keyword that draws onto the existing axes.
y_pred.plot(ax= ax, linewidth=3, label="Trend")

In [35]:
# Design rows for the 30 steps that follow the end of the fitted index.
out_x= dp.out_of_sample(steps=30)
out_x

In [36]:
# Trend model evaluated on those future rows.
y_fore= pd.Series(model_qua.predict(out_x), index=out_x.index)

y_fore

In [37]:
# Tail of the observed series with the fitted trend and its forecast.
ax= df_days.iloc[1000:, 1].plot(style="-", color="k", linewidth=1)
ax= y_pred.iloc[650:].plot(ax=ax, linewidth=3)
ax= y_fore.plot(ax=ax, linewidth=3, title='Trend forecast', color='r')
ax.legend(['sales without lag','trend predicted by linear model','trend forecasted by linear model'])

In [38]:
import math
def sin_fun (degree):
    """Return sin of each angle in `degree` (given in degrees) as a Series indexed by `degree`.

    The values are collected in a fresh list on every call. Accumulating them in a
    module-level list made a second call fail, because the list kept growing while
    the index did not.
    """
    return pd.Series([math.sin(math.radians(angle)) for angle in degree], index= degree)

def cos_fun (degree):
    """Return the negated cosine of each angle in `degree` (degrees) as a Series indexed by `degree`.

    The values are collected in a fresh list on every call. Accumulating them in a
    module-level list made a second call fail, because the list kept growing while
    the index did not.
    """
    return pd.Series([-1*math.cos(math.radians(angle)) for angle in degree], index= degree)

# One full turn (0-359 degrees) of the sine and negated-cosine curves on shared axes.
sine_wave= sin_fun(range(0, 360))
ax= sine_wave.plot()
neg_cosine_wave= cos_fun(range(0, 360))
ax= neg_cosine_wave.plot(ax=ax, color='r')

In [39]:
def fourier_features(index, freq, order):
    """Build sine/cosine feature pairs for harmonics 1..order.

    Only the length of `index` is used: the time axis is 0, 1, ..., len(index) - 1
    (float32), one cycle spans `freq` steps, and the returned frame is indexed by
    position. Columns are named "sin_{freq}_{i}" and "cos_{freq}_{i}", in that order
    for each harmonic i.
    """
    n_rows = len(index)
    steps = np.arange(n_rows, dtype=np.float32)
    base_angle = 2 * np.pi * (1 / freq) * steps
    columns = {}
    for harmonic in range(1, order + 1):
        columns[f"sin_{freq}_{harmonic}"] = np.sin(harmonic * base_angle)
        columns[f"cos_{freq}_{harmonic}"] = np.cos(harmonic * base_angle)
    return pd.DataFrame(columns, range(0, n_rows))

In [40]:
# Four sine/cosine pairs with a 365.25-step period. `y` is passed only for its length;
# fourier_features does not read its values.
df_fourier_4= fourier_features(y, freq= 365.25, order=4)

In [41]:
df_fourier_4

In [42]:
# All eight feature columns on one set of axes.
df_fourier_4.plot()

In [43]:
df_days= df_days.set_index('date')
df= df_days.iloc[:, 0:2]

In [44]:
y=df.sales.reset_index(drop=True)

In [45]:
X= pd.Series(df.iloc[:,1], index= df.index)

In [46]:
X= X.reset_index()

In [47]:
def seasonal_plot(X, y, period, freq, ax=None):
    """Draw `y` against the `freq` column of `X`, one coloured line per `period` value.

    A new figure is created when `ax` is not supplied. Lines are paired, in order,
    with the unique values of `X[period]`, and each is annotated at the right-hand
    edge of the axes with its value. Returns the axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()
    period_values = X[period]
    colors = sns.color_palette("husl", n_colors=period_values.nunique(),)
    ax = sns.lineplot(
        data=X,
        x=freq,
        y=y,
        hue=period,
        palette=colors,
        ci=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")
    # Shared annotation settings: x is in axes coordinates (1 = right edge),
    # y is in data coordinates, and the text is nudged 6 points to the right.
    label_style = dict(
        xytext=(6, 0),
        xycoords=ax.get_yaxis_transform(),
        textcoords="offset points",
        size=14,
        va="center",
    )
    for curve, label in zip(ax.lines, period_values.unique()):
        last_value = curve.get_ydata()[-1]
        ax.annotate(label, xy=(1, last_value+0.1), color=curve.get_color(), **label_style)
    return ax

In [48]:
X['day']= X['date'].dt.dayofweek

In [49]:
X['week']= X['date'].dt.week

In [50]:
X['dayofyear']= X['date'].dt.dayofyear

In [51]:
X['year']= X['date'].dt.year

In [52]:
X= X.iloc[:, [0,2,3,4,5]]

In [53]:
# Sales by day of year, one line per year.
seasonal_plot(X,y, period='year', freq= 'dayofyear')

In [54]:
# Same comparison by week number.
seasonal_plot(X, y, period= 'year', freq= 'week')

In [55]:
# Same comparison by day of week (the `day` column comes from dt.dayofweek).
seasonal_plot(X, y, period= 'year', freq= 'day')

In [56]:

def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of `ts` with frequencies expressed in cycles per year.

    Parameters
    ----------
    ts : array-like
        Observations, treated as one sample per day.
    detrend : str, default 'linear'
        Passed through to `scipy.signal.periodogram`.
    ax : axes object, optional
        Axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram
    # Samples per year for daily data (mean Gregorian year, in days). Written as a
    # plain number because current pandas rejects the ambiguous "1Y" Timedelta unit;
    # 365.2425 is the value that `Timedelta("1Y") / Timedelta("1D")` used to give.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are cycles per year; labels name the matching period.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [57]:
# Periodogram of the daily sales totals.
plot_periodogram(df.sales)

In [58]:
# Fourier terms of order 10 on an annual ('A') calendar frequency.
fourier= CalendarFourier(freq= 'A', order= 10)

In [59]:
# Index the calendar features by date, expressed as daily periods.
X= X.set_index('date').to_period('D')

In [60]:
# Design matrix for the seasonal model, built on the daily period index of X.
dp = DeterministicProcess(
    index=X.index,
    constant=True,               # dummy feature for bias (y-intercept)
    order=1,                     # trend (order 1 means linear)
    seasonal=True,               # weekly seasonality (indicators)
    additional_terms= [fourier], # the annual CalendarFourier terms (order 10) defined above
    drop= True                   # presumably drops collinear columns — confirm in the statsmodels docs
)

In [61]:
X_in= dp.in_sample()

In [62]:
X_in

In [63]:
model= LinearRegression()
_= model.fit(X_in,y)

y_pred= pd.Series(model.predict(X_in), y.index)
X_fore= dp.out_of_sample(90)
y_fore= pd.Series(model.predict(X_fore), X_fore.index)



In [64]:
X

In [65]:
data_y_df= {'sales':y, 'date': df.index}
y_df= pd.DataFrame(data= data_y_df)
y_df= y_df.set_index('date')
data_y_df_pred= {'sales':y_pred, 'date': df.index}
y_df_pred= pd.DataFrame(data_y_df_pred)
y_df_pred= y_df_pred.set_index('date')
data_y_df_fore= {'sales':y_fore, 'date': X_fore.index}
y_df_fore= pd.DataFrame(data_y_df_fore)
y_df_fore= y_df_fore.set_index('date')

In [66]:
# Observed sales, the in-sample seasonal fit and the forecast on shared axes.
ax= y_df.plot(style='.', color='.7', alpha=.7, title='Seasonal analysis of sales')
ax= y_df_pred.plot(ax=ax, color='b', linewidth=1)
ax= y_df_fore.plot(ax=ax, color='g')
ax.legend(['actual sales', 'seasonal prediction with indicators and Fourier', 'seasonal forecast with indicators and Fourier'])