### Library Imports

In [55]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import mean_squared_error, r2_score

### Data Preparation / Cleaning

In [49]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
processed_pickle_path = '../data/processed/df_processed.pkl'
df_processed = pd.read_pickle(processed_pickle_path)

# Preview the first rows as the cell's displayed output.
df_processed.head()


Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2010-02-05,1,1,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106
2010-02-05,29,5,15552.08,False,B,93638,24.36,2.788,,,,,,131.527903,10.064
2010-02-05,29,6,3200.22,False,B,93638,24.36,2.788,,,,,,131.527903,10.064
2010-02-05,29,7,10820.05,False,B,93638,24.36,2.788,,,,,,131.527903,10.064
2010-02-05,29,8,20055.64,False,B,93638,24.36,2.788,,,,,,131.527903,10.064


In [50]:
# Count the missing values in every column so gaps can be handled before
# any features are derived from the frame.
print('\n Reviewing for missing values : \n')
missing_per_column = df_processed.isna().sum()
missing_per_column


 Reviewing for missing values : 



Store                0
Dept                 0
Weekly_Sales         0
IsHoliday            0
Type                 0
Size                 0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
dtype: int64

In [51]:
# Replace every missing value with 0, then confirm that no gaps remain.
# NOTE(review): the fill applies to all columns; in the missing-value review
# only MarkDown1-5 contained NaN. Presumably a missing markdown means
# "no markdown that week" -- confirm that 0 is the right stand-in.
print('\n ------------------------- \n')
print('\n Cleaned Dataset : \n')

# Rebind the name instead of using inplace=True: the result is the same, but
# the statement reads as an explicit transformation and can be chained.
df_processed = df_processed.fillna(0)

# Per-column count of remaining missing values (expected to be all zeros).
df_processed.isna().sum()


 ------------------------- 


 Cleaned Dataset : 



Store           0
Dept            0
Weekly_Sales    0
IsHoliday       0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
dtype: int64

### Feature Selection

In [None]:
# Separate the prediction target from the candidate features.
target_column = 'Weekly_Sales'
y = df_processed[target_column]
X = df_processed.drop(columns=target_column)

# Record which feature columns are object-typed (one-hot encoded in the next
# step) and which are already numeric.
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(include=np.number).columns

In [54]:
# One-Hot Encoding Categorical Variables
#
# Only the categorical columns that are still present in X are encoded.
# After the first run the original columns have been replaced by their
# indicator columns, so passing the full `categorical_columns` list again
# would make get_dummies raise a KeyError. Filtering keeps the cell safe
# to re-run.
columns_to_encode = [col for col in categorical_columns if col in X.columns]

if columns_to_encode:
    # drop_first=False keeps one indicator column per category level;
    # dtype=int stores the indicators as 0/1 integers.
    X = pd.get_dummies(X, columns=columns_to_encode, drop_first=False, dtype=int)
    print(X.head())
else:
    print('\n No Categorical features to encode')

            Store  Dept    Size  Temperature  Fuel_Price  MarkDown1  \
Date                                                                  
2010-02-05      1     1  151315        42.31       2.572        0.0   
2010-02-05     29     5   93638        24.36       2.788        0.0   
2010-02-05     29     6   93638        24.36       2.788        0.0   
2010-02-05     29     7   93638        24.36       2.788        0.0   
2010-02-05     29     8   93638        24.36       2.788        0.0   

            MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  \
Date                                                                 
2010-02-05        0.0        0.0        0.0        0.0  211.096358   
2010-02-05        0.0        0.0        0.0        0.0  131.527903   
2010-02-05        0.0        0.0        0.0        0.0  131.527903   
2010-02-05        0.0        0.0        0.0        0.0  131.527903   
2010-02-05        0.0        0.0        0.0        0.0  131.527903   

           

In [58]:
# Scoring the numerical features with SelectKBest
#
# k='all' keeps every column, so this step only ranks the features by their
# univariate F-statistic against the target (f_regression); it does not
# remove any of them.

numerical_selection = X.select_dtypes(include=np.number)

if not numerical_selection.empty:
    selector = SelectKBest(score_func=f_regression, k='all')
    selector.fit(numerical_selection, y)

    # One row per scored feature, highest score first.
    feature_scores = pd.DataFrame({
        'Feature': numerical_selection.columns,
        'Score': selector.scores_,
    }).sort_values(by='Score', ascending=False)

    print('\n feature scores from SelectKBest :')
    print(feature_scores)
else:
    print('\n No Numerical features found for SelectKBest')

# Whether or not scores were computed, the full feature matrix is carried
# forward unchanged as an independent copy.
X_final = X.copy()


 feature scores from SelectKBest :
            Feature         Score
2              Size  26647.338563
14           Type_A  15009.373846
1              Dept   9444.999393
15           Type_B   7385.920249
16           Type_C   3871.030495
0             Store   3082.190335
9         MarkDown5   1076.367782
5         MarkDown1    940.177812
7         MarkDown3    627.818196
8         MarkDown4    592.624706
11     Unemployment    282.189019
10              CPI    184.602072
6         MarkDown2    181.002868
12  IsHoliday_False     68.802991
13   IsHoliday_True     68.802991
3       Temperature      2.254309
4        Fuel_Price      0.006101
