In [1]:
import pandas as pd
df_stores = pd.read_csv('data/stores.csv')
df_features = pd.read_csv('data/features.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [2]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         8190 non-null   int64  
 1   Date          8190 non-null   object 
 2   Temperature   8190 non-null   float64
 3   Fuel_Price    8190 non-null   float64
 4   MarkDown1     4032 non-null   float64
 5   MarkDown2     2921 non-null   float64
 6   MarkDown3     3613 non-null   float64
 7   MarkDown4     3464 non-null   float64
 8   MarkDown5     4050 non-null   float64
 9   CPI           7605 non-null   float64
 10  Unemployment  7605 non-null   float64
 11  IsHoliday     8190 non-null   bool   
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 712.0+ KB


In [3]:
df_features['formatted_date'] = pd.to_datetime(df_features['Date'])
df_features['week_of_year'] = df_features.formatted_date.apply(lambda x: x.weekofyear)
df_features['month_of_year'] = df_features.formatted_date.apply(lambda x: x.month)
df_features['fortnight_of_year']=df_features['week_of_year'] // 2 + df_features['week_of_year'] % 2
df_features['quarter_of_year']=df_features['week_of_year'] // 13 + 1
df_features.replace({'quarter_of_year': {5: 4}},inplace=True)

In [4]:
#Grouping Markdown by fortnight of year assuming the average of markdowns is same accross different fortnights in a year
#Group cpi & unemployment index by quarter and store. As they tend to remain same per quarter in a particular region
markdown1_mean=df_features.groupby(['Store','fortnight_of_year'])['MarkDown1'].mean().reset_index()
markdown2_mean=df_features.groupby(['Store','fortnight_of_year'])['MarkDown2'].mean().reset_index()
markdown3_mean=df_features.groupby(['Store','fortnight_of_year'])['MarkDown3'].mean().reset_index()
markdown4_mean=df_features.groupby(['Store','fortnight_of_year'])['MarkDown4'].mean().reset_index()
markdown5_mean=df_features.groupby(['Store','fortnight_of_year'])['MarkDown5'].mean().reset_index()
cpi_mean=df_features.groupby(['Store','quarter_of_year'])['CPI'].mean().reset_index()
unemployment_mean=df_features.groupby(['Store','quarter_of_year'])['Unemployment'].mean().reset_index()

In [5]:
markdown_merged=pd.concat([markdown1_mean,markdown2_mean,markdown3_mean,markdown4_mean,markdown5_mean], axis=1)
quarter_merged = pd.concat([cpi_mean,unemployment_mean], axis=1)

In [6]:
markdown_merged = markdown_merged.loc[:,~markdown_merged.columns.duplicated()]
quarter_merged = quarter_merged.loc[:,~quarter_merged.columns.duplicated()]

In [7]:
df_features_merged=pd.merge(df_features, markdown_merged,  how='left', left_on=['Store','fortnight_of_year'], right_on = ['Store','fortnight_of_year'])
df_features_merged_final=pd.merge(df_features_merged, quarter_merged,  how='left', left_on=['Store','quarter_of_year'], right_on = ['Store','quarter_of_year'])
df_features_merged_final.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1_x,MarkDown2_x,MarkDown3_x,MarkDown4_x,MarkDown5_x,CPI_x,...,month_of_year,fortnight_of_year,quarter_of_year,MarkDown1_y,MarkDown2_y,MarkDown3_y,MarkDown4_y,MarkDown5_y,CPI_y,Unemployment_y
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,...,2,3,1,22537.0475,3149.035,202.015,20490.045,4540.2075,217.829723,7.368818
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,...,2,3,1,22537.0475,3149.035,202.015,20490.045,4540.2075,217.829723,7.368818
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,...,2,4,1,28066.995,7111.9225,25.6025,9575.3,5917.3225,217.829723,7.368818
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,...,2,4,1,28066.995,7111.9225,25.6025,9575.3,5917.3225,217.829723,7.368818
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,...,3,5,1,10345.94,568.2,24.5925,6673.5,5005.5375,217.829723,7.368818


In [8]:
df_features_merged_final.MarkDown1_x.fillna(df_features_merged_final.MarkDown1_y, inplace=True)
df_features_merged_final.MarkDown2_x.fillna(df_features_merged_final.MarkDown2_y, inplace=True)
df_features_merged_final.MarkDown3_x.fillna(df_features_merged_final.MarkDown3_y, inplace=True)
df_features_merged_final.MarkDown4_x.fillna(df_features_merged_final.MarkDown4_y, inplace=True)
df_features_merged_final.MarkDown5_x.fillna(df_features_merged_final.MarkDown5_y, inplace=True)
df_features_merged_final.CPI_x.fillna(df_features_merged_final.CPI_y, inplace=True)
df_features_merged_final.Unemployment_x.fillna(df_features_merged_final.Unemployment_y, inplace=True)



In [9]:
df_features_merged_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8190 entries, 0 to 8189
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Store              8190 non-null   int64         
 1   Date               8190 non-null   object        
 2   Temperature        8190 non-null   float64       
 3   Fuel_Price         8190 non-null   float64       
 4   MarkDown1_x        8184 non-null   float64       
 5   MarkDown2_x        7066 non-null   float64       
 6   MarkDown3_x        7982 non-null   float64       
 7   MarkDown4_x        7490 non-null   float64       
 8   MarkDown5_x        8190 non-null   float64       
 9   CPI_x              8190 non-null   float64       
 10  Unemployment_x     8190 non-null   float64       
 11  IsHoliday          8190 non-null   bool          
 12  formatted_date     8190 non-null   datetime64[ns]
 13  week_of_year       8190 non-null   int64         
 14  month_of

In [10]:
df_features_merged_final[['MarkDown1_x','MarkDown2_x','MarkDown3_x','MarkDown4_x','MarkDown5_x']]=df_features_merged_final[['MarkDown1_x','MarkDown2_x','MarkDown3_x','MarkDown4_x','MarkDown5_x']].fillna(0)

In [11]:
df_features_merged_final.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1_x,MarkDown2_x,MarkDown3_x,MarkDown4_x,MarkDown5_x,CPI_x,...,month_of_year,fortnight_of_year,quarter_of_year,MarkDown1_y,MarkDown2_y,MarkDown3_y,MarkDown4_y,MarkDown5_y,CPI_y,Unemployment_y
0,1,2010-02-05,42.31,2.572,22537.0475,3149.035,202.015,20490.045,4540.2075,211.096358,...,2,3,1,22537.0475,3149.035,202.015,20490.045,4540.2075,217.829723,7.368818
1,1,2010-02-12,38.51,2.548,22537.0475,3149.035,202.015,20490.045,4540.2075,211.24217,...,2,3,1,22537.0475,3149.035,202.015,20490.045,4540.2075,217.829723,7.368818
2,1,2010-02-19,39.93,2.514,28066.995,7111.9225,25.6025,9575.3,5917.3225,211.289143,...,2,4,1,28066.995,7111.9225,25.6025,9575.3,5917.3225,217.829723,7.368818
3,1,2010-02-26,46.63,2.561,28066.995,7111.9225,25.6025,9575.3,5917.3225,211.319643,...,2,4,1,28066.995,7111.9225,25.6025,9575.3,5917.3225,217.829723,7.368818
4,1,2010-03-05,46.5,2.625,10345.94,568.2,24.5925,6673.5,5005.5375,211.350143,...,3,5,1,10345.94,568.2,24.5925,6673.5,5005.5375,217.829723,7.368818


In [12]:
df_train.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB


In [13]:
df_train_merge1=pd.merge(df_train,df_features_merged_final,  how='left', left_on=['Store','Date'], right_on = ['Store','Date'])
df_train_final = pd.merge(df_train_merge1,df_stores,  how='left', left_on=['Store'], right_on = ['Store'])

In [14]:
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Store              421570 non-null  int64         
 1   Dept               421570 non-null  int64         
 2   Date               421570 non-null  object        
 3   Weekly_Sales       421570 non-null  float64       
 4   IsHoliday_x        421570 non-null  bool          
 5   Temperature        421570 non-null  float64       
 6   Fuel_Price         421570 non-null  float64       
 7   MarkDown1_x        421570 non-null  float64       
 8   MarkDown2_x        421570 non-null  float64       
 9   MarkDown3_x        421570 non-null  float64       
 10  MarkDown4_x        421570 non-null  float64       
 11  MarkDown5_x        421570 non-null  float64       
 12  CPI_x              421570 non-null  float64       
 13  Unemployment_x     421570 non-null  float64 

In [15]:
#Removing duplicate columns and renaming columns appropriately

del df_train_final['MarkDown1_y']
del df_train_final['MarkDown2_y']
del df_train_final['MarkDown3_y']
del df_train_final['MarkDown4_y']
del df_train_final['MarkDown5_y']
del df_train_final['CPI_y']
del df_train_final['Unemployment_y']
del df_train_final['IsHoliday_y']
df_train_final.rename(columns={"MarkDown1_x": "MarkDown1", "MarkDown2_x": "MarkDown2", "MarkDown3_x": "MarkDown3",
                                   "MarkDown4_x": "MarkDown4", "MarkDown5_x": "MarkDown5","CPI_x":"CPI","Unemployment_x":"Unemployment","IsHoliday_x": "IsHoliday"},inplace=True)



In [16]:
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Store              421570 non-null  int64         
 1   Dept               421570 non-null  int64         
 2   Date               421570 non-null  object        
 3   Weekly_Sales       421570 non-null  float64       
 4   IsHoliday          421570 non-null  bool          
 5   Temperature        421570 non-null  float64       
 6   Fuel_Price         421570 non-null  float64       
 7   MarkDown1          421570 non-null  float64       
 8   MarkDown2          421570 non-null  float64       
 9   MarkDown3          421570 non-null  float64       
 10  MarkDown4          421570 non-null  float64       
 11  MarkDown5          421570 non-null  float64       
 12  CPI                421570 non-null  float64       
 13  Unemployment       421570 non-null  float64 

In [17]:
df_train_final.head(5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,MarkDown5,CPI,Unemployment,formatted_date,week_of_year,month_of_year,fortnight_of_year,quarter_of_year,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,22537.0475,3149.035,202.015,...,4540.2075,211.096358,8.106,2010-02-05,5,2,3,1,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,22537.0475,3149.035,202.015,...,4540.2075,211.24217,8.106,2010-02-12,6,2,3,1,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,28066.995,7111.9225,25.6025,...,5917.3225,211.289143,8.106,2010-02-19,7,2,4,1,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,28066.995,7111.9225,25.6025,...,5917.3225,211.319643,8.106,2010-02-26,8,2,4,1,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,10345.94,568.2,24.5925,...,5005.5375,211.350143,8.106,2010-03-05,9,3,5,1,A,151315


In [54]:
df_train_final.to_csv('test_final_train.csv')

In [25]:
def map_size(store_type):
    if store_type=='A':
        return 1
    if store_type=='B':
        return 2
    if store_type=='C':
        return 3
def map_holiday(isHoliday):
    if isHoliday==True:
        return 1
    if isHoliday==False:
        return 0
    
df_train_final['StoreType']=df_train_final['Type'].apply(map_size)
df_train_final['Holiday']=df_train_final['IsHoliday'].apply(map_holiday)

print(df_train_final['StoreType'].value_counts())
print(df_train_final['Holiday'].value_counts())

1    215478
2    163495
3     42597
Name: StoreType, dtype: int64
0    391909
1     29661
Name: Holiday, dtype: int64


In [30]:
df_train_final['year'] = df_train_final.formatted_date.apply(lambda x: x.year)

In [31]:
df_train_final['day'] = df_train_final.formatted_date.apply(lambda x: x.day)

In [32]:
df_train_final.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,week_of_year,month_of_year,fortnight_of_year,quarter_of_year,Type,Size,StoreType,year,day,Holiday
0,1,1,2010-02-05,24924.5,False,42.31,2.572,22537.0475,3149.035,202.015,...,5,2,3,1,A,151315,1,2010,5,0
1,1,1,2010-02-12,46039.49,True,38.51,2.548,22537.0475,3149.035,202.015,...,6,2,3,1,A,151315,1,2010,12,1


In [36]:
all_y=df_train_final['Weekly_Sales']
features=['Store','Dept','week_of_year','month_of_year','year','day','Holiday','Temperature','Size','StoreType']
all_x=df_train_final[features]

In [35]:
train1_x

Unnamed: 0,Store,Dept,week_of_year,month_of_year,year,day,Holiday,Temperature,Size,StoreType
0,1,1,5,2,2010,5,0,42.31,151315,1
1,1,1,6,2,2010,12,1,38.51,151315,1
2,1,1,7,2,2010,19,0,39.93,151315,1
3,1,1,8,2,2010,26,0,46.63,151315,1
4,1,1,9,3,2010,5,0,46.50,151315,1
...,...,...,...,...,...,...,...,...,...,...
421565,45,98,39,9,2012,28,0,64.88,118221,2
421566,45,98,40,10,2012,5,0,64.89,118221,2
421567,45,98,41,10,2012,12,0,54.47,118221,2
421568,45,98,42,10,2012,19,0,56.47,118221,2


In [37]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(all_x,all_y,test_size=.2,random_state=0)

In [43]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor


In [45]:
clf = RandomForestRegressor(n_estimators=100)
clf.fit(train_X[features],train_y)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [60]:
predictions=clf.predict(test_X[features])
accuracy=clf.score(test_X[features],test_y)
print(accuracy)

0.9796153537304637


In [62]:
import numpy as np
from sklearn.model_selection import cross_val_score

scores=cross_val_score(clf,all_x,all_y,cv=10)
final_accuracy=np.mean(scores)
print(final_accuracy)

0.6357204963513724
