In [1]:
import pandas as pd

In [314]:
train = pd.read_parquet('training_preprocessed.parquet')
train['dcoilwtico'] = train['dcoilwtico'].fillna(method='bfill')

In [325]:
X = train.iloc[:,[1,2,4,5,6,7,8,9,10,11,12,13]].values
Y = train.iloc[:,3].values

n_split = int(len(X)*0.8)

X_train,X_test = X[0:n_split],X[n_split:]
Y_train,Y_test = Y[0:n_split],Y[n_split:]

In [75]:
from sklearn.ensemble import RandomForestRegressor


In [76]:
rfr=RandomForestRegressor(n_estimators=20,n_jobs=4)
rfr.fit(X_train,Y_train)
y_pred = rfr.predict(X_test)
print('Train R2',rfr.score(X_train,Y_train))
print('Val R2',rfr.score(X_test,Y_test))
print('Val RMSLE',mean_squared_log_error(Y_test,y_pred,squared=False))


Train R2 0.9808237378102028
Val R2 0.8276753234012983
Val RMSLE 0.9182157310597648


In [29]:
from sklearn.metrics import mean_squared_log_error

In [51]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import numpy as np
grid={'n_estimators' : [10,20,50,100],'n_jobs':[6]}




In [52]:
rfr=RandomForestRegressor()
test_scores=[]
for g in tqdm(ParameterGrid(grid)):
    rfr.set_params(**g)
    rfr.fit(X_train,Y_train)
    y_pred = rfr.predict(X_test)
    test_scores.append(mean_squared_log_error(Y_test,y_pred,squared=False))

100%|█████████████████████████████████████████████| 4/4 [05:01<00:00, 75.47s/it]


In [53]:
test_scores

[0.9401234607299687,
 0.9190864526590267,
 0.9050756566472326,
 0.9000582710516055]

In [137]:
stores = train['store_nbr'].unique()
families = train['family'].unique()

In [221]:
len(stores)*len(families)*0.8/60/60

0.396

# Parameter gird for division model

In [239]:
results = []
for store in tqdm(stores):
    for family in families:
        store_df = train[train['store_nbr']==store].drop('store_nbr',axis=1).copy()
        familiy_df = store_df[store_df['family']==family].drop('family',axis=1).copy()
        X = familiy_df.iloc[:,[2,3,4,5,6,7,8,9,10,11]].values
        Y = familiy_df.iloc[:,1].values

        n_split = int(len(X)*0.8)

        X_train,X_test = X[0:n_split],X[n_split:]
        Y_train,Y_test = Y[0:n_split],Y[n_split:]
        rfr=RandomForestRegressor(random_state=0)
        test_scores=[]
        for g in ParameterGrid(grid):
            rfr.set_params(**g)
            rfr.fit(X_train,Y_train)
            y_pred = rfr.predict(X_test)
            test_scores.append(mean_squared_log_error(Y_test,y_pred,squared=False))
        best_idx=np.argmin(test_scores)
        
        results.append((store,family,test_scores[best_idx],ParameterGrid(grid)[best_idx]))

100%|███████████████████████████████████████████| 54/54 [22:15<00:00, 24.72s/it]


In [245]:
parameters = dict(map(lambda x: (f'{x[0]}_{x[1]}',x[3]),results))

# Full division model with fit and predict functions

In [300]:
class SuperModel():
    def __init__(self,parameters,stores,families,fitted_models =None):
        self.parameters = parameters
        self.stores = stores
        self.families = families
        self.models = {}
        self.dont_fit = False
        if fitted_models:
            self.models = fitted_models
            self.dont_fit = True
    
    def fit(self,X_train,Y_train):
        if self.dont_fit:
            return
        temp_df = pd.DataFrame(X_train)
        temp_df['label'] = Y_train
        for store in tqdm(self.stores):
            for family in self.families:
                temp_df2=temp_df[temp_df[0]==store].drop(0,axis=1).copy()
                temp_df3=temp_df2[temp_df2[1]==family].drop(1,axis=1).copy()
                x_train = temp_df3.drop('label',axis=1).values
                y_train = temp_df3['label'].values
                self.models[f'{store}_{family}'] = RandomForestRegressor(random_state=0)
                self.models[f'{store}_{family}'].set_params(**self.parameters[f'{store}_{family}'])
                self.models[f'{store}_{family}'].fit(x_train,y_train)
    def predict(self,X_test):
        predictions = np.zeros(shape=(len(X_test),))
        for i in tqdm(range(len(X_test))):
            predictions[i]=self.models[f'{X_test[i][0]}_{X_test[i][1]}'].predict(X_test[i][2:].reshape(1,-1))
        return predictions
    def predict2(self,X_test):
        return np.apply_along_axis(lambda x: self.models[f'{x[0]}_{x[1]}'].predict(x[2:].reshape(1,-1)),1,X_test)
        
        
        

# Fitting model

In [326]:
m = SuperModel(parameters,stores,families)
m.fit(X,Y)
fitted_models = m.models
#y_pred = m.predict(X_test)

100%|███████████████████████████████████████████| 54/54 [15:12<00:00, 16.89s/it]


In [327]:
train

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type,cluster,isHoliday,dayofyear,weekofyear,weekday
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2013-01-01,1,0.0,0.000,0,93.14,18.0,12.0,3.0,13,1.0,1,1,1
1,2013-01-01,1,1.0,0.000,0,93.14,18.0,12.0,3.0,13,1.0,1,1,1
2,2013-01-01,1,2.0,0.000,0,93.14,18.0,12.0,3.0,13,1.0,1,1,1
3,2013-01-01,1,3.0,0.000,0,93.14,18.0,12.0,3.0,13,1.0,1,1,1
4,2013-01-01,1,4.0,0.000,0,93.14,18.0,12.0,3.0,13,1.0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,28.0,438.133,0,47.57,18.0,12.0,1.0,6,1.0,227,33,1
3000884,2017-08-15,9,29.0,154.553,1,47.57,18.0,12.0,1.0,6,1.0,227,33,1
3000885,2017-08-15,9,30.0,2419.729,148,47.57,18.0,12.0,1.0,6,1.0,227,33,1
3000886,2017-08-15,9,31.0,121.000,8,47.57,18.0,12.0,1.0,6,1.0,227,33,1


In [321]:
test

Unnamed: 0_level_0,date,store_nbr,family,onpromotion,dcoilwtico,city,state,type,cluster,isHoliday,dayofyear,weekofyear,weekday
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3000888,2017-08-16,1,0.0,0,46.80,18.0,12.0,3.0,13,0.0,228,33,2
3000889,2017-08-16,1,1.0,0,46.80,18.0,12.0,3.0,13,0.0,228,33,2
3000890,2017-08-16,1,2.0,2,46.80,18.0,12.0,3.0,13,0.0,228,33,2
3000891,2017-08-16,1,3.0,20,46.80,18.0,12.0,3.0,13,0.0,228,33,2
3000892,2017-08-16,1,4.0,0,46.80,18.0,12.0,3.0,13,0.0,228,33,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,2017-08-31,9,28.0,1,47.26,18.0,12.0,1.0,6,0.0,243,35,3
3029396,2017-08-31,9,29.0,0,47.26,18.0,12.0,1.0,6,0.0,243,35,3
3029397,2017-08-31,9,30.0,1,47.26,18.0,12.0,1.0,6,0.0,243,35,3
3029398,2017-08-31,9,31.0,9,47.26,18.0,12.0,1.0,6,0.0,243,35,3


# Loading preprocessed test dataframe

In [332]:
test = pd.read_parquet('test_preprocessed.parquet')
test['dcoilwtico'] = test['dcoilwtico'].fillna(method='bfill')
X = test.iloc[:,[1,2,3,4,5,6,7,8,9,10,11,12]].values


In [333]:
y_pred = m.predict(X)

100%|████████████████████████████████████| 28512/28512 [02:58<00:00, 159.79it/s]


In [339]:
df = pd.DataFrame(index=test.index)

In [340]:
df['sales'] = y_pred

# Saving submission file

In [341]:
df.to_csv('results.csv')

In [216]:
store_1 = train[train['store_nbr']==5].drop('store_nbr',axis=1).copy()
familiy_30 = store_1[store_1['family']==6].drop('family',axis=1).copy()

In [217]:
X = familiy_30.iloc[:,[2,3,4,5,6,7,8,9,10,11]].values
Y = familiy_30.iloc[:,1].values

n_split = int(len(X)*0.8)

X_train,X_test = X[0:n_split],X[n_split:]
Y_train,Y_test = Y[0:n_split],Y[n_split:]

In [225]:
grid={'max_depth':[1,2,3,4,5,6,7,8],'n_jobs':[6]}

In [233]:
rfr=RandomForestRegressor(random_state=0)
test_scores=[]
for g in tqdm(ParameterGrid(grid)):
    rfr.set_params(**g)
    rfr.fit(X_train,Y_train)
    y_pred = rfr.predict(X_test)
    test_scores.append(mean_squared_log_error(Y_test,y_pred,squared=False))
best_idx=np.argmin(test_scores)
print(test_scores[best_idx],ParameterGrid(grid)[best_idx])

100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 10.31it/s]

0.5324297828468002 {'n_jobs': 6, 'max_depth': 1}





In [227]:
for i in range(len(test_scores)):
    print(test_scores[i],ParameterGrid(grid)[i])

0.5555400341792555 {'n_jobs': 6, 'max_depth': 1}
0.5484211342372203 {'n_jobs': 6, 'max_depth': 2}
1.147917498978505 {'n_jobs': 6, 'max_depth': 3}
1.396907200797871 {'n_jobs': 6, 'max_depth': 4}
1.5843235869205856 {'n_jobs': 6, 'max_depth': 5}
1.6497277548434321 {'n_jobs': 6, 'max_depth': 6}
1.6545179726212278 {'n_jobs': 6, 'max_depth': 7}
1.6549710720472761 {'n_jobs': 6, 'max_depth': 8}


In [89]:
X

array([[0, 93.14, 18.0, ..., 1, 1, 1],
       [0, 93.14, 18.0, ..., 2, 1, 2],
       [0, 92.97, 18.0, ..., 3, 1, 3],
       ...,
       [7, 47.59, 18.0, ..., 225, 32, 6],
       [7, 47.59, 18.0, ..., 226, 33, 0],
       [148, 47.57, 18.0, ..., 227, 33, 1]], dtype=object)