In [79]:
#importing all necessary libraries

!pip install vecstack

from vecstack import stacking
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")



In [80]:
#loading training and testing datasets

trainfile = r'/content/train.csv'
train = pd.read_csv(trainfile)

testfile = r'/content/test.csv'
test = pd.read_csv(testfile)

print(train.shape)
print(test.shape)

(137, 43)
(100000, 42)


In [81]:
#summary statistics for features in train data

print(train.describe())
print(train.describe(include=['object']))

               Id          P1          P2          P3          P4          P5  \
count  137.000000  137.000000  137.000000  137.000000  137.000000  137.000000   
mean    68.000000    4.014599    4.408759    4.317518    4.372263    2.007299   
std     39.692569    2.910391    1.514900    1.032337    1.016462    1.209620   
min      0.000000    1.000000    1.000000    0.000000    3.000000    1.000000   
25%     34.000000    2.000000    4.000000    4.000000    4.000000    1.000000   
50%     68.000000    3.000000    5.000000    4.000000    4.000000    2.000000   
75%    102.000000    4.000000    5.000000    5.000000    5.000000    2.000000   
max    136.000000   12.000000    7.500000    7.500000    7.500000    8.000000   

               P6          P7          P8          P9  ...         P29  \
count  137.000000  137.000000  137.000000  137.000000  ...  137.000000   
mean     3.357664    5.423358    5.153285    5.445255  ...    3.135036   
std      2.134235    2.296809    1.858567    1.8

In [82]:
#summary statistics for features in test data

print(test.describe())
print(test.describe(include=['object']))

                  Id             P1             P2             P3  \
count  100000.000000  100000.000000  100000.000000  100000.000000   
mean    49999.500000       4.088030       4.428085       4.215325   
std     28867.657797       2.812963       1.428865       0.842161   
min         0.000000       1.000000       1.000000       0.000000   
25%     24999.750000       2.000000       3.750000       4.000000   
50%     49999.500000       3.000000       5.000000       4.000000   
75%     74999.250000       4.000000       5.000000       4.000000   
max     99999.000000      15.000000       7.500000       6.000000   

                  P4             P5             P6            P7  \
count  100000.000000  100000.000000  100000.000000  100000.00000   
mean        4.396025       1.989590       2.881900       5.30051   
std         1.035827       1.065314       1.531429       2.17858   
min         2.000000       1.000000       1.000000       1.00000   
25%         4.000000       1.000000   

In [83]:
#checking for null values in train data

train.isna().sum()

Id            0
Open Date     0
City          0
City Group    0
Type          0
P1            0
P2            0
P3            0
P4            0
P5            0
P6            0
P7            0
P8            0
P9            0
P10           0
P11           0
P12           0
P13           0
P14           0
P15           0
P16           0
P17           0
P18           0
P19           0
P20           0
P21           0
P22           0
P23           0
P24           0
P25           0
P26           0
P27           0
P28           0
P29           0
P30           0
P31           0
P32           0
P33           0
P34           0
P35           0
P36           0
P37           0
revenue       0
dtype: int64

In [84]:
#checking for null values in test data

test.isna().sum()

Id            0
Open Date     0
City          0
City Group    0
Type          0
P1            0
P2            0
P3            0
P4            0
P5            0
P6            0
P7            0
P8            0
P9            0
P10           0
P11           0
P12           0
P13           0
P14           0
P15           0
P16           0
P17           0
P18           0
P19           0
P20           0
P21           0
P22           0
P23           0
P24           0
P25           0
P26           0
P27           0
P28           0
P29           0
P30           0
P31           0
P32           0
P33           0
P34           0
P35           0
P36           0
P37           0
dtype: int64

In [85]:
#checking the data types of columns in the train set

print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          137 non-null    int64  
 1   Open Date   137 non-null    object 
 2   City        137 non-null    object 
 3   City Group  137 non-null    object 
 4   Type        137 non-null    object 
 5   P1          137 non-null    int64  
 6   P2          137 non-null    float64
 7   P3          137 non-null    float64
 8   P4          137 non-null    float64
 9   P5          137 non-null    int64  
 10  P6          137 non-null    int64  
 11  P7          137 non-null    int64  
 12  P8          137 non-null    int64  
 13  P9          137 non-null    int64  
 14  P10         137 non-null    int64  
 15  P11         137 non-null    int64  
 16  P12         137 non-null    int64  
 17  P13         137 non-null    float64
 18  P14         137 non-null    int64  
 19  P15         137 non-null    i

In [86]:
#check the data types of columns in the test set

print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          100000 non-null  int64  
 1   Open Date   100000 non-null  object 
 2   City        100000 non-null  object 
 3   City Group  100000 non-null  object 
 4   Type        100000 non-null  object 
 5   P1          100000 non-null  int64  
 6   P2          100000 non-null  float64
 7   P3          100000 non-null  float64
 8   P4          100000 non-null  float64
 9   P5          100000 non-null  int64  
 10  P6          100000 non-null  int64  
 11  P7          100000 non-null  int64  
 12  P8          100000 non-null  int64  
 13  P9          100000 non-null  int64  
 14  P10         100000 non-null  int64  
 15  P11         100000 non-null  int64  
 16  P12         100000 non-null  int64  
 17  P13         100000 non-null  float64
 18  P14         100000 non-null  int64  
 19  P15

In [87]:
#eliminating Id column from train data

train=train.drop('Id',axis=1)

In [88]:
#eliminating Id column from test data

test=test.drop('Id',axis=1)

In [89]:
#separating features from target variable

X_train=train.iloc[:, :-1]
Y_train=train.iloc[:, -1]
X_test=test

In [90]:
#checking null values

data = pd.concat([train, test], keys=[0,1])
data.isnull().sum().sort_values(ascending=False)

revenue       100000
P28                0
P20                0
P21                0
P22                0
P23                0
P24                0
P25                0
P26                0
P27                0
P29                0
City               0
P30                0
P31                0
P32                0
P33                0
P34                0
P35                0
P36                0
P37                0
P19                0
Open Date          0
P17                0
P16                0
City Group         0
Type               0
P1                 0
P2                 0
P3                 0
P4                 0
P5                 0
P6                 0
P7                 0
P8                 0
P9                 0
P10                0
P11                0
P12                0
P13                0
P14                0
P15                0
P18                0
dtype: int64

In [91]:
categorical_cols = train.select_dtypes(exclude=['float','int']).columns

In [92]:
#one-hot encoding

features = ['Open Date', 'City', 'City Group', 'Type']
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
Xcat = pd.DataFrame(ohe.fit_transform(X_train[features]),columns=ohe.get_feature_names_out(),index=X_train.index)
X_train = pd.concat([X_train,Xcat],axis=1)
X_train.drop(labels=features,axis=1,inplace=True)

In [93]:
cat = pd.DataFrame(ohe.transform(X_test[features]),columns=ohe.get_feature_names_out(),index=X_test.index)
X_test = pd.concat([X_test,cat],axis=1)
X_test.drop(labels=features,axis=1,inplace=True)

In [94]:
#random forest regressor

rf = RandomForestRegressor()
rf.fit(X_train, Y_train)
Y_pred_rf=rf.predict(X_train)

mean_squared_error(Y_train,Y_pred_rf)
print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(Y_train,Y_pred_rf)))
Y_pred_testrf=rf.predict(X_test)

RMSE (training) for Random Forest:996387780103.042725


In [95]:
y_predrf=pd.DataFrame(Y_pred_testrf)
y_pred_rf=pd.concat([test.iloc[:,0],y_predrf],axis=1)
y_pred_rf.columns=['Id', 'Prediction']
y_pred_rf.to_csv(r'/content/drive/MyDrive/Colab Notebooks/rf.csv')

In [96]:
#decision tree regressor

dt = DecisionTreeRegressor()
dt.fit(X_train, Y_train)
Y_pred_dt=dt.predict(X_train)

mean_squared_error(Y_train,Y_pred_dt)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(Y_train,Y_pred_dt)))
Y_pred_testdt=dt.predict(X_test)

RMSE (training) for Decision Tree:  0.000000


In [97]:
y_preddt=pd.DataFrame(Y_pred_testdt)
y_pred_dt=pd.concat([test.iloc[:,0],y_preddt],axis=1)
y_pred_dt.columns=['Id', 'Prediction']
y_pred_dt.to_csv(r'/content/drive/MyDrive/Colab Notebooks/dt.csv')

In [98]:
#multilayer perceptron regressor

mlp = MLPRegressor()
mlp.fit(X_train, Y_train)
Y_pred_mlp=mlp.predict(X_train)

mean_squared_error(Y_train,Y_pred_mlp)
print("RMSE (training) for MLP:{0:10f}".format(mean_squared_error(Y_train,Y_pred_mlp)))
Y_pred_testmlp=mlp.predict(X_test)

RMSE (training) for MLP:26416454970812.378906


In [99]:
y_predmlp=pd.DataFrame(Y_pred_testmlp)
y_pred_mlp=pd.concat([test.iloc[:,0],y_predmlp],axis=1)
y_pred_mlp.columns=['Id', 'Prediction']
y_pred_mlp.to_csv(r'/content/drive/MyDrive/Colab Notebooks/mlp.csv')

In [100]:
#finding best hyperparameters for decision tree

from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint

param_dist = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'random_state': [42]
}

random_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, random_state=42)
random_search.fit(X_train, Y_train)

print("Best Hyperparameters:", random_search.best_params_)
best_params=random_search.best_params_

Best Hyperparameters: {'max_depth': 7, 'max_features': None, 'min_samples_leaf': 15, 'min_samples_split': 12, 'random_state': 42}


In [101]:
#decision tree with hyperparameter tuning

dt2 = DecisionTreeRegressor(**random_search.best_params_)
dt2.fit(X_train, Y_train)
Y_pred_dt2=dt2.predict(X_train)

mean_squared_error(Y_train,Y_pred_dt2)
print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(Y_train,Y_pred_dt2)))
Y_pred_testdt2=dt2.predict(X_test)

y_preddt2=pd.DataFrame(Y_pred_testdt2)
y_pred_dtht=pd.concat([test.iloc[:,0],y_preddt2],axis=1)
y_pred_dtht.columns=['Id', 'Prediction']
y_pred_dtht.to_csv(r'/content/drive/MyDrive/Colab Notebooks/dt_ht.csv')

RMSE (training) for Random Forest:4753276684743.322266


In [102]:
#stacked model

models = [ RandomForestRegressor(), DecisionTreeRegressor(), MLPRegressor() ]

S_Train, S_Test = stacking(models,
                           X_train, Y_train, X_test,
                           regression=True,

                           mode='oof_pred_bag',

                           needs_proba=False,

                           n_folds=5,

                           stratified=True,

                           shuffle=True,

                           random_state=0,

                           verbose=2)

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestRegressor]
    fold  0:  [1616697.17321429]
    fold  1:  [1350409.22285714]
    fold  2:  [1124038.12333333]
    fold  3:  [2212642.52444444]
    fold  4:  [1648161.73296296]
    ----
    MEAN:     [1590389.75536243] + [365048.67920261]
    FULL:     [1588830.09759124]

model  1:     [DecisionTreeRegressor]
    fold  0:  [2064912.39285714]
    fold  1:  [2075267.39285714]
    fold  2:  [1958618.33333333]
    fold  3:  [2478470.18518519]
    fold  4:  [1676270.00000000]
    ----
    MEAN:     [2050707.66084656] + [257864.20386652]
    FULL:     [2050990.61313869]

model  2:     [MLPRegressor]
    fold  0:  [4976243.18828333]
    fold  1:  [3899032.23902949]
    fold  2:  [4380172.24515538]
    fold  3:  [5203148.93993050]
    fold  4:  [3807363.58755862]
    ----
    MEAN:     [4453192.03999147] + [559560.04519757]
    FULL:     [4452964.96953402]



In [103]:
S_TrainFinal=pd.DataFrame(S_Train)
S_TrainFinal.to_csv(r'/content/drive/MyDrive/Colab Notebooks/STrain.csv')

In [104]:
S_TestFinal=pd.DataFrame(S_Test)
S_TestFinal.to_csv(r'/content/drive/MyDrive/Colab Notebooks/STest.csv')

In [105]:
#finding best hyperparameters for the stacked model

param_dist = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'random_state': [42]
}

random_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, random_state=42)
random_search.fit(S_Train, Y_train)

print("Best Hyperparameters:", random_search.best_params_)
best_params=random_search.best_params_

Best Hyperparameters: {'max_depth': 9, 'max_features': 'log2', 'min_samples_leaf': 19, 'min_samples_split': 17, 'random_state': 42}


In [106]:
#using the best parameters in the stacked model

dt_sm = DecisionTreeRegressor(**random_search.best_params_)
dt_sm.fit(S_train, Y_train)
Y_pred_dt_sm=dt_sm.predict(S_train)

mean_squared_error(Y_train,Y_pred_dt_sm)
print("RMSE (training) for Random Forest:{0:10f}".format(mean_squared_error(Y_train,Y_pred_dt_sm)))
Y_pred_testdt_sm=dt_sm.predict(X_test)

y_preddt_sm=pd.DataFrame(Y_pred_testdt2)
y_pred_dtsm=pd.concat([test.iloc[:,0],y_preddt_sm],axis=1)
y_pred_dtsm.columns=['Id', 'Prediction']
y_pred_dtsm.to_csv(r'/content/drive/MyDrive/Colab Notebooks/dt_sm_ht.csv')

RMSE (training) for Random Forest:6391818006330.934570
