# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import warnings


In [2]:
#Loading our data
train_df = pd.read_csv(r'C:/Testing/DA/predictFutureSalesClean.csv')

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,item_name,item_category_id,Required,item_category_name,date_month,date_year,total_sales
0,0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,True,Кино - Blu-Ray,1,2013,999.0
1,1,2013-01-03,0,25,2574,399.0,2.0,"Москва ТРК ""Атриум""",DEL REY LANA Born To Die The Paradise Editio...,55,True,Музыка - CD локального производства,1,2013,798.0
2,2,2013-01-05,0,25,2574,399.0,1.0,"Москва ТРК ""Атриум""",DEL REY LANA Born To Die The Paradise Editio...,55,True,Музыка - CD локального производства,1,2013,399.0
3,3,2013-01-07,0,25,2574,399.0,1.0,"Москва ТРК ""Атриум""",DEL REY LANA Born To Die The Paradise Editio...,55,True,Музыка - CD локального производства,1,2013,399.0
4,4,2013-01-08,0,25,2574,399.0,2.0,"Москва ТРК ""Атриум""",DEL REY LANA Born To Die The Paradise Editio...,55,True,Музыка - CD локального производства,1,2013,798.0


In [4]:
#info of train
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1442564 entries, 0 to 1442563
Data columns (total 15 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Unnamed: 0          1442564 non-null  int64  
 1   date                1442564 non-null  object 
 2   date_block_num      1442564 non-null  int64  
 3   shop_id             1442564 non-null  int64  
 4   item_id             1442564 non-null  int64  
 5   item_price          1442564 non-null  float64
 6   item_cnt_day        1442564 non-null  float64
 7   shop_name           1442564 non-null  object 
 8   item_name           1442564 non-null  object 
 9   item_category_id    1442564 non-null  int64  
 10  Required            1442564 non-null  bool   
 11  item_category_name  1442564 non-null  object 
 12  date_month          1442564 non-null  int64  
 13  date_year           1442564 non-null  int64  
 14  total_sales         1442564 non-null  float64
dtypes: bool(1), flo

# Separating Numerical From Categorical Data

In [5]:
# Separating categorical data from numerical data
train_categorical_data = train_df.select_dtypes(exclude=['int64', 'float','int32'])
train_numerical_data = train_df.select_dtypes(include=['int64', 'float','int32'])

# Using Label Encoder To Convert Categorical to Numerical Data

In [6]:
# Label Encode and Hot Encode for Categorical Columns
# the category data here will be converted to numbers format
le = LabelEncoder()
train_categorical_data = train_categorical_data.apply(LabelEncoder().fit_transform)

In [7]:
#Looking into our categrical data after conversion
train_categorical_data.head()

Unnamed: 0,date,shop_name,item_name,Required,item_category_name
0,1,59,4731,0,26
1,2,25,461,0,37
2,4,25,461,0,37
3,6,25,461,0,37
4,7,25,461,0,37


# Concatenating Both Categorical and Numerical Data

In [8]:
# Concatenating both Catagorical Data and Numerical Data
# Y will be our new dataframe why X will be our lable
X = pd.concat([train_categorical_data, train_numerical_data], axis=1)
y = train_df['item_cnt_day']

In [9]:
#Checking the X
X.head()

Unnamed: 0.1,date,shop_name,item_name,Required,item_category_name,Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,date_month,date_year,total_sales
0,1,59,4731,0,26,0,0,59,22154,999.0,1.0,37,1,2013,999.0
1,2,25,461,0,37,1,0,25,2574,399.0,2.0,55,1,2013,798.0
2,4,25,461,0,37,2,0,25,2574,399.0,1.0,55,1,2013,399.0
3,6,25,461,0,37,3,0,25,2574,399.0,1.0,55,1,2013,399.0
4,7,25,461,0,37,4,0,25,2574,399.0,2.0,55,1,2013,798.0


In [10]:
# Checking the y
y.head()

0    1.0
1    2.0
2    1.0
3    1.0
4    2.0
Name: item_cnt_day, dtype: float64

# Splitting data into train and test

In [11]:
# Splitting the data into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [12]:
X_test.head()

Unnamed: 0.1,date,shop_name,item_name,Required,item_category_name,Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,date_month,date_year,total_sales
1016109,785,38,763,0,10,1016109,25,38,3708,899.0,1.0,19,2,2015,899.0
1231081,899,15,1050,0,11,1231081,29,15,4873,1199.0,1.0,20,6,2015,1199.0
663289,585,50,3724,0,11,663289,19,50,16790,2199.0,1.0,20,8,2014,2199.0
494274,468,6,4524,0,50,494274,15,6,20949,5.0,4.0,71,4,2014,20.0
731159,621,58,772,0,14,731159,20,58,3734,2599.0,1.0,23,9,2014,2599.0


In [13]:
# Checking the X_train
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1009794 entries, 937513 to 121958
Data columns (total 15 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   date                1009794 non-null  int32  
 1   shop_name           1009794 non-null  int32  
 2   item_name           1009794 non-null  int32  
 3   Required            1009794 non-null  int64  
 4   item_category_name  1009794 non-null  int32  
 5   Unnamed: 0          1009794 non-null  int64  
 6   date_block_num      1009794 non-null  int64  
 7   shop_id             1009794 non-null  int64  
 8   item_id             1009794 non-null  int64  
 9   item_price          1009794 non-null  float64
 10  item_cnt_day        1009794 non-null  float64
 11  item_category_id    1009794 non-null  int64  
 12  date_month          1009794 non-null  int64  
 13  date_year           1009794 non-null  int64  
 14  total_sales         1009794 non-null  float64
dtypes: float64(

In [14]:
# Checking the shape of bothe the train and test data
print("The size of training input is", X_train.shape)
print("The size of training output is", y_train.shape)
print("The size of testing input is", X_test.shape)
print("The size of testing output is", y_test.shape)

The size of training input is (1009794, 15)
The size of training output is (1009794,)
The size of testing input is (432770, 15)
The size of testing output is (432770,)


In [17]:
#constant
SCORING = 'neg_mean_absolute_error'
JOB = -1

def model_prediction(regression=None):
    
    REGRESSION_MODEL = ""
    params = ""
    
    if regression == "RANDOMFOREST":
        #perform GridSearchCV on Random Forest Regression
        tuned_params = {'n_estimators': [100, 200], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}  
        model = RandomizedSearchCV(RandomForestRegressor(), tuned_params, n_iter = 3, scoring = SCORING, cv = 3, n_jobs = -1)
        REGRESSION_MODEL = "Random Forest Regressor"
    else:
        if regression == "RIDGE":
            # Performing GridSearchCV on Ridge Regression
            params = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]} 
            REGRESSION_MODEL = "Ridge Regressor"
            #model = grid_search_cv(regression, params)
        elif regression == "LASSO":
            # Performing GridSearchCV on Lasso Regression
            params = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
            REGRESSION_MODEL = "Lasso Regressor"
            #model = grid_search_cv(regression, params)
        elif regression == "DECISIONTREE": 
            # Performing GridSearchCV on Decision Tree Regression
            depth = list(range(3,30))
            params = dict(max_depth = depth)
            print("Params", params)
            REGRESSION_MODEL = "Decision Tree"
        
        model = grid_search_cv(regression, params)
    
    model.fit(X_train, y_train)
    
    #predict train and test result
    y_test_pred = model.predict(X_test)
    
    #check prediction for test
    metrics = metrics_score(y_test, y_test_pred, model=None)
    
    print("Test Result for ", REGRESSION_MODEL, " Model : ")
    print("Root Mean Squared Error: ", sqrt(mean_squared_error(y_test, y_test_pred)))
    print("R-Squared: ", r2_score(y_test, y_test_pred))
    
    return metrics
    


def grid_search_cv(regression, params):
    if regression == "RIDGE":
        model = GridSearchCV(Ridge(), params, cv = 7, scoring = SCORING, n_jobs = JOB)
    elif regression == "LASSO":
        model = GridSearchCV(Lasso(), params ,cv = 15, scoring = SCORING, n_jobs = JOB)
    elif regression == "DECISIONTREE":
        model = GridSearchCV(DecisionTreeRegressor(), params, cv = 10)
    return model


def metrics_score(original, predict, model=None):
    rmse = sqrt(mean_squared_error(original, predict))
    mae = mean_absolute_error(original, predict)
    mape = mean_absolute_percentage_error(original, predict)
    r2score = r2_score(original, predict)
        
    metrics = dict (MAE="{:.4f}".format(mae), MAPE="{:.4f}".format(mape), R2_Score="{:.4f}".format(r2score), RMSE="{:.4f}".format(rmse))
    
    return metrics


def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true+1e-6)) * 100
    
    if type(mape) == pd.Series: mape = mape[0]
        
    return mape
        

In [19]:
metrics_df = pd.DataFrame()

algorithms = ["RIDGE","LASSO", "DECISIONTREE","RANDOMFOREST"]
for algorithm in algorithms:
    print(algorithm)
    metrics = model_prediction(algorithm)
    
    # Creating result df
    metrics["model"] = algorithm
    metrics_df = metrics_df.append(metrics, ignore_index=True)

RIDGE



Ill-conditioned matrix (rcond=4.94828e-22): result may not be accurate.



Test Result for  Ridge Regressor  Model : 
Root Mean Squared Error:  1.8081193218501576e-11
R-Squared:  1.0
LASSO
Test Result for  Lasso Regressor  Model : 
Root Mean Squared Error:  0.0018112817368141773
R-Squared:  0.9999994090119767
DECISIONTREE
Params {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}
Test Result for  Decision Tree  Model : 
Root Mean Squared Error:  0.20699213045178858
R-Squared:  0.9922818301349369
RANDOMFOREST
Test Result for  Random Forest Regressor  Model : 
Root Mean Squared Error:  0.27817003994580347
R-Squared:  0.986061136369507
