In [2]:
import numpy as np
import pandas as pd
!pip install xgboost
!pip install lightgbm
!pip install catboost

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')



In [3]:
data = pd.read_csv('supermarket_sales - Sheet1.csv')

In [4]:
data

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.8200,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.2880,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Naypyitaw,Normal,Male,Health and beauty,40.35,1,2.0175,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,B,Mandalay,Normal,Female,Home and lifestyle,97.38,10,48.6900,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,A,Yangon,Member,Male,Food and beverages,31.84,1,1.5920,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,A,Yangon,Normal,Male,Home and lifestyle,65.82,1,3.2910,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Invoice ID               1000 non-null   object 
 1   Branch                   1000 non-null   object 
 2   City                     1000 non-null   object 
 3   Customer type            1000 non-null   object 
 4   Gender                   1000 non-null   object 
 5   Product line             1000 non-null   object 
 6   Unit price               1000 non-null   float64
 7   Quantity                 1000 non-null   int64  
 8   Tax 5%                   1000 non-null   float64
 9   Total                    1000 non-null   float64
 10  Date                     1000 non-null   object 
 11  Time                     1000 non-null   object 
 12  Payment                  1000 non-null   object 
 13  cogs                     1000 non-null   float64
 14  gross margin percentage  

## 1. INITIAL PREPROCESSING

In [13]:
def preprocess_input(df):
    df = df.copy()
    
    #Drop ID column (useless for the model)
    df = df.drop('Invoice ID', axis=1)
    
    #Split df into X and y
    y = df['Rating']
    X = df.drop('Rating', axis = 1)
    
    return X, y

In [16]:
X, y = preprocess_input(data)

# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, shuffle = True, random_state = 1)

In [20]:
X_train.shape

(700, 15)

In [23]:
X_train.head(2)

Unnamed: 0,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income
731,A,Yangon,Normal,Male,Health and beauty,56.0,3,8.4,176.4,2/28/2019,19:33,Ewallet,168.0,4.761905,8.4
716,A,Yangon,Member,Female,Fashion accessories,71.46,7,25.011,525.231,3/28/2019,16:06,Ewallet,500.22,4.761905,25.011


## 2. CONSTRUCTING PIPELINE

In [24]:
X_train.select_dtypes('object')

Unnamed: 0,Branch,City,Customer type,Gender,Product line,Date,Time,Payment
731,A,Yangon,Normal,Male,Health and beauty,2/28/2019,19:33,Ewallet
716,A,Yangon,Member,Female,Fashion accessories,3/28/2019,16:06,Ewallet
640,B,Mandalay,Member,Female,Food and beverages,2/23/2019,20:00,Ewallet
804,B,Mandalay,Member,Female,Electronic accessories,2/23/2019,11:12,Cash
737,C,Naypyitaw,Normal,Male,Electronic accessories,1/29/2019,14:26,Ewallet
...,...,...,...,...,...,...,...,...
767,B,Mandalay,Normal,Male,Sports and travel,2/13/2019,13:59,Cash
72,B,Mandalay,Member,Female,Food and beverages,3/5/2019,18:17,Ewallet
908,A,Yangon,Member,Female,Food and beverages,3/27/2019,16:30,Ewallet
235,A,Yangon,Normal,Female,Sports and travel,1/20/2019,18:09,Ewallet


In [26]:
{column: len(X_train[column].unique()) for column in X_train.select_dtypes('object').columns}

{'Branch': 3,
 'City': 3,
 'Customer type': 2,
 'Gender': 2,
 'Product line': 6,
 'Date': 89,
 'Time': 427,
 'Payment': 3}

In [27]:
# Categorize our features

binary_features = ['Customer type', 'Gender']
date_features = ['Date']
time_features = ['Time']
nominal_features = ['Branch', 'City', 'Product line', 'Payment']


In [46]:
# Creating customer transformers for date and time features

class DateEncoder:
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        for column in X.columns:
            X[column] = pd.to_datetime(X[column])
            X[column + '_year'] = X[column].apply(lambda x: x.year)
            X[column + '_month'] = X[column].apply(lambda x: x.month)
            X[column + '_day'] = X[column].apply(lambda x: x.day)
            
            X = X.drop(column, axis = 1)
        return X   


class TimeEncoder:
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        for column in X.columns:
            X[column] = pd.to_datetime(X[column])
            X[column + '_hour'] = X[column].apply(lambda x: x.hour)
            X[column + '_minute'] = X[column].apply(lambda x: x.minute)
            
            X = X.drop(column, axis = 1)
           
        return X   
    

In [47]:
#Contruct transformer pipelines for each feature type

binary_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder())
])

date_transformer = Pipeline(steps=[
    ('date', DateEncoder())
])

time_transformer = Pipeline(steps=[
    ('time', TimeEncoder())
])

nominal_transformer = Pipeline(steps=[
    ('Ohe', OneHotEncoder())
])

In [48]:
# Combine Transformers with Column Transformers

preprocessor = ColumnTransformer(transformers = [
    ('binary', binary_transformer, binary_features),
    ('date', date_transformer, date_features),
    ('time', time_transformer, time_features),
    ('nominal', nominal_transformer, nominal_features),
])

## 3. TRAINING

In [51]:
# Define Models

models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}


#Make a Scaler

scaler = StandardScaler()

for name, model in models.items():
    #Construct final PipeLine
    
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', scaler),
        ('model', model)
    ])
    
    
    #Fit the pipeline
    pipeline.fit(X_train, y_train)
    print(name + ' Trained.')

                     Linear Regression Trained.
 Linear Regression (L2 Regularization) Trained.
 Linear Regression (L1 Regularization) Trained.
                   K-Nearest Neighbors Trained.
                        Neural Network Trained.
Support Vector Machine (Linear Kernel) Trained.
   Support Vector Machine (RBF Kernel) Trained.
                         Decision Tree Trained.
                         Random Forest Trained.
                     Gradient Boosting Trained.
                               XGBoost Trained.
                              LightGBM Trained.
                              CatBoost Trained.


## 4. RESULTS

In [53]:
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', scaler),
        ('model', model)
    ])
    print(name + " R^2 Score: {:.5f}".format(pipeline.score(X_test, y_test)))

                     Linear Regression R^2 Score: -0.01521
 Linear Regression (L2 Regularization) R^2 Score: -0.01449
 Linear Regression (L1 Regularization) R^2 Score: -0.00059
                   K-Nearest Neighbors R^2 Score: -0.13735
                        Neural Network R^2 Score: -0.07561
Support Vector Machine (Linear Kernel) R^2 Score: -0.04709
   Support Vector Machine (RBF Kernel) R^2 Score: -0.08280
                         Decision Tree R^2 Score: -1.01751
                         Random Forest R^2 Score: -0.04349
                     Gradient Boosting R^2 Score: -0.13533
                               XGBoost R^2 Score: -0.24937
                              LightGBM R^2 Score: -0.17964
                              CatBoost R^2 Score: -0.15741
