# Predictive Modelling for BA Data
Prepared by CPT6  
Master of Science in Data Science 2021 
Asian Institute of Management

---

*All rights reserved. No part of this document may be reproduced or transmitted in any form or by any means, or stored in any retrieval system of any nature without prior written permission.*

© 2021

<div><span style="background-color: #E72388; padding-top: 100px; padding-right: 20px; padding-bottom: 50px; padding-left: 20px; color: #ffffff; font-size: 20px; font-weight: bold"> DATA PREPARATION </span></div>

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [6]:
df_concurrence = pd.read_csv('df_concurrence.csv') 
df = pd.read_csv('sales_data.csv')
df_concurrence['date'] = pd.to_datetime(df_concurrence['date'])
df['date'] = pd.to_datetime(df['date'])
df_concurrence = df_concurrence.drop('Unnamed: 0', axis=1)

In [7]:
df.shape

(93007, 21)

In [604]:
def agegroup(milk):
    if '0-6' in milk:
        age = '0-6'
    elif '6-12' in milk:
        age = '6-12'
    elif '0-12' in milk:
        age = '0-12'
    elif '1-3' in milk:
        age = '1-3'
    elif '3+' in milk:
        age = '3+'
    elif '6+' in milk:
        age = '6+'
    else:
        age = 'enfamama'
    return age

def flavor(item):
    if 'chocolate' in item:
        flav = 'chocolate'
    elif 'choco' in item:
        flav = 'chocolate'
    elif 'vanilla' in item:
        flav = 'vanilla'
    elif 'milk' in item:
        flav = 'milk'
    elif 'formula' in item:
        flav = 'formula'
    elif 'plain' in item:
        flav = 'plain'
    elif 'a+ four' in item:
        flav = 'plain'
    elif 'premium adult nutritional beverage' in item:
        flav = 'milk'
    elif 'premium adult nutrional beverage' in item:
        flav = 'milk'
    elif 'premium adult nutrional beverage' in item:
        flav = 'milk'
    elif 'premium adult nutrition beverage' in item:
        flav = 'milk'
    elif 'gentlease' in item:
        flav = 'milk'
    elif 'a+ three' in item:
        flav = 'milk'
    else:
        flav = 'unknown'
    return flav


def bundle(bund):
    if '[' in bund:
        bund1 =  1
    elif 'buy' in bund:
        if 'buy 1 ' in bund:
            bund1 = 0
        else:
            bund1 = 1
    elif 'x 3' in bund:
        bund1 = 1
    else:
        bund1 = 0
    return bund1

In [691]:
df_copy = df.copy()

<div><span style="background-color: #E72388; padding-top: 100px; padding-right: 20px; padding-bottom: 50px; padding-left: 20px; color: #ffffff; font-size: 20px; font-weight: bold"> FEATURE ENGINEERING </span></div>

In [692]:
df_copy['age_group']=df_copy['product_name'].apply(lambda x: agegroup(x))
df_copy['product_name'] = df_copy['product_name'].apply(lambda x: x.lower())
df_copy['flavor'] = df_copy['product_name'].apply(lambda x: flavor(x))
df_copy['bundle'] = df_copy['product_name'].apply(lambda x: bundle(x))

In [693]:
prods = df_copy.loc[df_copy['shop_name'] == 'LazMart Philippines'].product_name.values.tolist()

In [694]:
prods = [(i.split()[0]) for i in prods]

In [695]:
df_copy.loc[df_copy['shop_name'] == 'LazMart Philippines', 'shop_name'] = prods

In [696]:
df_copy['shop_name'] = df_copy.shop_name.apply(lambda x: x.lower())

In [697]:
df_copy = df_copy.drop(['cat1', 'cat2', 'cat3', 'cat4', 
              'cat5', 'cat6', 'buyer', 'order',
              'country', 'url' ], axis=1)

In [698]:
df1 = df_copy.copy()

In [699]:
#0 shoppee 1 lazada
df1['channel'] = df1['channel'].apply(lambda x: 1 if x == 'Lazada' else 0)

In [700]:
months = df1.date.dt.month
days = df1.date.dt.day
day_name = df1.date.dt.day_name()
is_weekend = day_name.apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)

In [701]:
shop_conditions = [
    df1['shop_name'].str.contains('lactum'),
    df1['shop_name'].str.contains('enfa'),
    df1['shop_name'].str.contains('sustagen')
]

shops = [0, 1, 2]

shop_bins = np.select(shop_conditions, shops, default='other')


In [702]:
df1 = df1.drop(['sku', 'product_name', 'brand'], axis=1)

In [703]:
df1.head()

Unnamed: 0,date,channel,shop_name,sap,sales,units_sold,product_views,click_rate,age_group,flavor,bundle
0,2019-01-01,1,lazada retail lactum,2048174.0,0.0,0,25,0.0,3+,milk,0
1,2019-01-01,1,lazada retail lactum,0.0,0.0,10,116,8.62,3+,milk,0
2,2019-01-01,1,lazada retail lactum,0.0,0.0,1,27,3.7,3+,milk,0
3,2019-01-01,1,lazada retail lactum,2020877.0,0.0,0,62,0.0,3+,milk,0
4,2019-01-01,1,lazada retail lactum,0.0,0.0,17,298,5.7,3+,milk,0


In [704]:

df1 = pd.get_dummies(data=df1, columns=['age_group', 'flavor'])
df1 = df1.drop('sap', axis=1)
df1['shop_name'] = shop_bins
df1 = df1.groupby(['date', 
             'shop_name',
             'channel'])[df1.columns[3:]].sum().reset_index()

In [750]:
df1

Unnamed: 0,date,shop_name,channel,sales,units_sold,product_views,click_rate,bundle,age_group_0-12,age_group_0-6,...,flavor_chocolate,flavor_formula,flavor_milk,flavor_plain,flavor_unknown,flavor_vanilla,payday,promo_day,laz_concur,sp_concur
0,2019-01-01,0,0,86.0,6,0,0.00,0,0,0,...,0,0,4,0,0,0,0,0,32,0.0
1,2019-01-01,0,1,30.0,36,985,96.30,0,0,2,...,5,2,24,0,0,0,0,0,32,0.0
2,2019-01-01,1,1,155.0,11,564,46.07,0,1,4,...,1,5,24,0,0,1,0,0,32,0.0
3,2019-01-01,2,1,0.0,0,50,0.00,0,0,0,...,3,0,3,0,0,0,0,0,32,0.0
4,2019-01-02,0,0,143.0,10,0,0.00,1,0,0,...,0,0,5,0,0,0,0,0,37,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4081,2020-12-31,0,1,0.0,0,0,201.01,2,0,7,...,5,0,8,1,46,0,1,0,87,26.0
4082,2020-12-31,1,0,5078.0,0,1149,171.81,8,2,1,...,2,3,9,2,0,1,1,0,87,26.0
4083,2020-12-31,1,1,0.0,0,0,123.73,1,3,5,...,1,0,20,11,26,1,1,0,87,26.0
4084,2020-12-31,2,0,1350.0,0,394,233.42,8,0,0,...,6,0,3,0,0,0,1,0,87,26.0


In [706]:
months_31 = ((df1['date'].dt.month == 1) | 
 (df1['date'].dt.month == 3) | 
 (df1['date'].dt.month == 5) |
 (df1['date'].dt.month == 7) | 
 (df1['date'].dt.month == 8) |
 (df1['date'].dt.month == 10)| 
 (df1['date'].dt.month == 12))

In [707]:
months_30 = ((df1['date'].dt.month == 2) | 
 (df1['date'].dt.month == 4) | 
 (df1['date'].dt.month == 6) |
 (df1['date'].dt.month == 9) | 
 (df1['date'].dt.month == 11))

In [708]:
df1['payday'] = 0
df1['promo_day'] = 0

In [709]:
df1.loc[(df1['date'].dt.month == 11) & (df1['date'].dt.day == 11), 'promo_day'] = 1
df1.loc[(df1['date'].dt.month == 12) & (df1['date'].dt.day == 12), 'promo_day'] = 1


In [710]:
df1.loc[(df1['date'].dt.day == 15), 'payday'] = 1
df1.loc[(months_31) & (df1['date'].dt.day == 31), 'payday'] = 1 
df1.loc[(months_30) & (df1['date'].dt.day == 30), 'payday'] = 1 

df1.loc[(df1['date'].dt.month == 11) & (df1['date'].dt.day == 11), 'promo_day'] = 1
df1.loc[(df1['date'].dt.month == 12) & (df1['date'].dt.day == 12), 'promo_day'] = 1


In [711]:
months = df1.date.dt.month
day_name = df1.date.dt.day_name()
is_weekend = day_name.apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)

In [712]:
#dates

df_months = pd.get_dummies(months, prefix='month')

df_datefeatures = pd.concat([df_months, is_weekend], 
                            axis=1)

In [713]:
df_datefeatures = df_datefeatures.rename(columns={'date':'is_weekend'})

In [714]:
df1 = df1.merge(df_concurrence, on='date')

In [715]:
df2 = df1.copy()

In [716]:
df1[:20]

Unnamed: 0,date,shop_name,channel,sales,units_sold,product_views,click_rate,bundle,age_group_0-12,age_group_0-6,...,flavor_chocolate,flavor_formula,flavor_milk,flavor_plain,flavor_unknown,flavor_vanilla,payday,promo_day,laz_concur,sp_concur
0,2019-01-01,0,0,86.0,6,0,0.0,0,0,0,...,0,0,4,0,0,0,0,0,32,0.0
1,2019-01-01,0,1,30.0,36,985,96.3,0,0,2,...,5,2,24,0,0,0,0,0,32,0.0
2,2019-01-01,1,1,155.0,11,564,46.07,0,1,4,...,1,5,24,0,0,1,0,0,32,0.0
3,2019-01-01,2,1,0.0,0,50,0.0,0,0,0,...,3,0,3,0,0,0,0,0,32,0.0
4,2019-01-02,0,0,143.0,10,0,0.0,1,0,0,...,0,0,5,0,0,0,0,0,37,0.0
5,2019-01-02,0,1,60.0,72,1186,79.81,0,0,2,...,5,2,24,0,0,0,0,0,37,0.0
6,2019-01-02,1,0,30.0,1,0,0.0,0,0,0,...,0,0,1,0,0,0,0,0,37,0.0
7,2019-01-02,1,1,644.0,35,962,72.01,0,1,4,...,1,5,24,0,0,1,0,0,37,0.0
8,2019-01-02,2,1,0.0,4,83,16.0,0,0,0,...,3,0,3,0,0,0,0,0,37,0.0
9,2019-01-03,0,0,85.0,6,0,0.0,0,0,0,...,1,0,3,0,0,0,0,0,39,0.0


In [717]:
df2 = pd.concat([df_datefeatures, df2], axis=1)
df2 = df2.drop(['product_views', 'units_sold', 'click_rate'], axis=1)
# df2 = pd.get_dummies(data=df2, columns=['shop_name', 'channel'])

In [720]:
df2 = df2.drop('date', axis=1)

In [724]:
df2['shop_name'] = df2.shop_name.astype(int)

In [729]:
df2['sp_concur'] = df2.sp_concur.astype(int)

In [730]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4086 entries, 0 to 4085
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   month_1             4086 non-null   uint8  
 1   month_2             4086 non-null   uint8  
 2   month_3             4086 non-null   uint8  
 3   month_4             4086 non-null   uint8  
 4   month_5             4086 non-null   uint8  
 5   month_6             4086 non-null   uint8  
 6   month_7             4086 non-null   uint8  
 7   month_8             4086 non-null   uint8  
 8   month_9             4086 non-null   uint8  
 9   month_10            4086 non-null   uint8  
 10  month_11            4086 non-null   uint8  
 11  month_12            4086 non-null   uint8  
 12  is_weekend          4086 non-null   int64  
 13  shop_name           4086 non-null   int64  
 14  channel             4086 non-null   int64  
 15  sales               4086 non-null   float64
 16  bundle

- weekend pagsamahin
- bundle pagsamahin

<div><span style="background-color: #E72388; padding-top: 100px; padding-right: 20px; padding-bottom: 50px; padding-left: 20px; color: #ffffff; font-size: 20px; font-weight: bold"> MODEL CREATION </span></div>

In [755]:
df1.head()

Unnamed: 0,date,shop_name,channel,sales,units_sold,product_views,click_rate,bundle,age_group_0-12,age_group_0-6,...,flavor_chocolate,flavor_formula,flavor_milk,flavor_plain,flavor_unknown,flavor_vanilla,payday,promo_day,laz_concur,sp_concur
0,2019-01-01,0,0,86.0,6,0,0.0,0,0,0,...,0,0,4,0,0,0,0,0,32,0.0
1,2019-01-01,0,1,30.0,36,985,96.3,0,0,2,...,5,2,24,0,0,0,0,0,32,0.0
2,2019-01-01,1,1,155.0,11,564,46.07,0,1,4,...,1,5,24,0,0,1,0,0,32,0.0
3,2019-01-01,2,1,0.0,0,50,0.0,0,0,0,...,3,0,3,0,0,0,0,0,32,0.0
4,2019-01-02,0,0,143.0,10,0,0.0,1,0,0,...,0,0,5,0,0,0,0,0,37,0.0


In [744]:
df_target = df2['sales']
df_features = df2.drop('sales', axis=1)

In [760]:
df_target = df1['sales']

In [840]:
df_features.columns

Index(['month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
       'is_weekend', 'shop_name', 'channel', 'bundle', 'age_group_0-12',
       'age_group_0-6', 'age_group_1-3', 'age_group_3+', 'age_group_6+',
       'age_group_6-12', 'age_group_enfamama', 'flavor_chocolate',
       'flavor_formula', 'flavor_milk', 'flavor_plain', 'flavor_unknown',
       'flavor_vanilla', 'payday', 'promo_day', 'laz_concur', 'sp_concur'],
      dtype='object')

In [836]:
dff = df_features[['month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
           'shop_name', 'channel', 'bundle', 'age_group_0-12',
       'age_group_0-6', 'age_group_1-3', 'age_group_3+', 'age_group_6+',
       'age_group_6-12', 'age_group_enfamama', 'flavor_chocolate',
       'flavor_formula', 'flavor_milk', 'flavor_plain', 'flavor_unknown',
       'flavor_vanilla', 'payday', 'promo_day', 'laz_concur', 'sp_concur']]

In [745]:
# scaler = StandardScaler()
# scaler.fit(df_features)
# scaler.transform(df_features)
# df_features = pd.DataFrame(scaler.fit_transform(df_features), columns = df_features.columns)

In [837]:
X_train_0_1, X_test_0_1, y_train_0_1, y_test_0_1 = train_test_split(
    dff, df_target, test_size=0.20)


LR = LinearRegression()
LR.fit(X_train_0_1, y_train_0_1)
print('running model')

Lasso1 = Lasso()
Lasso1.fit(X_train_0_1, y_train_0_1)
print('running model')

Ridge1 = Ridge()
Ridge1.fit(X_train_0_1, y_train_0_1)
print('running model')

kNN = KNeighborsRegressor() 
kNN.fit(X_train_0_1,y_train_0_1)
print('running model')

DT = DecisionTreeRegressor(random_state=1337) 
DT.fit(X_train_0_1,y_train_0_1)
print('running model')

#RF = RandomForestRegressor(n_estimators = 100) 
RF = RandomForestRegressor(random_state=1337) 
RF.fit(X_train_0_1,y_train_0_1)
print('running model')


# GBM = GradientBoostingRegressor(max_depth = 5, learning_rate=.1) 
GBM = GradientBoostingRegressor(random_state=1337) 
GBM.fit(X_train_0_1,y_train_0_1)
print('running model')


cols = ['Regression Method','Train Accuracy', 'Test Accuracy']
df_0_1 = pd.DataFrame(columns=cols)


df_0_1.loc[1] = ['Linear Regression', LR.score(X_train_0_1, y_train_0_1), 
                 LR.score(X_test_0_1, y_test_0_1)]

df_0_1.loc[2] = ['Linear Regression + Lasso', 
                 Lasso1.score(X_train_0_1, y_train_0_1), 
                 Lasso1.score(X_test_0_1, y_test_0_1)]

df_0_1.loc[3] = ['Linear Regression + Ridge', 
                 Ridge1.score(X_train_0_1, y_train_0_1), 
                 Ridge1.score(X_test_0_1, y_test_0_1)]

df_0_1.loc[4] = ['kNN', kNN.score(X_train_0_1, y_train_0_1), 
                 kNN.score(X_test_0_1, y_test_0_1)]

df_0_1.loc[5] = ['Decision Tree',DT.score(X_train_0_1, y_train_0_1),
                 DT.score(X_test_0_1, y_test_0_1)]

df_0_1.loc[6] = ['Random Forest',RF.score(X_train_0_1, y_train_0_1), 
                 RF.score(X_test_0_1, y_test_0_1)]

df_0_1.loc[7] = ['Gradient Boosting Method',
                 GBM.score(X_train_0_1, y_train_0_1), 
                 GBM.score(X_test_0_1, y_test_0_1)]

df_0_1

running model


  model = cd_fast.enet_coordinate_descent(


running model
running model
running model
running model
running model
running model


Unnamed: 0,Regression Method,Train Accuracy,Test Accuracy
1,Linear Regression,0.293443,0.351313
2,Linear Regression + Lasso,0.293438,0.350799
3,Linear Regression + Ridge,0.293007,0.34328
4,kNN,0.452499,0.40973
5,Decision Tree,0.999985,0.16695
6,Random Forest,0.908772,0.50197
7,Gradient Boosting Method,0.804372,0.380515


In [842]:
scores = []
train = []
for i in range(10):
    X_train_0_1, X_test_0_1, y_train_0_1, y_test_0_1 = train_test_split(
        dff, df_target, test_size=0.25, random_state=i)

    # GBM = GradientBoostingRegressor(max_depth = 5, learning_rate=.1) 
    GBM = GradientBoostingRegressor()
    GBM.fit(X_train_0_1,y_train_0_1)
    scores.append(GBM.score(X_test_0_1, y_test_0_1))
    train.append(GBM.score(X_train_0_1, y_train_0_1))

scores
print(sum(scores)/len(scores))
print(sum(train)/len(train))

0.23635261961959012
0.8570821018425043


In [814]:
train

[0.8455836187547815,
 0.9047625893580906,
 0.8385308053700706,
 0.8493035954322635,
 0.8496568948170626,
 0.8913109292004702,
 0.8882549137662142,
 0.8727977121649921,
 0.8097096833022419,
 0.8343014676697015]

In [786]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(RF, df_features, df_target, cv=10)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[-0.51811386  0.47779663  0.35201667  0.10785052  0.38050865  0.39292792
  0.04778128 -0.03260602  0.30062653 -0.29251289]
0.12 accuracy with a standard deviation of 0.31


In [780]:
params_gbm = {
    'max_depth':[8, 10, 12, 14, 16, 18],
    'learning_rate':[0.05, 0.1, .2, .3, .4, .5, .6]
}
gbm = DecisionTreeRegressor()
grid_gbm = GridSearchCV(gbm, param_grid=params_gbm, n_jobs=-1, verbose=20)
grid_gbm.fit(X_train_0_1, y_train_0_1)

print('Best Score:', grid_gbm.best_score_)
print('Best Parameters:', grid_gbm.best_params_)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


ValueError: Invalid parameter learning_rate for estimator DecisionTreeRegressor(). Check the list of available parameters with `estimator.get_params().keys()`.

In [22]:
# # print('Best Score:', grid_gbm.best_score_)
# print('Best Parameters:', grid_gbm.best_params_)

In [749]:
cols = ['Learning Rate','Train Accuracy', 'Test Accuracy']
df_0_1 = pd.DataFrame(columns=cols)

for i, y in zip((np.linspace(0.1, 1, 10)), range(10)):
    GBM = GradientBoostingRegressor(learning_rate=i, random_state=1337) 
    GBM.fit(X_train_0_1,y_train_0_1)

    df_0_1.loc[y] = [i,
                     GBM.score(X_train_0_1, y_train_0_1), 
                     GBM.score(X_test_0_1, y_test_0_1)]

df_0_1

Unnamed: 0,Learning Rate,Train Accuracy,Test Accuracy
0,0.1,0.852654,0.360318
1,0.2,0.923702,0.432804
2,0.3,0.948891,0.39988
3,0.4,0.96482,0.334131
4,0.5,0.955337,0.179081
5,0.6,0.973632,0.368886
6,0.7,0.974129,0.361285
7,0.8,0.976732,0.119298
8,0.9,0.981044,0.428356
9,1.0,0.980799,0.335532


In [24]:
cols = ['Regression Method','Train Accuracy', 'Test Accuracy']
df_0_1 = pd.DataFrame(columns=cols)
for i in range(1, 40):
    GBM = GradientBoostingRegressor(learning_rate=0.9,
                                    random_state=i,
#                                     loss='ls',
#                                     criterion='mse',
#                                     min_samples_split=5
                                   ) 
    
    GBM.fit(X_train_0_1,y_train_0_1)

    df_0_1.loc[i] = [i,
                     GBM.score(X_train_0_1, y_train_0_1), 
                     GBM.score(X_test_0_1, y_test_0_1)]

df_0_1['Test Accuracy'].mean()

0.6256195959395198