In [3]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# for regressions with statsmodels:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.graphics.regressionplots import plot_leverage_resid2

# for regressions with scikit-learn:
import sklearn.linear_model as sklm
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report, precision_score, \
                            accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score

#for plotting confusion matrix:
import scikitplot as skplt

#for ordinal logistic regression
from mord import LogisticIT

#for KNN
# for KNN:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

In [4]:
#These are utility tools of the DMBA book. 
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
from dmba import classificationSummary, gainsChart, liftChart

In [46]:
#for plotting decision trees
import pydotplus as pplus
import graphviz
from IPython.display import Image
from six import StringIO
# if not found: ! pip install six

#for modeling with decision trees
from sklearn.tree import export_graphviz

#Scikit stuff
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, \
                            BaggingClassifier, BaggingRegressor, \
                            GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, \
                            classification_report, precision_score, \
                            accuracy_score, roc_curve, roc_auc_score
import scikitplot as skplt


In [5]:
df = pd.read_excel("processed_CIR.xlsx")

df.reindex()

Unnamed: 0.1,Unnamed: 0,Segment,Category,Region,A,B,C,D,E,F,G,H,I,J,K,L,AVG,Year
0,2,INTERNET,Retail Price ($ per pair),NorthA,67.00,67.000,67.00,67.00,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,Y10
1,3,INTERNET,Search Engine Advert. ($000s),NorthA,5000.00,5000.000,5000.00,5000.00,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,Y10
2,4,INTERNET,Free Shipping,NorthA,0.00,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Y10
3,6,INTERNET,S/Q Rating (1 to 10 stars),NorthA,4.00,4.000,4.00,4.00,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,Y10
4,7,INTERNET,Model Availability,NorthA,200.00,200.000,200.00,200.00,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,Y10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,43,PRIVATELABEL,Offer Price (max = $40.00),LA,43.69,39.000,59.98,39.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.5,Y18
1076,44,PRIVATELABEL,S/Q Rating (min = 3.0 stars),LA,5.50,5.000,5.00,5.10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,Y18
1077,46,PRIVATELABEL,Pairs Offered (000s),LA,383.00,995.000,296.00,1003.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2008.0,Y18
1078,47,PRIVATELABEL,Pairs Sold (000s),LA,0.00,995.000,0.00,1003.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2677.0,Y18


In [6]:
A_team_df = df[["Segment", "Category", "Region", "A", "Year"]]
A_team_df
table = pd.pivot_table(A_team_df, values='A', columns="Category", index = ["Year", "Region", "Segment"])
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Category,Gained / Lost (due to stockouts),Brand Advertising ($000s),Brand Reputation (prior-year image),Celebrity Appeal,Delivery Time (weeks),Free Shipping,Market Share (%),Model Availability,Offer Price (max = $40.00),Online Orders (000s),...,Pairs Offered (000s),Pairs Sold (000s),Rebate Offer ($ per pair),Retail Outlets,Retail Price ($ per pair),Retailer Support ($ per outlet),S/Q Rating (1 to 10 stars),S/Q Rating (min = 3.0 stars),Search Engine Advert. ($000s),Wholesale Price ($ per pair)
Year,Region,Segment,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Y10,AP,INTERNET,,8000.0,70.0,0.0,,0.0,0.2500,200.0,,225.0,...,,225.0,,,67.0,,4.0,,4000.0,
Y10,AP,PRIVATELABEL,,,,,,,0.2500,,32.00,,...,200.0,200.0,,,,,,3.0,,
Y10,AP,WHOLESALE,0.0,8000.0,70.0,0.0,3.0,,0.2500,200.0,,,...,,1275.0,5.0,700.0,,2750.0,4.0,,,48.0
Y10,EP,INTERNET,,9000.0,70.0,0.0,,0.0,0.2500,200.0,,300.0,...,,300.0,,,72.0,,4.0,,4500.0,
Y10,EP,PRIVATELABEL,,,,,,,0.2500,,37.00,,...,200.0,200.0,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y18,LA,PRIVATELABEL,,,,,,,0.0000,,43.69,,...,383.0,0.0,,,,,,5.5,,
Y18,LA,WHOLESALE,0.0,30000.0,86.0,135.0,1.0,,0.2413,50.0,,,...,,2272.0,5.0,4456.0,,8500.0,9.7,,,77.0
Y18,NorthA,INTERNET,,30000.0,86.0,145.0,,1.0,0.1773,50.0,,576.0,...,,576.0,,,122.5,,9.7,,20000.0,
Y18,NorthA,PRIVATELABEL,,,,,,,0.0000,,0.00,,...,0.0,0.0,,,,,,0.0,,


In [24]:
master_pivot = pd.DataFrame()
for team in ["A", "B", "C", "D"]: 

    A_team_df = df[["Segment", "Category", "Region", team, "Year"]]
    A_team_df
    table = pd.pivot_table(A_team_df, values= team, columns="Category", index = ["Year", "Region", "Segment"])

    if master_pivot.empty:
       master_pivot = table
    else: 
        master_pivot = pd.concat([master_pivot, table])

In [25]:
master_pivot.columns = master_pivot.columns.str.replace(' ', '')

# Internet Segments     

In [98]:
idx = pd.IndexSlice
Internet_df = master_pivot.loc[idx[["Y11", "Y12", "Y13", "Y14", "Y15", "Y16", "Y17", "Y18"], :, "INTERNET"], :].dropna(axis=1)
Internet_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Category,BrandAdvertising($000s),BrandReputation(prior-yearimage),CelebrityAppeal,FreeShipping,MarketShare(%),ModelAvailability,OnlineOrders(000s),PairsSold(000s),RetailPrice($perpair),S/QRating(1to10stars),SearchEngineAdvert.($000s)
Year,Region,Segment,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Y11,NorthA,INTERNET,12000.0,70.0,0.0,0.0,0.1789,248.8,287.0,287.0,75.0,5.5,6250.0
Y11,NorthA,INTERNET,22000.0,70.0,0.0,0.0,0.351,347.3,563.0,563.0,83.0,6.7,7000.0
Y11,NorthA,INTERNET,16000.0,70.0,0.0,1.0,0.2294,297.3,368.0,368.0,99.99,7.3,7000.0
Y11,NorthA,INTERNET,16500.0,70.0,0.0,1.0,0.2406,297.3,386.0,386.0,96.0,5.9,7250.0
Y12,NorthA,INTERNET,24000.0,65.0,40.0,0.0,0.1842,343.0,339.0,339.0,84.0,6.3,7250.0
Y12,NorthA,INTERNET,28000.0,89.0,60.0,0.0,0.337,445.2,620.0,620.0,95.0,7.1,12250.0
Y12,NorthA,INTERNET,21000.0,74.0,215.0,1.0,0.2201,390.3,405.0,405.0,115.9,8.4,11250.0
Y12,NorthA,INTERNET,24000.0,72.0,100.0,1.0,0.2587,394.3,476.0,476.0,99.0,6.3,9500.0
Y13,NorthA,INTERNET,27000.0,64.0,40.0,0.0,0.1268,445.8,260.0,260.0,100.0,6.0,9000.0
Y13,NorthA,INTERNET,28000.0,96.0,60.0,0.0,0.1989,449.3,408.0,408.0,100.0,7.1,12250.0


In [99]:
print(Internet_df.dtypes)

Category
BrandAdvertising($000s)             float64
BrandReputation(prior-yearimage)    float64
CelebrityAppeal                     float64
FreeShipping                        float64
MarketShare(%)                      float64
ModelAvailability                   float64
OnlineOrders(000s)                  float64
PairsSold(000s)                     float64
RetailPrice($perpair)               float64
S/QRating(1to10stars)               float64
SearchEngineAdvert.($000s)          float64
dtype: object


In [100]:
y = Internet_df["OnlineOrders(000s)"]
X_values = Internet_df.drop(columns=["OnlineOrders(000s)", "BrandReputation(prior-yearimage)", "MarketShare(%)", "PairsSold(000s)"])
#X_values = sm.add_constant(X_values)
X_train, X_test, Y_train, Y_test = train_test_split(X_values, y, test_size=0.25, random_state=7)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(24, 7) (24,)
(8, 7) (8,)


In [101]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Category,BrandAdvertising($000s),CelebrityAppeal,FreeShipping,ModelAvailability,RetailPrice($perpair),S/QRating(1to10stars),SearchEngineAdvert.($000s)
Year,Region,Segment,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Y17,NorthA,INTERNET,30000.0,0.0,1.0,500.0,119.0,9.0,20000.0
Y15,NorthA,INTERNET,30000.0,100.0,0.0,498.3,109.75,8.1,20000.0
Y16,NorthA,INTERNET,30000.0,245.0,0.0,499.9,111.75,8.0,20000.0
Y13,NorthA,INTERNET,30000.0,100.0,1.0,442.3,92.0,7.1,14500.0
Y18,NorthA,INTERNET,30000.0,0.0,1.0,500.0,119.0,9.5,20000.0
Y14,NorthA,INTERNET,7000.0,40.0,1.0,374.9,99.5,7.3,9250.0
Y15,NorthA,INTERNET,30000.0,155.0,1.0,494.3,116.49,8.9,17500.0
Y11,NorthA,INTERNET,12000.0,0.0,0.0,248.8,75.0,5.5,6250.0
Y18,NorthA,INTERNET,30000.0,230.0,1.0,451.2,117.5,9.9,20000.0
Y12,NorthA,INTERNET,21000.0,215.0,1.0,390.3,115.9,8.4,11250.0


In [103]:
OLS_internet_all = sm.OLS(y, X_values)
OLS_internet_all = OLS_internet_all.fit()
OLS_internet_all.summary()

0,1,2,3
Dep. Variable:,OnlineOrders(000s),R-squared (uncentered):,0.979
Model:,OLS,Adj. R-squared (uncentered):,0.973
Method:,Least Squares,F-statistic:,166.2
Date:,"Mon, 30 Jan 2023",Prob (F-statistic):,2.39e-19
Time:,15:22:11,Log-Likelihood:,-190.33
No. Observations:,32,AIC:,394.7
Df Residuals:,25,BIC:,404.9
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
BrandAdvertising($000s),0.0110,0.004,2.633,0.014,0.002,0.020
CelebrityAppeal,0.6286,0.246,2.554,0.017,0.122,1.136
FreeShipping,1.3834,46.667,0.030,0.977,-94.728,97.495
ModelAvailability,0.5756,0.169,3.413,0.002,0.228,0.923
RetailPrice($perpair),-9.2602,2.675,-3.462,0.002,-14.770,-3.751
S/QRating(1to10stars),105.6692,35.808,2.951,0.007,31.921,179.418
SearchEngineAdvert.($000s),0.0113,0.007,1.673,0.107,-0.003,0.025

0,1,2,3
Omnibus:,0.173,Durbin-Watson:,1.647
Prob(Omnibus):,0.917,Jarque-Bera (JB):,0.385
Skew:,0.03,Prob(JB):,0.825
Kurtosis:,2.466,Cond. No.,78200.0


In [104]:
preds = OLS_internet_all.predict(X_train)
regressionSummary(Y_train, preds)


Regression statistics

                      Mean Error (ME) : 9.3556
       Root Mean Squared Error (RMSE) : 94.0249
            Mean Absolute Error (MAE) : 73.4502
          Mean Percentage Error (MPE) : -0.8399
Mean Absolute Percentage Error (MAPE) : 14.4275


In [105]:
preds = OLS_internet_all.predict(X_test)
regressionSummary(Y_test, preds)


Regression statistics

                      Mean Error (ME) : -22.7020
       Root Mean Squared Error (RMSE) : 88.3581
            Mean Absolute Error (MAE) : 74.4414
          Mean Percentage Error (MPE) : -6.0664
Mean Absolute Percentage Error (MAPE) : 17.4549


# Ensembles 



In [106]:
y_values = Internet_df["OnlineOrders(000s)"]
X_values = Internet_df.drop(columns=["OnlineOrders(000s)", "BrandReputation(prior-yearimage)", "MarketShare(%)", "PairsSold(000s)"])
#X_values = sm.add_constant(X_values)
X_train, X_test, Y_train, Y_test = train_test_split(X_values, y, test_size=0.25, random_state=7)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(24, 7) (24,)
(8, 7) (8,)


In [107]:
bag = RandomForestRegressor(max_features=11, random_state=23)

In [108]:
bag.fit(X_train, Y_train)

In [109]:
bag_preds = bag.predict(X_train)
bag_preds

array([702.69, 768.26, 936.79, 714.27, 724.59, 300.59, 752.97, 309.4 ,
       909.73, 430.5 , 590.09, 597.47, 694.87, 855.68, 415.42, 689.23,
       429.49, 708.16, 722.77, 356.87, 690.42, 970.32, 344.42, 783.92])

In [110]:
regressionSummary(Y_train, bag_preds)
# try kfold cross validation here. 


Regression statistics

                      Mean Error (ME) : -3.9967
       Root Mean Squared Error (RMSE) : 53.2802
            Mean Absolute Error (MAE) : 40.0467
          Mean Percentage Error (MPE) : -3.4198
Mean Absolute Percentage Error (MAPE) : 8.1514


In [111]:
NRMSE = 53.2802 / (float(max(Y_train)) - float(min(Y_train))) 
NRMSE

0.06761446700507615

In [112]:
bag_preds = bag.predict(X_test)
regressionSummary(Y_test, bag_preds)


Regression statistics

                      Mean Error (ME) : -59.4150
       Root Mean Squared Error (RMSE) : 150.1275
            Mean Absolute Error (MAE) : 115.6925
          Mean Percentage Error (MPE) : -14.8175
Mean Absolute Percentage Error (MAPE) : 25.1255


In [113]:
NRMSE = 150.1275 / (float(max(Y_train)) - float(min(Y_train))) 
NRMSE

0.19051713197969544

The ensemble did a .19 Normalized RMSE 

# Decision Tree 

In [84]:
tree_reg = DecisionTreeRegressor(max_depth = 5, random_state=23)

In [85]:
tree_reg.fit(X_train, Y_train)

In [86]:
tree_preds = tree_reg.predict(X_train)

In [87]:
regressionSummary(Y_train, tree_preds)


Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 48.9314
            Mean Absolute Error (MAE) : 34.7656
          Mean Percentage Error (MPE) : -1.1432
Mean Absolute Percentage Error (MAPE) : 7.9815


In [88]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'tensorflow'