In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.tree import (DecisionTreeRegressor, plot_tree)
from sklearn.metrics import (r2_score, mean_absolute_error)
#from sklearn.preprocessing import StandardScaler


In [None]:
# Load the interim knit-transactions CSV into a DataFrame.
knit_data_path = "../data/interim/transactions_sd_knits_resampled_engin_synth_gt.csv"
knit_data = pd.read_csv(knit_data_path)

In [None]:
# Add log(x + 1) versions of price and quantity as '<column>_log'.
for raw_col in ('price', 'quantity'):
    knit_data[raw_col + '_log'] = np.log(knit_data[raw_col] + 1)

In [None]:
# Raw (left column) vs. log-transformed (right column) price and quantity:
# histograms in the first two rows, price-vs-quantity scatter in the third.
fig, ax = plt.subplots(3, 2, figsize=(15, 10))

sns.histplot(knit_data['price'], bins=40, ax=ax[0, 0])
sns.histplot(knit_data['price_log'], bins=40, ax=ax[0, 1])
sns.histplot(knit_data['quantity'], bins=40, ax=ax[1, 0])
sns.histplot(knit_data['quantity_log'], bins=40, ax=ax[1, 1])
sns.scatterplot(data=knit_data, x='price', y='quantity', ax=ax[2, 0])
sns.scatterplot(data=knit_data, x='price_log', y='quantity_log', ax=ax[2, 1])

# Title every panel so the figure can be read without the code.
ax[0, 0].set_title('Price')
ax[0, 1].set_title('log(price + 1)')
ax[1, 0].set_title('Quantity')
ax[1, 1].set_title('log(quantity + 1)')
ax[2, 0].set_title('Quantity vs. price')
ax[2, 1].set_title('log(quantity + 1) vs. log(price + 1)')

fig.tight_layout()
# plt.show() works with the notebook's inline backend and matches the other
# plotting cells; fig.show() requires a GUI backend and otherwise only warns.
plt.show()

In [None]:
# Remove the raw price and quantity columns from the frame, one at a time.
for raw_col in ('price', 'quantity'):
    knit_data.drop(columns=[raw_col], inplace=True)

In [None]:
def prepare_data(df):
    """Return a copy of ``df`` with column types fixed for modelling.

    The function works on the frame that is passed in (it previously ignored
    ``df`` and modified the global ``knit_data``), and it leaves that input
    frame untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``transaction_date``, ``week_no``,
        ``review`` and ``month``.

    Returns
    -------
    pandas.DataFrame
        A new frame where ``transaction_date`` is datetime64, ``week_no`` and
        ``review`` have dtype ``object``, and ``month`` has been removed.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    prepared = df.copy()
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is now the default behaviour), so it is no longer passed.
    prepared['transaction_date'] = pd.to_datetime(prepared['transaction_date'])
    # Cast to object so these numeric codes are handled as categories by
    # pd.get_dummies rather than as numbers.
    prepared['week_no'] = prepared['week_no'].astype('object')
    prepared['review'] = prepared['review'].astype('object')
    return prepared.drop(columns=['month'])

In [None]:
# Run the dtype fixes / column drop from prepare_data and rebind knit_data to the result.
knit_data = prepare_data(knit_data)

In [None]:
def one_hot_encode_categorical(df):
    """One-hot encode ``df`` and remove one reference level per category.

    ``pd.get_dummies`` expands the categorical columns into indicator
    columns; one fixed indicator per encoded variable is then dropped so each
    variable is represented by k-1 columns.

    Returns the encoded frame. Raises ``KeyError`` if any of the reference
    indicator columns is absent after encoding.
    """
    # One baseline level for each of: p_id, week_no, label_desc, color_simple, review.
    reference_levels = [
        'p_id_p_1',
        'week_no_2',
        'label_desc_lab_1',
        'color_simple_Other',
        'review_0.0',
    ]
    dummies = pd.get_dummies(df)
    return dummies.drop(columns=reference_levels)

In [None]:
# Replace knit_data with its one-hot encoded version (reference levels dropped).
knit_data = one_hot_encode_categorical(knit_data)

In [None]:
# Independent copy of the encoded frame, used as the "full" feature set.
knit_data_full = knit_data.copy()

In [None]:
# Build the reduced feature set by removing a fixed list of columns from the
# encoded frame. The list is assembled from ranges rather than spelled out;
# it names exactly the same columns as the original literal list.
reduced_drop_cols = (
    ['star_rating', 'google_trends_knit', 'google_trends_colour']
    + ['p_id_p_' + str(product_no) for product_no in range(2, 25)]   # p_id_p_2 .. p_id_p_24
    + ['week_no_' + str(week) for week in range(4, 53)]              # week_no_4 .. week_no_52
    + ['color_simple_' + colour for colour in (
        'Black', 'Blue', 'Brown', 'Cream', 'Green',
        'Pink', 'White', 'Yellow', 'Zebra')]
    + ['review_-1.0', 'review_1.0']
)
knit_data_red = knit_data.drop(columns=reduced_drop_cols)

In [None]:
# Print the overall shape, then the shapes of the rows before and on/after
# the 2021-10-3 cutoff date.
split_cutoff = '2021-10-3'
txn_dates = knit_data['transaction_date']
print(knit_data.shape)
print(knit_data[txn_dates < split_cutoff].shape)
print(knit_data[txn_dates >= split_cutoff].shape)

In [None]:
def temporal_test_train_split_aa(df, split_date='2021-10-3',
                                 target='quantity_log',
                                 date_col='transaction_date'):
    """Split ``df`` into train/test sets by date.

    Rows with ``df[date_col] < split_date`` form the training set; rows with
    ``df[date_col] >= split_date`` form the test set. The defaults reproduce
    the previously hard-coded cutoff and column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col`` and ``target`` plus the feature columns.
    split_date : str or datetime-like, default '2021-10-3'
        First date that belongs to the test set.
    target : str, default 'quantity_log'
        Name of the column to predict.
    date_col : str, default 'transaction_date'
        Name of the date column used for splitting; removed from the features.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames without ``target`` and ``date_col``.
    y_train, y_test : pandas.Series
        Target values for the corresponding rows.
    """
    in_train = df[date_col] < split_date
    in_test = df[date_col] >= split_date

    df_train = df[in_train]
    df_test = df[in_test]

    y_train = df_train[target]
    y_test = df_test[target]

    # Neither the target nor the split date may be used as a feature.
    X_train = df_train.drop(columns=[target, date_col])
    X_test = df_test.drop(columns=[target, date_col])

    return X_train, X_test, y_train, y_test

In [None]:
# Temporal train/test split for both feature sets:
# *_f comes from knit_data_full, *_r from knit_data_red.
X_train_f, X_test_f, y_train_f, y_test_f = temporal_test_train_split_aa(knit_data_full)
X_train_r, X_test_r, y_train_r, y_test_r = temporal_test_train_split_aa(knit_data_red)

### Decision tree: all features

In [None]:
# Random search over (max_features, max_depth) for the full feature set.
# NOTE(review): candidates are scored on X_test_f / y_test_f, i.e. the test
# set doubles as the validation set, so the OOS score of the selected model
# is optimistic. Consider a separate validation split.
max_features_f_ = list(range(2, len(X_train_f.columns)))
max_depth_f_ = list(range(2, 10))
params_f = []
# Start from -inf so that a best model is always recorded; with the previous
# threshold of -1, params_f stayed empty whenever every R2 was <= -1.
maximum_score_f = -np.inf

## Random selection of parameters to test
random.seed(5)

mf_f_ = random.choices(max_features_f_, k=50)
md_f_ = random.choices(max_depth_f_, k=50)

# The target is stored as log(quantity + 1); back-transform it once because it
# does not change between iterations.
y_test_f_real = np.exp(y_test_f) - 1

## Iterations to select best model
for i, (mf_f, md_f) in enumerate(zip(mf_f_, md_f_)):
    print('model number:', i + 1)
    print(' parameters:', [mf_f, md_f])

    # model
    DT_cen = DecisionTreeRegressor(max_features=mf_f,
                                   max_depth=md_f,
                                   random_state=0
                                   ).fit(X_train_f, y_train_f)

    # predict on the held-out data and undo the log transform
    y_test_pred_f = np.exp(DT_cen.predict(X_test_f)) - 1

    score_f = r2_score(y_test_f_real, y_test_pred_f)
    print(' R2:', score_f)

    # keep the best-scoring parameter pair
    if score_f > maximum_score_f:
        params_f = [mf_f, md_f]
        maximum_score_f = score_f

In [None]:
# Display the best [max_features, max_depth] pair found for the full feature set.
params_f

In [None]:
## Best model
# Refit a tree with the selected parameters and report in-sample (IS) and
# out-of-sample (OOS) metrics on the original quantity scale.
mf_f, md_f = params_f

DT_cen_f = DecisionTreeRegressor(max_features=mf_f,
                                 max_depth=md_f,
                                 random_state=0
                                 ).fit(X_train_f, y_train_f)

# The model predicts log(quantity + 1); invert with exp(.) - 1.
y_train_pred_f = np.exp(DT_cen_f.predict(X_train_f)) - 1
y_test_pred_f = np.exp(DT_cen_f.predict(X_test_f)) - 1

# NOTE: y_train_f / y_test_f are overwritten with original-scale values here.
# The comparison plot that follows reads y_test_f in that scale, but it also
# means this cell must not be re-run without re-running the split first, and
# y_train_f is no longer a valid (log-scale) training target afterwards.
y_train_f = np.exp(y_train_f) - 1
y_test_f = np.exp(y_test_f) - 1

is_r2 = r2_score(y_train_f, y_train_pred_f)
oos_r2 = r2_score(y_test_f, y_test_pred_f)

print('\nBest Model:')
print('Parameters:', params_f)
print('IS R2:', is_r2)
print('OOS R2:', oos_r2)

# The metric is the mean absolute error, so it is labelled MAE (it was
# previously printed as "MSE").
print('IS MAE', mean_absolute_error(y_train_f, y_train_pred_f))
print('OOS MAE', mean_absolute_error(y_test_f, y_test_pred_f))

In [None]:
# Put observed and predicted test values side by side and plot one against the other.
compare_f = y_test_f.to_frame(name='real').assign(pred=y_test_pred_f)

sns.scatterplot(x='real', y='pred', data=compare_f)

In [None]:
# Visualise the selected tree. The fitted DT_cen_f is reused instead of
# refitting: by this point y_train_f has been back-transformed to the original
# quantity scale, so a refit here would train (and plot) a different model
# from the one that was evaluated. Node values are therefore log(quantity + 1).
DT_cen_vis_f = DT_cen_f


## Print the tree
plt.figure(figsize=(15,8), dpi=400)
# Pass a plain list: some scikit-learn releases reject a pandas Index here.
plot_tree(DT_cen_vis_f, feature_names=list(X_train_f.columns))
plt.show()



### Decision tree: reduced features

In [None]:
# Random search over (max_features, max_depth) for the reduced feature set.
# NOTE(review): candidates are scored on X_test_r / y_test_r, i.e. the test
# set doubles as the validation set, so the OOS score of the selected model
# is optimistic. Consider a separate validation split.
max_features_r_ = list(range(2, len(X_train_r.columns)))
max_depth_r_ = list(range(2, 10))
params_r = []
# Start from -inf so that a best model is always recorded; with the previous
# threshold of -1, params_r stayed empty whenever every R2 was <= -1.
maximum_score_r = -np.inf

## Random selection of parameters to test
random.seed(5)

mf_r_ = random.choices(max_features_r_, k=50)
md_r_ = random.choices(max_depth_r_, k=50)

# The target is stored as log(quantity + 1); back-transform it once because it
# does not change between iterations.
y_test_r_real = np.exp(y_test_r) - 1

## Iterations to select best model
for i, (mf_r, md_r) in enumerate(zip(mf_r_, md_r_)):
    print('model number:', i + 1)
    print(' parameters:', [mf_r, md_r])

    # model
    DT_cen = DecisionTreeRegressor(max_features=mf_r,
                                   max_depth=md_r,
                                   random_state=0
                                   ).fit(X_train_r, y_train_r)

    # predict on the held-out data and undo the log transform
    y_test_pred_r = np.exp(DT_cen.predict(X_test_r)) - 1

    score_r = r2_score(y_test_r_real, y_test_pred_r)
    print(' R2:', score_r)

    # keep the best-scoring parameter pair
    if score_r > maximum_score_r:
        params_r = [mf_r, md_r]
        maximum_score_r = score_r

In [None]:
# Display the best [max_features, max_depth] pair found for the reduced feature set.
params_r

In [None]:
## best model
# Refit a tree with the selected parameters and report in-sample (IS) and
# out-of-sample (OOS) metrics on the original quantity scale.
mf_r, md_r = params_r

DT_cen_r = DecisionTreeRegressor(max_features=mf_r,
                                 max_depth=md_r,
                                 random_state=0
                                 ).fit(X_train_r, y_train_r)

# The model predicts log(quantity + 1); invert with exp(.) - 1.
y_train_pred_r = np.exp(DT_cen_r.predict(X_train_r)) - 1
y_test_pred_r = np.exp(DT_cen_r.predict(X_test_r)) - 1

# NOTE: y_train_r / y_test_r are overwritten with original-scale values here.
# The comparison plot that follows reads y_test_r in that scale, but it also
# means this cell must not be re-run without re-running the split first, and
# y_train_r is no longer a valid (log-scale) training target afterwards.
y_train_r = np.exp(y_train_r) - 1
y_test_r = np.exp(y_test_r) - 1

is_r2 = r2_score(y_train_r, y_train_pred_r)
oos_r2 = r2_score(y_test_r, y_test_pred_r)

print('\nBest Model:')
print('Parameters:', params_r)
print('IS R2:', is_r2)
print('OOS R2:', oos_r2)

# The metric is the mean absolute error, so it is labelled MAE (it was
# previously printed as "MSE").
print('IS MAE', mean_absolute_error(y_train_r, y_train_pred_r))
print('OOS MAE', mean_absolute_error(y_test_r, y_test_pred_r))

In [None]:
# Raw model output for the test rows; not used by the plot in this cell.
y_pred_r = DT_cen_r.predict(X_test_r)

# Put observed and predicted test values side by side and plot one against the other.
compare_r = y_test_r.to_frame(name='real').assign(pred=y_test_pred_r)

sns.scatterplot(x='real', y='pred', data=compare_r)

In [None]:
# Visualise the selected tree. The fitted DT_cen_r is reused instead of
# refitting: by this point y_train_r has been back-transformed to the original
# quantity scale, so a refit here would train (and plot) a different model
# from the one that was evaluated. Node values are therefore log(quantity + 1).
DT_cen_vis_r = DT_cen_r


## Print the tree
plt.figure(figsize=(15,8), dpi=400)
# Pass a plain list: some scikit-learn releases reject a pandas Index here.
plot_tree(DT_cen_vis_r, feature_names=list(X_train_r.columns))
plt.show()

