In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, median_absolute_error)
#from autofeat import FeatureSelector, AutoFeatRegressor

In [None]:
# Load the interim CSV into `knit_data`; the path is relative to the notebook's working directory.
knit_data = pd.read_csv("../data/interim/transactions_sd_knits_resampled_engin_synth_gt.csv")
# Alternative input file, currently disabled:
#knit_data = pd.read_csv("../data/interim/transactions_sd_knits_resampled_engin_synth_gt_gb.csv")

In [None]:
# Display the column labels of the loaded frame.
knit_data.columns

In [None]:
# Smallest and largest summed quantity over all (p_id, week_no) groups.
weekly_quantity = knit_data.groupby(['p_id', 'week_no'])['quantity'].sum()
print(weekly_quantity.min())
print(weekly_quantity.max())

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains text columns at this point (they are only parsed /
# one-hot encoded further down), and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0, so restrict the input to numeric dtypes explicitly.
#sns.pairplot(data = knit_data)
sns.heatmap(knit_data.select_dtypes(include='number').corr(), annot=True, linewidths=2)

In [None]:
def prepare_data(df):
    """Return a cleaned copy of `df` ready for encoding.

    Operates on the `df` argument (not on the notebook-level `knit_data`
    variable) and leaves the caller's frame untouched, so the function can be
    reused on any frame and re-run safely.

    Steps:
      * drop the 'month' and 'p_id' columns,
      * parse 'transaction_date' into datetimes,
      * cast 'week_no' and 'review' to object dtype so that
        `pd.get_dummies` treats them as categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'transaction_date', 'week_no', 'review', 'month', 'p_id'.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations applied.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # drop() returns a new frame, so the assignments below never touch `df`.
    prepared = df.drop(columns=['month', 'p_id'])
    # `infer_datetime_format` is deprecated; plain to_datetime infers the format.
    prepared['transaction_date'] = pd.to_datetime(prepared['transaction_date'])
    prepared['week_no'] = prepared['week_no'].astype('object')
    prepared['review'] = prepared['review'].astype('object')
    return prepared

In [None]:
# Apply the dtype conversions and column drops defined in prepare_data.
knit_data = prepare_data(knit_data)

In [None]:
def one_hot_encode_categorical(df):
    """One-hot encode `df` and remove one reference level per encoded variable.

    `pd.get_dummies` expands every categorical column into indicator columns;
    one indicator per variable is then dropped so that each variable is
    represented by k-1 columns.

    Raises KeyError if any of the reference indicator columns is absent.
    """
    # Indicator columns removed as the baseline level of each variable.
    reference_levels = ['week_no_2', 'label_desc_lab_1', 'color_simple_Other', 'review_0.0']
    return pd.get_dummies(df).drop(columns=reference_levels)
    

In [None]:
# Replace the categorical columns with their indicator (dummy) columns.
knit_data = one_hot_encode_categorical(knit_data)

In [None]:
def log_price_quantity(df):
    """Return a copy of `df` with 'price' and 'quantity' replaced by log(1 + x).

    The new columns 'price_log' and 'quantity_log' are appended at the end and
    the raw 'price' / 'quantity' columns are removed. The input frame is not
    modified, matching the non-mutating `drop_columns_*` helpers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'price' and 'quantity' columns.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If 'price' or 'quantity' is missing.
    """
    # drop() yields a new frame, so the caller's `df` keeps its raw columns.
    transformed = df.drop(columns=['price', 'quantity'])
    # log1p(x) == log(x + 1), but stays accurate for values close to zero.
    transformed['price_log'] = np.log1p(df['price'])
    transformed['quantity_log'] = np.log1p(df['quantity'])
    return transformed

In [None]:
# Swap 'price' and 'quantity' for their log(x + 1) counterparts.
knit_data = log_price_quantity(knit_data)

In [None]:
def drop_columns_f1(df):
    """Feature set 1: return `df` without the rating, review and Google Trends columns."""
    excluded = [
        'star_rating',
        'review_-1.0',
        'review_1.0',
        'google_trends_knit',
        'google_trends_colour',
    ]
    return df.drop(excluded, axis='columns')

In [None]:
def drop_columns_f2(df):
    """Feature set 2: return `df` without the rating and review indicator columns."""
    excluded = ['star_rating', 'review_-1.0', 'review_1.0']
    return df.drop(excluded, axis='columns')

In [None]:
def drop_columns_f3(df):
    """Feature set 3: return `df` without rating, Google Trends, colour and review columns."""
    colour_indicators = [
        'color_simple_Black', 'color_simple_Blue', 'color_simple_Brown',
        'color_simple_Cream', 'color_simple_Green', 'color_simple_Pink',
        'color_simple_White', 'color_simple_Yellow', 'color_simple_Zebra',
    ]
    excluded = (
        ['star_rating', 'google_trends_knit', 'google_trends_colour']
        + colour_indicators
        + ['review_-1.0', 'review_1.0']
    )
    return df.drop(excluded, axis='columns')

In [None]:
#knit_data = drop_columns_f3(knit_data)

In [None]:
def temporal_test_train_split_aa(df, split_date='2021-10-3'):
    """Split `df` chronologically into train and test sets.

    Rows with 'transaction_date' strictly before `split_date` form the
    training set; rows on or after it form the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'transaction_date' and the target column 'quantity_log'.
    split_date : str or datetime-like, default '2021-10-3'
        First date that belongs to the test set.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames without 'quantity_log' and 'transaction_date'.
    y_train, y_test : pandas.Series
        The 'quantity_log' target for each split.

    Raises
    ------
    KeyError
        If 'transaction_date' or 'quantity_log' is missing.
    """
    # Coerce to datetimes so a text column is compared by date rather than
    # lexicographically (as text, '2021-10-10' sorts before '2021-10-3').
    dates = pd.to_datetime(df['transaction_date'])

    df_train = df[dates < split_date]
    df_test = df[dates >= split_date]

    non_features = ['quantity_log', 'transaction_date']
    X_train = df_train.drop(non_features, axis=1)
    X_test = df_test.drop(non_features, axis=1)

    y_train = df_train['quantity_log']
    y_test = df_test['quantity_log']

    return X_train, X_test, y_train, y_test

In [None]:
# Chronological split into train/test features and 'quantity_log' targets.
X_train, X_test, y_train, y_test = temporal_test_train_split_aa(knit_data)


### Try multiple feature counts and tree depths

In [None]:
# Random search over (max_features, max_depth) for the random forest.
# Candidate values for each hyper-parameter.
max_features_ = list(range(2, len(X_train.columns)))
max_depth_ = list(range(2,10))
params = []
# Start from +inf so the first evaluated model always becomes the incumbent
# and `params` can never be left empty by an arbitrary threshold.
minimum_score = float('inf')
n_trials = 50

## Random selection of parameters to test
random.seed(5)

mf_ = random.choices(max_features_, k=n_trials)
md_ = random.choices(max_depth_, k=n_trials)

# The target is log(quantity + 1); invert it once so that truth and
# predictions are compared on the same (original quantity) scale.
y_test_actual = np.expm1(y_test)

# NOTE(review): candidates are scored on X_test / y_test, i.e. the same data
# the final out-of-sample metrics are reported on. Consider a separate
# validation split for model selection.

## Iterations to select best model
for i in range(n_trials):
    print('model number:',i+1)
    #selection of parameters to test
    mf = mf_[i]
    md = md_[i]
    print(' parameters:',[mf,md])
    #model
    RF_cen = RandomForestRegressor(max_features=mf,
                                    max_depth=md,
                                    n_estimators=100,
                                    random_state=0
                                    ).fit(X_train, y_train)

    # predict on the held-out rows and undo the log(x + 1) transform
    y_test_pred = np.expm1(RF_cen.predict(X_test))

    # evaluate based on MAE - change to median_absolute_error
    score = mean_absolute_error(y_test_actual, y_test_pred)

    # keep the parameter pair with the lowest MAE
    if score < minimum_score:
        params = [mf,md]
        minimum_score = score

In [None]:
# Best [max_features, max_depth] pair found by the search loop.
params

In [None]:
## Best model
# Refit a forest with the selected [max_features, max_depth] pair and report
# in-sample (IS) and out-of-sample (OOS) metrics.
mf, md = params

RF_cen = RandomForestRegressor(
    n_estimators=100,
    max_features=mf,
    max_depth=md,
    random_state=0,
)
RF_cen.fit(X_train, y_train)

# Model outputs are turned back into quantities with exp(.) - 1.
y_train_pred = np.exp(RF_cen.predict(X_train)) - 1
y_test_pred = np.exp(RF_cen.predict(X_test)) - 1

# NOTE: y_train / y_test are rebound to their back-transformed values here and
# later cells read them in that form; running this cell twice without
# re-creating the split applies the transform twice.
y_train = np.exp(y_train) - 1
y_test = np.exp(y_test) - 1

# Absolute percentage errors, used for the median APE below.
train_ape = np.abs((y_train - y_train_pred) / y_train)
test_ape = np.abs((y_test - y_test_pred) / y_test)

# In-sample metrics
is_r2 = round(r2_score(y_train, y_train_pred), 2)
is_rmse = round(mean_squared_error(y_train, y_train_pred) ** 0.5, 2)
is_mape = round(mean_absolute_percentage_error(y_train, y_train_pred), 2)
is_mae = round(mean_absolute_error(y_train, y_train_pred), 2)
is_mdape = round(np.median(train_ape) * 100, 2)
is_mdae = round(median_absolute_error(y_train, y_train_pred), 2)

# Out-of-sample metrics
oos_r2 = round(r2_score(y_test, y_test_pred), 2)
oos_rmse = round(mean_squared_error(y_test, y_test_pred) ** 0.5, 2)
oos_mape = round(mean_absolute_percentage_error(y_test, y_test_pred), 2)
oos_mae = round(mean_absolute_error(y_test, y_test_pred), 2)
oos_mdape = round(np.median(test_ape) * 100, 2)
oos_mdae = round(median_absolute_error(y_test, y_test_pred), 2)

print('\nBest Model:')
print('Parameters:', params)

# One line per metric, IS followed by OOS.
for label, value in [
    ('IS R2:', is_r2), ('OOS R2:', oos_r2),
    ('IS RMSE', is_rmse), ('OOS RMSE', oos_rmse),
    ('IS MAPE', is_mape), ('OOS MAPE', oos_mape),
    ('IS MAE', is_mae), ('OOS MAE', oos_mae),
    ('IS MdAPE', is_mdape), ('OOS MdAPE', oos_mdape),
    ('IS MdAE', is_mdae), ('OOS MdAE', oos_mdae),
]:
    print(label, value)



In [None]:
# Scatter of the test-set targets ('real') against the model predictions ('pred').
compare = pd.DataFrame({'real': y_test, 'pred': y_test_pred})

sns.scatterplot(x='real', y='pred', data=compare)

In [None]:
# List every feature importance by column position, then draw them as bars.
importance = RF_cen.feature_importances_
for position, weight in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, weight))
# plot feature importance
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
# https://stackoverflow.com/questions/30355159/python-how-to-get-real-feature-name-from-feature-importances
# Pair each training column with its importance in a single constructor call.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic; building it at once also avoids rebinding the `importance`
# array to a scalar loop variable.
feature_importance = pd.DataFrame({
    'feature': list(X_train.columns),
    'importance': RF_cen.feature_importances_,
})

In [None]:
# The dangling `feature_importance.` was a SyntaxError; display the table
# ordered from most to least important feature instead.
feature_importance.sort_values(by='importance', ascending=False)

In [None]:
# Print every feature with its importance, largest first.
feature_names = X_train.columns
coef = RF_cen.feature_importances_
# ascending argsort on the magnitudes, flipped to get descending order
indices = np.flip(np.argsort(np.abs(coef)))

for name, weight in zip(feature_names[indices], coef[indices]):
    print(name, ':', weight)