# Several Items

## Library Importing

In [None]:
import random
import re
import seaborn as sns
import numpy as np
import datetime as dt
import time
import pandas as pd
import os  
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.metrics import confusion_matrix, mean_absolute_error, recall_score, precision_score
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, LinearRegression 
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.neighbors import *
from catboost import CatBoostClassifier,CatBoostRegressor, CatBoost
from lightgbm import LGBMRegressor, early_stopping
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import stats as st
from IPython.display import clear_output

## Extra Functions

In [None]:
def full_stats(data):
    """Show per-value deviations from the mean and return a one-row summary table.

    :param data: pandas object exposing ``mean``, ``min``, ``max`` and ``quantile``
    :return: single-row DataFrame with the five-number summary, the IQR, the
        1.5*IQR outlier fences, the standard deviation and the variance
    """
    centre = data.mean()
    lower_q, mid_q, upper_q = [data.quantile(q) for q in (0.25, 0.50, 0.75)]
    spread = upper_q - lower_q

    summary = {
        'minimum': data.min(),
        'q1': lower_q,
        'median': mid_q,
        'q3': upper_q,
        'maximum': data.max(),
        'iqr': spread,
        'big_outliers': upper_q + (1.5 * spread),
        'small_outliers': lower_q - (1.5 * spread),
        'standard_deviation': np.std(data),
        'variance': np.var(data),
    }

    # Side effect: render the signed distance of every value from the mean.
    display(data - centre)

    # Each statistic becomes a one-element column so the report has exactly one row.
    return pd.DataFrame(data={label: [value] for label, value in summary.items()})

In [None]:
def get_stats(data):
    """Return twelve descriptive statistics of ``data`` as a flat list.

    The list order is: count, minimum, q1, median, q3, maximum, iqr,
    upper outlier fence, lower outlier fence, standard deviation, variance,
    mean absolute deviation from the mean.

    :param data: pandas object exposing ``count``, ``mean``, ``min``, ``max``
        and ``quantile``
    :return: list of the twelve statistics in the order above
    """
    centre = data.mean()
    lower_q = data.quantile(0.25)
    mid_q = data.quantile(0.50)
    upper_q = data.quantile(0.75)
    spread = upper_q - lower_q

    # Absolute distance of each value from the mean; its average is the MAD.
    abs_dev = abs(data - centre)

    return [
        data.count(),
        data.min(),
        lower_q,
        mid_q,
        upper_q,
        data.max(),
        spread,
        upper_q + (1.5 * spread),
        lower_q - (1.5 * spread),
        np.std(data),
        np.var(data),
        abs_dev.mean(),
    ]

In [None]:
def get_scores(model, features, target, predictions):
    """Collect the standard regression diagnostics for one fitted model.

    :param model: fitted estimator exposing ``score(features, target)``
    :param features: feature matrix passed to ``model.score``
    :param target: true target values
    :param predictions: model predictions aligned with ``target``
    :return: tuple ``(score, mean prediction, r2, mse, mae, rmse)``
    """
    model_score = model.score(features, target)
    mean_prediction = sum(predictions) / len(predictions)
    r_squared = r2_score(target, predictions)
    squared_error = mean_squared_error(target, predictions)
    absolute_error = mean_absolute_error(target, predictions)
    root_squared_error = squared_error ** 0.5
    return (
        model_score,
        mean_prediction,
        r_squared,
        squared_error,
        absolute_error,
        root_squared_error,
    )

In [None]:
def cv_test(df, features, target):
    """Cross-validate a LinearRegression over five contiguous folds.

    The row positions ``0 .. len(df) - 1`` are cut into five contiguous
    chunks; each chunk is used once for validation while the remaining rows
    are used for training.

    :param df: object whose length gives the number of rows to split
    :param features: feature table, selected positionally with ``.iloc``
    :param target: target values, positionally aligned with ``features``
    :return: tuple ``(mean validation score, mean validation RMSE)``, both
        rounded to 3 decimals
    :raises ValueError: if ``df`` has fewer than five rows
    """
    n_folds = 5
    n_rows = len(df)
    if n_rows < n_folds:
        raise ValueError(
            'cv_test needs at least {} rows, got {}'.format(n_folds, n_rows)
        )

    # array_split always returns exactly n_folds in-bounds chunks. A fixed-step
    # range() creates an extra, partly out-of-bounds fold whenever the row
    # count is not a multiple of five, which makes .iloc raise IndexError.
    folds = np.array_split(np.arange(n_rows), n_folds)

    scores = []
    rmses = []
    for fold_no, valid_indexes in enumerate(folds):
        train_indexes = np.concatenate(folds[:fold_no] + folds[fold_no + 1:])

        features_train = features.iloc[train_indexes]
        features_valid = features.iloc[valid_indexes]
        target_train = target.iloc[train_indexes]
        target_valid = target.iloc[valid_indexes]

        model = LinearRegression().fit(features_train, target_train)
        predictions = model.predict(features_valid)

        scores.append(model.score(features_valid, target_valid))
        rmses.append(mean_squared_error(target_valid, predictions) ** 0.5)

    return round(sum(scores) / len(scores), 3), round(sum(rmses) / len(rmses), 3)

In [None]:
def upsample(features, target, repeat):
    """Balance a binary dataset by repeating the rows whose target equals 1.

    Rows with target 0 are kept once, rows with target 1 are included
    ``repeat`` times, and the result is shuffled with a fixed seed (12345).

    :param features: feature table aligned with ``target``
    :param target: binary target (values 0 and 1 are selected)
    :param repeat: how many copies of the target-1 rows to include
    :return: tuple ``(features_upsampled, target_upsampled)``
    """
    is_negative = target == 0
    is_positive = target == 1

    feature_parts = [features[is_negative]] + [features[is_positive]] * repeat
    target_parts = [target[is_negative]] + [target[is_positive]] * repeat

    shuffled_features, shuffled_target = shuffle(
        pd.concat(feature_parts),
        pd.concat(target_parts),
        random_state=12345,
    )
    return shuffled_features, shuffled_target

In [None]:
def downsample(features, target, fraction):
    """Balance a binary dataset by keeping only a fraction of the target-0 rows.

    A ``fraction`` sample of the rows with target 0 (seed 12345) is combined
    with all rows with target 1, then shuffled with the same fixed seed.

    :param features: feature table aligned with ``target``
    :param target: binary target (values 0 and 1 are selected)
    :param fraction: share of the target-0 rows to keep, passed to ``sample(frac=...)``
    :return: tuple ``(features_downsampled, target_downsampled)``
    """
    is_negative = target == 0
    is_positive = target == 1

    # The same seed is used for both samples so features and target stay paired.
    kept_features = features[is_negative].sample(frac=fraction, random_state=12345)
    kept_target = target[is_negative].sample(frac=fraction, random_state=12345)

    reduced_features = pd.concat([kept_features, features[is_positive]])
    reduced_target = pd.concat([kept_target, target[is_positive]])

    shuffled_features, shuffled_target = shuffle(
        reduced_features, reduced_target, random_state=12345
    )
    return shuffled_features, shuffled_target

In [None]:
def make_features(data, max_lag):
    """Add calendar, lag and next-step columns derived from ``num_orders``.

    The new columns are written into ``data`` itself (the caller's frame is
    modified); the returned frame is that table with every row containing a
    missing value removed.

    :param data: DataFrame with a datetime index and a ``num_orders`` column
    :param max_lag: number of lag columns ``lag_1 .. lag_<max_lag>`` to create
    :return: ``data`` with the added columns and incomplete rows dropped
    """
    # Calendar parts are read straight from the datetime index.
    for part in ('year', 'month', 'day', 'dayofweek'):
        data[part] = getattr(data.index, part)

    # Value 24 rows earlier.
    data['previous_day'] = data['num_orders'].shift(24)

    for step in range(1, max_lag + 1):
        data[f'lag_{step}'] = data['num_orders'].shift(step)

    # Value one row ahead.
    data['next_hour'] = data['num_orders'].shift(-1)

    return data.dropna()

In [None]:
def rmse(y_true, y_predict):
    """Return the root mean squared error between ``y_true`` and ``y_predict``."""
    return mean_squared_error(y_true, y_predict) ** 0.5

In [None]:
def get_knn(df, n, k, metric):
    """
    Look up the k nearest neighbours of one row of ``df``.

    The search runs over the columns listed in the module-level
    ``feature_names`` (not defined in this function; it must exist before
    the call).

    :param df: pandas DataFrame that is searched for similar rows
    :param n: positional row number of the object to find neighbours for
    :param k: number of nearest neighbours to return
    :param metric: name of the distance metric passed to NearestNeighbors
    :return: the neighbour rows of ``df`` joined with a ``distance`` column
    """
    searcher = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', metric=metric)
    searcher.fit(df[feature_names].to_numpy())

    # kneighbors expects a 2-D query, hence the single-element list.
    query = [df.iloc[n][feature_names]]
    distances, positions = searcher.kneighbors(query, k, return_distance=True)
    hit_positions = positions[0]

    neighbours = df.iloc[hit_positions]
    distance_column = pd.DataFrame(distances.T, index=hit_positions, columns=['distance'])

    return pd.concat([neighbours, distance_column], axis=1)

In [None]:
def build_knc(train, target, test, n_neighbors):
    """Fit a k-nearest-neighbours classifier and predict labels for ``test``.

    :param train: training feature matrix
    :param target: training labels aligned with ``train``
    :param test: feature matrix to predict labels for
    :param n_neighbors: number of neighbours used by the classifier
    :return: predicted labels for ``test``
    """
    # Kept from the original: resets NumPy's global random seed on every call.
    np.random.seed(42)
    knc = KNeighborsClassifier(n_neighbors=n_neighbors)
    # Fit on the `train` argument; fitting on the notebook-global
    # `features_train` ignored whatever the caller passed in.
    knc.fit(train, target)
    return knc.predict(test)

In [None]:
def eval_regressor(y_true, y_pred):
    """Print the RMSE and the R2 score of a set of regression predictions.

    Uses ``mean_squared_error`` and ``r2_score`` as imported at the top of
    the file; neither ``math`` nor the bare ``sklearn`` package is imported
    there, so they are not referenced here.

    :param y_true: true target values
    :param y_pred: predicted values aligned with ``y_true``
    :return: None (results are printed)
    """
    rmse_value = mean_squared_error(y_true, y_pred) ** 0.5
    print(f'RMSE: {rmse_value:.2f}')

    # R2 is printed as-is: taking its square root mislabels the value and
    # fails outright when R2 is negative.
    r2_value = r2_score(y_true, y_pred)
    print(f'R2: {r2_value:.2f}')

In [None]:
def fix_column_names(df):
    """Rename the columns of ``df`` in place to lowercase with underscores for spaces.

    :param df: DataFrame whose column labels are strings
    :return: the same ``df`` object, with its columns renamed
    """
    df.columns = df.columns.str.replace(' ', '_').str.lower()
    return df

## Load/Prep Data

In [None]:
# Load the raw table; the first CSV column becomes the index and is parsed as dates.
# `on_bad_lines` (pandas >= 1.3) replaces `error_bad_lines`, which was removed in
# pandas 2.0. 'warn' keeps the old behaviour of skipping malformed rows while
# reporting them.
df = pd.read_csv('Sample_Grocery_Features.csv', index_col=[0], parse_dates=True, on_bad_lines='warn')

In [None]:
# Normalise the column labels (spaces -> underscores, lowercase) before any
# column is referenced by name below.
df = fix_column_names(df)
df.head()

In [None]:
# Drop every column that contains at least one missing value.
df = df.dropna(how='any', axis=1)

In [None]:
# Order the rows by the index.
df = df.sort_index()

In [None]:
# Encode the YES/NO flag columns as 1/0.
for col in ['seasonal?','allergy_warning','damaged?','on_sale']:
    df[col] = df[col].map({'YES': 1, 'NO': 0})

In [None]:
# Convert the currency columns to floats: keep the text after the "$" sign,
# remove the thousands separators, then cast.
for col in ['price_purchased','price_sold','weekly_sales','monthly_sales','quarterly_sales','yearly_sales']:
    df[col] = df[col].str.split("$").str[1]
    df[col] = df[col].str.replace(',','')
    df[col] = df[col].astype('float64')

In [None]:
# Parse every column whose name contains 'date' into datetimes, then print the
# resulting column summary.
for col in df.columns:
    if 'date' in col:
        df[col] = pd.to_datetime(df[col])
df.info()

In [None]:
df.head()

## Model Training

### Scale/Split Data

In [None]:
# NOTE(review): the callback object created here is neither assigned nor passed
# to any fit() call in this notebook, so this line appears to have no effect on
# training - confirm and remove if so.
early_stopping(stopping_rounds=1,verbose=-1)

In [None]:
# Single seed shared by the train/test split and the seeded models below.
random_state = 42

In [None]:
# Shared preprocessing objects: `scaler` standardises the features, `encoder`
# turns the date columns into integer codes.
scaler = StandardScaler()
encoder = LabelEncoder()

In [None]:
# Predict the sale price from every other column.
features = df.drop('price_sold',axis=1)
target = df['price_sold']

In [None]:
# Hold out 20% of the rows for testing.
features_train, features_test, target_train, target_test = train_test_split(features, target, 
                                                                            test_size=0.2, 
                                                                            random_state=random_state)

In [None]:
# Work on copies so the unscaled splits stay available (features_test later
# receives the prediction and error columns).
features_train_scaled = features_train.copy()
features_test_scaled = features_test.copy()

In [None]:
# Encode each date column as integer codes. The encoder is fitted once per
# column on the full `features` column and then applied to both splits, so a
# given date maps to the same code in train and test. Fitting separately on
# each split (fit_transform twice) gives the two splits unrelated codes.
# NOTE(review): `encoder` is refitted for every date column, so afterwards it
# only remembers the classes of the last one.
for col in df.columns:
    if 'date' in col:
        encoder.fit(features[col])
        features_train_scaled[col] = encoder.transform(features_train_scaled[col])
        features_test_scaled[col] = encoder.transform(features_test_scaled[col])

In [None]:
# Learn the scaling statistics from the training split only.
scaler.fit(features_train_scaled)

In [None]:
# Apply the training-split scaling to both splits.
features_train_scaled = scaler.transform(features_train_scaled)
features_test_scaled = scaler.transform(features_test_scaled)

In [None]:
def unscale_data(data):
    """Undo the feature scaling and the date encoding on ``data``.

    Relies on the module-level ``scaler`` and ``encoder`` having been fitted.

    :param data: scaled feature matrix (DataFrame or array) with the same
        column layout the scaler was fitted on
    :return: DataFrame with the scaling reversed and date columns decoded
    """
    # inverse_transform returns a plain array, so the column labels have to be
    # restored before columns can be addressed by name. Prefer the labels of
    # the input, then the names the scaler recorded at fit time (if any).
    columns = getattr(data, 'columns', None)
    if columns is None:
        columns = getattr(scaler, 'feature_names_in_', None)
    index = getattr(data, 'index', None)

    restored = pd.DataFrame(scaler.inverse_transform(data), columns=columns, index=index)

    for col in restored.columns:
        if isinstance(col, str) and 'date' in col:
            # Un-scaling yields floats; the encoder needs the integer codes back.
            codes = restored[col].round().astype(int)
            # NOTE(review): a single shared encoder can only decode the date
            # column it was fitted on last - confirm there is one date column.
            restored[col] = encoder.inverse_transform(codes)
    return restored

### Dummy Test

In [None]:
# Baseline model configured with the 'median' strategy.
dr = DummyRegressor(strategy='median')

In [None]:
# Time the fit (seconds, 2 decimals) for the model comparison table.
start = time.time()
dr.fit(features_train_scaled, target_train)
dr_fit_time = round(time.time()-start,2)

In [None]:
# RMSE of the baseline on the held-out test split.
dr_predictions = dr.predict(features_test_scaled)
dr_rmse = rmse(target_test, dr_predictions)

In [None]:
# 5-fold cross-validation on the training split, scored with the rmse helper.
cross_val_score(dr,features_train_scaled,target_train,scoring=make_scorer(rmse),verbose=10,cv=5)

In [None]:
# Signed error (prediction minus actual), stored next to the test features.
features_test['dr_error'] = (dr_predictions-target_test)

In [None]:
features_test['dr_predictions'] = dr_predictions

### Linear Regression

In [None]:
# Fit a linear regression on the scaled training split and time the fit.
lr = LinearRegression()
start = time.time()
lr.fit(features_train_scaled, target_train)
lr_fit_time = round(time.time()-start,2)

In [None]:
# RMSE on the held-out test split.
lr_predictions = lr.predict(features_test_scaled)
lr_rmse = rmse(target_test, lr_predictions)

In [None]:
# 5-fold cross-validation on the training split, scored with the rmse helper.
cross_val_score(lr,features_train_scaled,target_train,scoring=make_scorer(rmse),verbose=10,cv=5)

In [None]:
# Predictions rounded to 2 decimals, stored next to the test features.
features_test['lr_predictions'] = np.round(lr_predictions,2)

In [None]:
# Signed error (prediction minus actual), rounded to 2 decimals.
features_test['lr_error'] = np.round((lr_predictions-target_test),2)

### Other Models

#### Random Forest Regressor

In [None]:
# Random forest with 100 trees; max_features=0.3 is passed as a float.
rfr = RandomForestRegressor(random_state=random_state,
                            n_estimators=100,
                            max_features=0.3,
                            verbose=False,
                            warm_start=True)

In [None]:
# Time the fit (seconds, 2 decimals) for the model comparison table.
start = time.time()
rfr.fit(features_train_scaled, target_train)
rfr_fit_time = round(time.time()-start,2)

In [None]:
# RMSE on the held-out test split.
rfr_predictions = rfr.predict(features_test_scaled)
rfr_rmse = rmse(target_test,rfr_predictions)

In [None]:
# 5-fold cross-validation on the training split, scored with the rmse helper.
cross_val_score(rfr,features_train_scaled,target_train,scoring=make_scorer(rmse),verbose=10,cv=5)

In [None]:
# Rounded predictions and signed errors (prediction minus actual).
features_test['rfr_predictions'] = np.round(rfr_predictions,2)
features_test['rfr_error'] = np.round((rfr_predictions-target_test),2)

#### Decision Tree Regressor

In [None]:
# Single decision tree; max_features=None and no depth limit is passed.
dtr = DecisionTreeRegressor(random_state=random_state,
                            splitter='best',
                            max_features=None)

In [None]:
# Time the fit (seconds, 2 decimals) for the model comparison table.
start = time.time()
dtr.fit(features_train_scaled, target_train)
dtr_fit_time = round(time.time()-start,2)

In [None]:
# RMSE on the held-out test split.
dtr_predictions = dtr.predict(features_test_scaled)
dtr_rmse = rmse(target_test,dtr_predictions)

In [None]:
# 5-fold cross-validation on the training split, scored with the rmse helper.
cross_val_score(dtr,features_train_scaled,target_train,scoring=make_scorer(rmse),verbose=10,cv=5)

In [None]:
# Rounded predictions and signed errors (prediction minus actual).
features_test['dtr_predictions'] = np.round(dtr_predictions,2)
features_test['dtr_error'] = np.round((dtr_predictions-target_test),2)

#### LGBM Regressor

In [None]:
# Evaluation set handed to the boosting models' fit() calls.
# NOTE(review): this is the test split, which is also used for the final RMSE
# below - confirm that reusing it for early stopping is intended.
eval_set = [(features_test_scaled, target_test)]

In [None]:
# NOTE(review): both `num_iterations` and `n_estimators` are set, with different
# values; LightGBM's parameter docs list them as aliases - confirm which value
# is meant to apply.
lgbm = LGBMRegressor(random_state=random_state,
                     early_stopping_round=5,
                     num_iterations=1000,
                     n_estimators=100,
                     max_depth=5,
                     num_leaves=31,
                     learning_rate=0.01)

In [None]:
# Time the fit (seconds, 2 decimals) for the model comparison table.
# NOTE(review): 'error' does not look like one of LightGBM's regression metric
# names - confirm; 'rmse' would match the metric reported below.
start = time.time()
lgbm.fit(features_train_scaled, 
         target_train,
         eval_metric='error',
         eval_set=eval_set)
lgbm_fit_time = round(time.time()-start,2)

In [None]:
# RMSE on the held-out test split.
lgbm_predictions = lgbm.predict(features_test_scaled)
lgbm_rmse = rmse(target_test,lgbm_predictions)

In [None]:
# Rounded predictions and signed errors (prediction minus actual).
features_test['lgbm_predictions'] = np.round(lgbm_predictions,2)
features_test['lgbm_error'] = np.round((lgbm_predictions-target_test),2)

#### CatBoostRegressor

In [None]:
# NOTE(review): `num_leaves` is set without a `grow_policy`; CatBoost's docs
# tie the leaf-count limit to the Lossguide policy - confirm this combination
# is accepted by the installed version.
cbr = CatBoostRegressor(random_state=random_state,
                        early_stopping_rounds=5,
                        iterations=1000,
                        max_depth=None,
                        num_leaves=31,
                        learning_rate=0.01)

In [None]:
# Time the fit (seconds, 2 decimals); `eval_set` is the list built in the LGBM
# section above, and silent=True is passed to fit().
start = time.time()
cbr.fit(features_train_scaled, 
        target_train,
        eval_set=eval_set,
        silent=True)
cbr_fit_time = round(time.time()-start,2)

In [None]:
# Score the CatBoost model itself. Predicting with `lgbm` here (a copy-paste
# slip) made every CatBoost column and RMSE a duplicate of the LightGBM results.
cbr_predictions = cbr.predict(features_test_scaled)
cbr_rmse = rmse(target_test,cbr_predictions)

In [None]:
# 5-fold cross-validation on the training split, scored with the rmse helper.
cross_val_score(cbr,features_train_scaled,target_train,scoring=make_scorer(rmse),verbose=10,cv=5)

In [None]:
# Rounded predictions and signed errors (prediction minus actual).
features_test['cbr_predictions'] = np.round(cbr_predictions,2)
features_test['cbr_error'] = np.round((cbr_predictions-target_test),2)

## Error Analysis

In [None]:
# Show five random test rows with the purchase price next to the model
# predictions (the decision-tree column is not part of this selection).
features_test[['price_purchased',
               'dr_predictions',
               'lr_predictions',
               'rfr_predictions',
               'lgbm_predictions',
               'cbr_predictions']].sort_index().sample(5)

In [None]:
# Default figure size for the plots below.
sns.set(rc={'figure.figsize':(14,10)})

In [None]:
# One row per model: test RMSE and fit time in seconds.
rmse_fit = pd.DataFrame({'RMSE':[dr_rmse,lr_rmse,rfr_rmse,dtr_rmse,lgbm_rmse,cbr_rmse],
                         'Fit Time':[dr_fit_time,lr_fit_time,rfr_fit_time,dtr_fit_time,lgbm_fit_time,cbr_fit_time]},
                        index=['Dummy','Linear','Random Forest','Decision Tree','Light Gradient Boost','Cat Boost'])

In [None]:
rmse_fit

In [None]:
# Same table with models as columns.
rmse_fit.T

In [None]:
# Bar chart of RMSE per model, saved as PNG and PDF.
ax = rmse_fit[['RMSE']].plot(kind='bar',title='Root Mean Squared Error by Model')
ax.figure.savefig("RMSE_by_Model.png")
ax.figure.savefig("RMSE_by_Model.pdf")

In [None]:
# Bar chart of fit time per model, saved as PNG and PDF.
ax = rmse_fit[['Fit Time']].plot(kind='bar',title='Training_Time_by_Model')
ax.figure.savefig("Training_Time_by_Model.png")
ax.figure.savefig("Training_Time_by_Model.pdf")

In [None]:
# Join the actual sale price with every model's predictions and errors
# (aligned on the index), then order the rows by index.
df_price_analysis = pd.concat([target_test,features_test[['dr_predictions','dr_error','lr_predictions','lr_error','dtr_predictions','dtr_error','cbr_predictions',
                                                    'rfr_predictions','rfr_error','lgbm_predictions','lgbm_error','cbr_error']]],
                       axis=1,
                       join='outer')
df_price_analysis = df_price_analysis.sort_index()

In [None]:
# Feature importances of the four tree-based models, one column per model and
# one row per training feature.
df_feature_importance_analysis = pd.DataFrame(data={'cbr':cbr.feature_importances_,
                                                    'lgbm':lgbm.feature_importances_,
                                                    'rfr':rfr.feature_importances_,
                                                    'dtr':dtr.feature_importances_},
                                              index=features_train.columns)

In [None]:
# Rescale each model's importances to sum to 1 so the columns are comparable.
for col in df_feature_importance_analysis.columns:
    total = df_feature_importance_analysis[col].sum()
    df_feature_importance_analysis[col] = df_feature_importance_analysis[col]/total

In [None]:
# Combined importance: sum of the four normalised columns.
df_feature_importance_analysis['sum'] = (df_feature_importance_analysis['cbr'] + df_feature_importance_analysis['lgbm'] + 
                                        df_feature_importance_analysis['rfr'] + df_feature_importance_analysis['dtr'])

In [None]:
# Rankings: by combined importance first, then by each model in turn.
df_feature_importance_analysis.sort_values('sum',ascending=False)

In [None]:
df_feature_importance_analysis.sort_values('cbr',ascending=False)

In [None]:
df_feature_importance_analysis.sort_values('lgbm',ascending=False)

In [None]:
df_feature_importance_analysis.sort_values('rfr',ascending=False)

In [None]:
df_feature_importance_analysis.sort_values('dtr',ascending=False)

In [None]:
# Bars grouped by model (transposed table), saved as PNG and PDF.
ax = df_feature_importance_analysis.T.plot(kind='bar',title='Importance by Model')
ax.figure.savefig("Importance_By_Model_Bar.png")
ax.figure.savefig("Importance_By_Model_Bar.pdf")

In [None]:
# Bars grouped by feature, saved as PNG and PDF.
ax = df_feature_importance_analysis.plot(kind='bar',title='Importance_by_Feature')
ax.figure.savefig("Importance_By_Feature_Bar.png")
ax.figure.savefig("Importance_By_Feature_Bar.pdf")

In [None]:
# Running sum of each model's signed error over all test rows (index order).
ax = df_price_analysis[
     ['lr_error','rfr_error','lgbm_error','cbr_error','dtr_error']
     ].cumsum().plot(
     title='Cumulative Error',
     alpha=1)
ax.figure.savefig("Error_All_Prices_Line.png")
ax.figure.savefig("Error_All_Prices_Line.pdf")

In [None]:
# Same plot restricted to rows whose sale price is below the median.
ax = df_price_analysis[df_price_analysis['price_sold']<df_price_analysis['price_sold'].median()][
    ['lr_error','rfr_error','lgbm_error','cbr_error','dtr_error']
    ].cumsum().plot(
    title='Error Lower Than Median',
    alpha=1)
ax.figure.savefig("Error_Small_Prices_Line.png")
ax.figure.savefig("Error_Small_Prices_Line.pdf")

In [None]:
# Same plot restricted to rows whose sale price is above the median.
ax = df_price_analysis[df_price_analysis['price_sold']>df_price_analysis['price_sold'].median()][
    ['lr_error','rfr_error','lgbm_error','cbr_error','dtr_error']
    ].cumsum().plot(
    title='Error Higher Than Median',
    alpha=1)
ax.figure.savefig("Error_Large_Prices_Line.png")
ax.figure.savefig("Error_Large_Prices_Line.pdf")

In [None]:
# Cumulative signed error for rows whose sale price lies within 0.5 of the
# median. The distance has to be taken between each price and the median;
# comparing the price against abs(median - 0.5) selects low prices instead of
# prices near the median.
median_price = df_price_analysis['price_sold'].median()
near_median = (df_price_analysis['price_sold'] - median_price).abs() < 0.5
ax = df_price_analysis[near_median][
    ['lr_error','rfr_error','lgbm_error','cbr_error','dtr_error']
    ].cumsum().plot(
    title='Error Around Median',
    alpha=1)
ax.figure.savefig("Error_Average_Prices_Line.png")
ax.figure.savefig("Error_Average_Prices_Line.pdf")

In [None]:
# Export the purchase price with every model's predictions, ordered by index.
features_test[['price_purchased',
               'dr_predictions',
               'lr_predictions',
               'dtr_predictions',
               'rfr_predictions',
               'lgbm_predictions',
               'cbr_predictions']].sort_index().to_csv("Price_and_Predictions.csv") 

In [None]:
# Export the importance table (one row per feature).
df_feature_importance_analysis.to_csv("Importance_by_Feature.csv")

In [None]:
# Export the transposed importance table (one row per model).
df_feature_importance_analysis.T.to_csv("Importance_by_Model.csv")

In [None]:
# Export actual sale prices together with all predictions and errors.
df_price_analysis.to_csv("Price_With_Predictions.csv")