In [None]:
from datetime import timedelta
from itertools import groupby

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import Imputer

# Creating and Reverting Dummies
The following two functions help convert a dataframe (X_train) with categorical variables (cat_var) and numerical variables (num_var) into a final dataframe that contains the numerical columns plus the dummy columns, and convert such a dataframe back to its categorical form.

In [None]:
def get_dummies_into_df(X_train, cat_var, num_var):
    """Return the numerical columns of ``X_train`` plus one-hot dummies for ``cat_var``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame holding both the categorical and the numerical columns.
        It is not modified.
    cat_var : list of str
        Columns to expand with ``pd.get_dummies``; each dummy column is named
        ``<variable>_<category>``.
    num_var : list of str
        Columns copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        ``num_var`` columns first, followed by the dummy columns of each
        categorical variable in ``cat_var`` order. The original categorical
        columns are not included.
    """
    # Every dummy frame carries X_train's own index, so one column-wise concat
    # lines the rows up. (Chained DataFrame.join calls merge on index labels
    # instead, which multiplies rows whenever a label occurs more than once.)
    pieces = [X_train[list(num_var)]]
    pieces.extend(pd.get_dummies(X_train[var], prefix=var) for var in cat_var)

    return pd.concat(pieces, axis=1)

In [None]:
def back_from_x_dummies(df, cat_var, num_var, data = 'not_checking'):
    """Collapse ``<variable>_<category>`` dummy columns back into one column per variable.

    Parameters
    ----------
    df : pandas.DataFrame
        All-numerical frame containing the ``num_var`` columns and the dummy
        columns of every variable in ``cat_var``.
    cat_var : list of str
        Original categorical variable names; their dummies are the columns that
        start with ``<variable>_``.
    num_var : list of str
        Numerical columns, copied through unchanged and never treated as dummies.
    data : pandas.DataFrame or str, optional
        When a DataFrame is given, each restored column is compared with
        ``data[variable]`` (missing values replaced by ``'missing'``) and the
        boolean outcome is printed. Any non-DataFrame value disables the check.
        The check is not meaningful after SMOTE because rows were synthesised.

    Returns
    -------
    pandas.DataFrame
        ``num_var`` columns followed by one restored column per categorical
        variable. A row takes the category whose dummy value is largest, or
        ``'missing'`` when its dummies sum to zero. A variable with no dummy
        columns in ``df`` produces no output column.
    """
    prefixes = {var: var + '_' for var in cat_var}

    def owning_variable(col):
        # Match on the start of the name only, and let the longest prefix win,
        # so 'shoesize_9' is not claimed by 'size' and 'size_type_A' belongs to
        # 'size_type' rather than 'size'.
        owners = [var for var, prefix in prefixes.items() if str(col).startswith(prefix)]
        return max(owners, key=len) if owners else None

    restored_by_var = {}

    for ori_col in cat_var:
        prefix = prefixes[ori_col]
        dummy_columns = [col for col in df.columns
                         if col not in num_var and owning_variable(col) == ori_col]
        if not dummy_columns:
            # Nothing to merge for this variable.
            continue

        dummy_df = df[dummy_columns]

        # Rows whose dummies are all zero had no category originally.
        all_zero = dummy_df.sum(axis=1) == 0

        # Strongest dummy per row, with the '<variable>_' prefix sliced off
        # (slicing keeps category values that themselves contain the prefix).
        strongest = dummy_df.idxmax(axis=1)
        labels = strongest.map(lambda col: col[len(prefix):] if isinstance(col, str) else col)

        restored = labels.where(~all_zero, 'missing').rename(ori_col)
        restored_by_var[ori_col] = restored

        if isinstance(data, pd.DataFrame):
            # Check result
            # You may ignore checking if using SMOTE
            expected = data[ori_col].astype(object).fillna('missing')
            print(list(restored) == list(expected))

    data_final = pd.concat([df[list(num_var)]] + list(restored_by_var.values()), axis=1)

    return data_final

# SMOTE
Using an oversampling class from imbalanced-learn, SMOTE or BorderlineSMOTE (the user may change this in the code below), the function takes the X variables (column names: df_train_columns) and y_train (column name: target_var) and performs oversampling.<br>
Please note that the X variables must be converted to numerical values and must contain no missing values. It is therefore suggested to use this function together with get_dummies_into_df and the Imputer class.

In [None]:
def oversampling_SMOTE(X_train, y_train, df_train_columns, target_var):
    """Oversample the training data with SMOTE and return it as DataFrames.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Fully numerical feature matrix without missing values.
    y_train : list, array or pandas.Series
        Labels; the printed proportions count the values 0 and 1.
    df_train_columns : sequence of str
        Column names given to the oversampled feature frame.
    target_var : str
        Column name given to the oversampled label frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Oversampled features and a single-column frame of oversampled labels.
    """
    # Work on a flat array so the class counts are correct for plain lists too
    # (comparing a list with 0 yields a single False, not an element-wise mask).
    labels = np.asarray(y_train).ravel()
    n_original = len(X_train)

    # Check the numbers of our data
    print("Length of original data is ",n_original)
    print("Proportion of AVERAGE sellers in original data",(labels == 0).sum()/n_original)
    print("Proportion of BEST sellers in original data",(labels == 1).sum()/n_original)

    # Counter oversampling
    sampler = SMOTE(random_state=0) #or
    # sampler = BorderlineSMOTE(random_state=0)

    # Newer imbalanced-learn releases expose fit_resample; older ones only fit_sample.
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    os_data_X, os_data_y = resample(X_train, labels)

    os_data_X = pd.DataFrame(data=os_data_X,columns=df_train_columns)
    os_data_y = pd.DataFrame(data=os_data_y,columns=[target_var])

    n_oversampled = len(os_data_X)
    print("Length of oversampled data is ",n_oversampled)
    print("Proportion of AVERAGE sellers data in oversampled data is ",len(os_data_y[os_data_y[target_var]==0])/n_oversampled)
    print("Proportion of BEST sellers data in oversampled data is ",len(os_data_y[os_data_y[target_var]==1])/n_oversampled)

    return os_data_X, os_data_y

# Basic Functions
The following definitions help to save code lines.

In [None]:
def get_rmse(predicted, actual):
    """Root-mean-squared error between ``predicted`` and ``actual``.

    Both arguments must support element-wise subtraction and the result must
    offer a ``.mean()`` method (e.g. numpy arrays or pandas Series).
    """
    squared_error = (predicted - actual) ** 2
    mean_squared_error_value = squared_error.mean()
    return mean_squared_error_value ** .5

In [None]:
def round_half_up(n, decimals=0):
    """Round ``n`` to ``decimals`` places, with ties going towards +infinity.

    Works on scalars and numpy arrays alike because it is built on ``np.floor``.
    """
    scale = 10 ** decimals
    shifted = n*scale + 0.5
    return np.floor(shifted) / scale

# Modelling
The following function is the actual LightGBM function; it wraps all the code above in the flow outlined in the README file of this repository. Please refer to that file for more information.

In [None]:
def get_lightgbm_result_best(data, cat_var, num_var, target_var, final_target,
                        train_till, fold_by, future = 'unfilled',
                        n_splits = 5, asym_val = 1.0, sample_size = 50000,
                        params = {'num_leaves' : 31}, metrics = ["mse", 'mae']):
    """Train LightGBM with grouped K-fold CV and per-fold SMOTE, then score a time-based hold-out.

    Flow: rows with ``FirstSalesDate`` before ``train_till`` are used for
    training, the rest form the hold-out set. For every GroupKFold split the
    training part is one-hot encoded, mean-imputed, oversampled with SMOTE,
    optionally subsampled, converted back to categorical columns and used to
    fit an ``lgb.LGBMRegressor``. Hold-out (and ``future``) predictions are
    averaged over the folds.

    Side effects: ``data[cat_var]`` (and ``future[cat_var]``) are converted to
    ``category`` dtype and ``data['FirstSalesDate']`` to datetime, in place on
    the caller's frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cat_var``, ``num_var``, ``target_var``, ``final_target``,
        ``fold_by`` and ``'FirstSalesDate'``.
    cat_var, num_var : list of str
        Categorical and numerical feature names.
    target_var : str
        Column to predict. If the name starts with ``'log'`` the hold-out RMSE
        is computed after ``np.exp`` on both predictions and actuals.
    final_target : str
        Column whose rank quartiles define the ``qty_bins`` used for the
        per-quartile RMSE columns.
    train_till : str or datetime-like
        Train / hold-out split date (rows on or after it are hold-out).
    fold_by : str
        Column whose values keep rows together in the same fold.
    future : pandas.DataFrame or str
        Frame to forecast; any string means "no forecast".
    n_splits : int
        Number of GroupKFold splits.
    asym_val : float
        ``1.0`` uses the ``'binary'`` objective; any other value uses a custom
        squared-error objective whose positive residuals are weighted by it.
    sample_size : int, ``'NULL'`` or None
        Rows drawn at random (without replacement) from the oversampled data
        per fold; capped at the rows available. ``'NULL'``/``None`` disables it.
        The draw uses the global numpy random state, so it is not seeded here.
    params : dict
        Extra LightGBM parameters; they override the objective chosen above.
    metrics : list of str
        Passed to LightGBM as ``metrics``.

    Returns
    -------
    tuple
        ``(gbm, predictions, oof, result)`` and additionally ``forecasts`` when
        ``future`` is not a string. ``gbm`` is the model from the last fold,
        ``predictions`` the fold-averaged hold-out predictions, ``oof`` the
        out-of-fold predictions on the training rows, and ``result`` a one-row
        DataFrame with the settings and the overall / per-quartile RMSE.
    """
    has_future = not isinstance(future, str)
    bin_labels = ["0-25", "25-50", "50-75","75-100"]

    # In-place dtype conversion on the caller's frames (see docstring).
    for column in cat_var:
        data[column] =  data[column].astype('category')

    if has_future:
        for column in cat_var:
            future[column] =  future[column].astype('category')

    df_train_columns = cat_var + num_var

    # Time-based split into training rows and the hold-out set.
    data['FirstSalesDate'] = pd.to_datetime(data['FirstSalesDate'])
    split_date = pd.to_datetime(train_till)
    test = data[data['FirstSalesDate'] >= split_date].copy()
    train = data[data['FirstSalesDate'] < split_date].copy()

    # Ranking first makes every value unique, so the four bins are equal-sized.
    test['qty_bins'] = pd.qcut(test[final_target].rank(method='first'), 4,
                               labels=bin_labels, duplicates='drop')

    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    if has_future:
        forecasts = np.zeros(len(future))

    X = train[df_train_columns]
    y = train[target_var]
    groups = train[fold_by]
    del(train)

    def custom_asymmetric_objective(y_true, y_pred):
        # Squared error whose gradient/hessian are scaled by asym_val when the
        # model under-predicts (residual > 0).
        residual = (y_true - y_pred).astype("float")
        grad = np.where(residual>0, -2*asym_val*residual, -2*residual)
        hess = np.where(residual>0, 2*asym_val, 2.0)
        return grad, hess

    objective = 'binary' if asym_val == 1.0 else custom_asymmetric_objective

    group_kfold = GroupKFold(n_splits=n_splits)

    for fold_, (trn_idx, val_idx) in enumerate(group_kfold.split(X, y, groups), start=1):

        print('Model performing Fold', str(fold_))

        # Split data
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        # Get dummies
        X_train_dumm = get_dummies_into_df(X_train, cat_var, num_var)

        # Impute missing values
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        X_train_fit = imp.fit_transform(X_train_dumm)

        # SMOTE
        X_train, y_train = oversampling_SMOTE(X_train_fit, y_train, X_train_dumm.columns, target_var)
        del(X_train_fit, X_train_dumm)

        # Limited space on laptop to run!
        # Best if can batch process
        if sample_size is not None and sample_size != 'NULL':
            # Never ask for more rows than exist: sampling without replacement
            # raises when the requested size exceeds the population.
            n_draw = min(sample_size, len(y_train))
            idx = list(np.random.choice(y_train.index.values, n_draw, replace=False))
            y_train = y_train.iloc[idx].copy()
            X_train = X_train.iloc[idx].copy()
            print("Proportion of BEST sellers in sampled data is ",
                  len(y_train[y_train[target_var]==1])/len(X_train))

        # Back to model variables, reset cat
        X_train = back_from_x_dummies(X_train, cat_var, num_var, data = 'not_checking')
        for column in cat_var:
            X_train[column] =  X_train[column].astype('category')
        X_train = X_train[df_train_columns]

        # Modelling
        # NOTE(review): X_val keeps the category dtype of the original frame while
        # X_train was rebuilt from dummies; confirm LightGBM accepts both as-is.
        gbm = lgb.LGBMRegressor(random_state=33)
        gbm.set_params(**{**{'objective': objective}, **params}, metrics = metrics, silent=False)

        gbm.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='l2',
            verbose=False,
        )

        oof[val_idx] = gbm.predict(X_val)

        # predict: running mean of the hold-out predictions over the folds so far
        predictions = predictions*(fold_-1)
        predictions += gbm.predict(test[df_train_columns])
        predictions = predictions/ fold_

        # forecast: same running mean for the future frame
        if has_future:
            forecasts = forecasts*(fold_-1)
            forecasts += gbm.predict(future[df_train_columns])
            forecasts = forecasts/ fold_

    # Hold-out RMSE overall and per quartile of final_target; log targets are
    # exponentiated first so the error is on the original scale.
    to_original_scale = np.exp if target_var[:3]=='log' else (lambda values: values)
    actual = test[target_var]

    test_rmse = get_rmse(to_original_scale(predictions), to_original_scale(actual))
    bin_rmse = []
    for label in bin_labels:
        in_bin = (test['qty_bins'] == label).values
        bin_rmse.append(get_rmse(to_original_scale(predictions[in_bin]),
                                 to_original_scale(actual[in_bin])))

    result = pd.DataFrame(
        [(cat_var, num_var, target_var, final_target,
          train_till, fold_by, n_splits, asym_val, metrics,
          test_rmse, *bin_rmse)],
        columns=['cat_var', 'num_var', 'target_var', 'final_target',
                 'train_till', 'fold_by', 'n_splits', 'asym_val', 'metrics',
                 'test_rmse', 'test_rmse_25', 'test_rmse_50', 'test_rmse_75', 'test_rmse_100'])

    if has_future:
        return gbm, predictions, oof, result, forecasts
    return gbm, predictions, oof, result

# Run Model
Now we can run the model by inputting our data, variables, and parameters.
The result table helps with error analysis across another y variable (in my use case, the actual sold quantity). This helps me check where my model's predictions (Is_Sold) miss.

In [None]:
%%time

# Define variables
data = data #put dataset here
cat_var = ['colour', 'size', 'details'] #list of categorical variables
num_var= ['heelheight', 'price'] #list of numerical variables 
target_var = 'Is_Sold' #str of column name
final_target = 'TotalSoldQty' #str of column name for error analysis

train_till = '06-01-2019' #str of train-test split by time
fold_by = 'article' #str of column name to group fold by
future = 'NULL' #str or data for forecasting

n_splits = 10 #number of kfold splits run
asym_val = 1 #if 1, runs binary objective. else runs asymmetric custom objective
sample_size = 50000 #to turn of random sampling, put 'NULL'
params = {'num_leaves' : 31} #gbm params, see documentation https://lightgbm.readthedocs.io/en/latest/Parameters.html

# Run function
gbm, predictions, oof, result, forecasts = get_lightgbm_classifier(
    data, cat_var, num_var, target_var, final_target,
    train_till, fold_by, future = future,
    n_splits = n_splits, asym_val = asym_val, sample_size = sample_size,
    params = params, metrics = ["mse", 'mae'])

# Print result
print(result)

# Review Results
Review the results using standard classification evaluation methods such as precision, recall, and the ROC curve.

In [None]:
# Rebuild the same time-based split that the modelling function used, so that
# `oof` lines up with the training rows and `predictions` with the hold-out rows.
# Parsing the dates here keeps the cell valid even if the column is still text.
first_sale = pd.to_datetime(data['FirstSalesDate'])
split_date = pd.to_datetime(train_till)
test = data[first_sale >= split_date].copy()
train = data[first_sale < split_date].copy()

# Turn scores into 0/1 labels: round half up, then anything <= 0 becomes 0 and
# everything else 1. The same rule is applied to every set so the confusion
# matrices below are comparable.
train['pred_Is_Sold'] = np.where(round_half_up(np.asarray(oof)) <= 0, 0, 1)
test['pred_Is_Sold'] = np.where(round_half_up(np.asarray(predictions)) <= 0, 0, 1)

# `future` is a placeholder string when no forecast was requested.
# Assign the array directly so values are placed by position, not by index label.
if isinstance(future, pd.DataFrame):
    future['pred_Is_Sold'] = np.where(round_half_up(np.asarray(forecasts)) <= 0, 0, 1)

pred_data = pd.concat([train, test])
pred_data.head()

In [None]:
# TRAIN Confusion Matrix

confusion_matrix = pd.crosstab(train['pred_Is_Sold'], train[target_var], rownames=['Pred'], colnames=['Actual'])

# The diagonal only counts correct predictions when rows and columns carry the
# same labels in the same order, so check that instead of just the row count
# (a one-column or mismatched table would otherwise fail or mislead).
if len(confusion_matrix) > 0 and list(confusion_matrix.index) == list(confusion_matrix.columns):
    accuracy_rate = round((np.trace(confusion_matrix.values) / confusion_matrix.values.sum())*100,2)
    print("Accuracy:", accuracy_rate,"%")

confusion_matrix

In [None]:
# TEST Confusion Matrix

confusion_matrix = pd.crosstab(test['pred_Is_Sold'], test[target_var], rownames=['Pred'], colnames=['Actual'])

# The diagonal only counts correct predictions when rows and columns carry the
# same labels in the same order, so check that instead of just the row count
# (a one-column or mismatched table would otherwise fail or mislead).
if len(confusion_matrix) > 0 and list(confusion_matrix.index) == list(confusion_matrix.columns):
    accuracy_rate = round((np.trace(confusion_matrix.values) / confusion_matrix.values.sum())*100,2)
    print("Accuracy:", accuracy_rate,"%")

confusion_matrix

In [None]:
# Precision and Recall

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(test[target_var], test['pred_Is_Sold']))

In [None]:
# Plot ROC Curve

# Score the hold-out rows once and reuse the scores for both the AUC and the curve.
# `gbm` is the model returned by the modelling function (the last fold's model).
test_scores = gbm.predict(test[cat_var+num_var])
logit_roc_auc = roc_auc_score(test[target_var], test_scores)
fpr, tpr, thresholds = roc_curve(test[target_var], test_scores)
plt.figure()
plt.plot(fpr, tpr, label='LightGBM (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()