# Logistic Regression Train Model
#### Note: If you are not retraining the model, you only need to execute the Score Notebook on the refreshed data. If you are retraining the model, update the version number before serializing the model to disk so that models are versioned over time for comparisons.

### Load all required modules including Oracle connection and  Data Processing Functions.

In [None]:
import os 
import cx_Oracle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import statsmodels.api as sm
import joblib
import datetime
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold, GridSearchCV
from statistics import mean
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, KMeansSMOTE

# Remember the notebook's working directory so it can be restored once the
# helper modules are imported.  os.getcwd() is used instead of the `!cd` /
# `!pwd` shell magics so the value exists on every platform (previously the
# unsupported-OS branch left `state` undefined and the restore step failed).
# Kept as a one-element list so any existing `state[0]` reference still works.
state = [os.getcwd()]

try:
    # Update path to where function file resides
    if os.name == 'nt':
        # Load DB Connection File from Windows Machine
        os.chdir(r'directory name')
        from db_connection import oracle_connection

        # Load function file from Windows Machine
        os.chdir(r'directory name')
        from general_functions import *
    elif os.name == 'posix':
        # Load DB Connection File from Mac Machine
        os.chdir('directory name')
        from db_connection import oracle_connection

        # Load function file from Mac Machine
        os.chdir('directory name')
        from general_functions import *
    else:
        print('No OS!')
finally:
    # Change directory back to working Jupyter Notebook Directory after importing
    # connection module -- done in `finally` so a failed import cannot leave the
    # kernel in the helper directory.
    os.chdir(state[0])

# Date stamp (YYYYMMDD) for the current run.
todays_date = datetime.date.today().strftime('%Y%m%d')

# Show every column and row when displaying DataFrames.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Create DB Connection String

In [None]:
# Build the Oracle connection and cursor used by the queries in this notebook.
# Windows and posix follow the same steps; only the posix connection passes an
# explicit encoding.
if os.name in ('nt', 'posix'):
    # Update path to where config file resides
    db_creds = os.path.expanduser('~') + 'directory name'
    creds = oracle_connection(db_creds)

    # Connect string in host:port/database form
    url = creds['host'] + ":" + creds['port'] + "/" + creds['database']

    extra_connect_args = {'encoding': 'UTF-8'} if os.name == 'posix' else {}
    db = cx_Oracle.connect(creds['user'], creds['password'], url, **extra_connect_args)

    cursor = db.cursor()
else:
    print('No OS!')

### Send query to Oracle database and return as Pandas DF

In [None]:
# Rerun the train sample to include the new data sources

query = """
        
        """

df = pd.read_sql(query, cursor.connection)

In [None]:
LABEL_VAL = 'column'

In [None]:
df.head()

In [None]:
df.shape

# Data Pre-processing steps

In [None]:
df_tr = df.copy()
df_tr = df_tr.drop(['column'], axis = 1)

### Replace all missing values with 'Unknown' or 0 depending on the Data Type of column

In [None]:
dtype_dict_value = replace_values(df_tr, char_value = 'Unknown')

df_tr.fillna(value = dtype_dict_value, inplace = True)
df_tr.describe(include = 'all')

### Convert data into correct buckets

In [None]:
# Replace underscores with spaces in the column's string values.
df_tr['column'] = df_tr['column'].str.replace('_', ' ')

# Re-bucket values: rows whose current value equals the left-hand literal are
# overwritten with the right-hand literal.  The statements run in order, so a
# later mapping sees the result of the earlier ones.
df_tr.loc[(df_tr['column'] == 'value'),'column'] = 'value'
df_tr.loc[(df_tr['column'] == 'value'),'column'] = 'value'
df_tr.loc[(df_tr['column'] == 'value'),'column'] = 'value'
# NOTE(review): 'valule' looks like a typo for 'value' -- confirm before relying on this bucket.
df_tr.loc[(df_tr['column'] == 'value'),'column'] = 'valule'

### Convert all columns that are Factor Levels or Flag columns into Category data types

In [None]:
# One Hot Encoding - Extra Columns (Can Pass Values in list)
df_tr, forward_mapping_dict, inv_mapping_dict, encoder = convert_cat_to_cat_lvl(df_tr, encode_method = 'OneHot')

# Numeric Encoding - Inplace (Can Pass Values in List)
#df_tr, forward_mapping_dict, inv_mapping_dict, encoder = convert_cat_to_cat_lvl(df_tr, encode_method = 'Numeric')

df_tr.head()

### Remove Highly Correlated Values

In [None]:
corr_df, dtr_unique_corr_cols = corr_vars(df_tr, corr_threshold = .95)
%store dtr_unique_corr_cols

In [None]:
df_tr = df_tr.drop(dtr_unique_corr_cols, axis = 1)
df_tr.shape

### Standardize columns using Scaler Fit

In [None]:
# standardize_cols returns the standardized frame plus the fitted scaler; the
# scaler is serialized later in this notebook and reused to transform the
# validation data.
# NOTE(review): this runs on the full data set before the train/test split, so
# presumably the scaler statistics also include the held-out rows -- confirm
# whether the scaler should be fit on the training partition only.
df_std, scaler_fit = standardize_cols(df_tr, LABEL_VAL)

df_std.head()

# Split Data into Train/Test

In [None]:
# Hold out 33% of the rows for testing.  The split is stratified on the label so
# both partitions keep the class proportions of the full data set; the label is
# imbalanced (it is oversampled with SMOTE further down), and an unstratified
# split can leave the test set with a different share of the minority class.
train_df, test_df = train_test_split(df_std
                                     , test_size = 0.33
                                     , random_state = 5
                                     , stratify = df_std[LABEL_VAL])
print(train_df.shape, test_df.shape)

In [None]:
# Separate the label column from the feature columns in each partition.
train_features = train_df.drop(columns = [LABEL_VAL])
train_label = train_df[LABEL_VAL]

test_features = test_df.drop(columns = [LABEL_VAL])
test_label = test_df[LABEL_VAL]

# Class frequencies per partition.
train_label_count = train_label.value_counts()
test_label_count = test_label.value_counts()

print("Train\n", train_label_count, "\nTest\n", test_label_count)

# Use SMOTE to generate a more balanced data set

In [None]:
# Can pass 'smote', 'borderline', 'svm', 'kmeans' as smote_method
over_samp = generate_smote_sample(smote_method = 'smote', n_jobs = 16)

# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# was deprecated and later removed, so calling it fails on recent versions.
over_samp_feat, over_samp_label = over_samp.fit_resample(train_features, train_label)
over_samp_columns = train_features.columns

# Re-wrap the resampled output as pandas objects carrying the training column names.
over_samp_feat = pd.DataFrame(data = over_samp_feat, columns = over_samp_columns)
over_samp_label = pd.DataFrame(data = over_samp_label, columns = [LABEL_VAL])

os_train_label = over_samp_label[LABEL_VAL]

# Count each class once and reuse the counts for the totals and proportions below.
os_total_rows = len(over_samp_feat)
os_no_target_rows = len(over_samp_label[over_samp_label[LABEL_VAL] == 0])
os_target_rows = len(over_samp_label[over_samp_label[LABEL_VAL] == 1])

# we can Check the numbers of our data
print("Length of oversampled data is ",os_total_rows)
print("Number of no target products in oversampled data",os_no_target_rows)
print("Number of target products",os_target_rows)
print("Proportion of no target data in oversampled data is ",os_no_target_rows/os_total_rows)
print("Proportion of target data in oversampled data is ",os_target_rows/os_total_rows)

In [None]:
os_train_label.value_counts()

# Logistic Regression Model on SMOTE Sample

In [None]:
# L1-penalised logistic regression.  n_jobs is intentionally not passed:
# scikit-learn ignores it for the 'liblinear' solver and warns when it is > 1.
lr_model = LogisticRegression(solver = 'liblinear', penalty = 'l1', random_state = 5)

# Fit on the oversampled training data and evaluate on the untouched test partition.
lr_model_fit, lr_model_summary = generate_model_summary(df_std
                                                     , lr_model
                                                     , over_samp_feat
                                                     , os_train_label
                                                     , test_features
                                                     , test_label
                                                     , is_tree_model = False)

# Summary positions used here (as labelled in the print below):
# 0 accuracy score, 1 confusion list, 2 metrics report, 3 fpr, 4 tpr, 6 ROC score.
# Position 5 is not used in this notebook.
lr_model_score = lr_model_summary[0]
lr_model_confusion_list = lr_model_summary[1]
lr_model_metrics = lr_model_summary[2]
lr_fpr = lr_model_summary[3]
lr_tpr = lr_model_summary[4]
lr_roc_score = lr_model_summary[6]

print("Model Accuracy Score: ", lr_model_score
      , "\nModel Confusion List (TN, FP, FN, TP): ", lr_model_confusion_list
      , "\nModel Summary: \n", lr_model_metrics
      , "\nROC Score: ", lr_roc_score)

In [None]:
# Fit a statsmodels Logit on the same oversampled data (BFGS optimiser) and
# print its summary table.
# NOTE(review): no constant column is added here (e.g. via sm.add_constant), so
# unless the features already contain one this model has no intercept term --
# confirm that is intended.
logit_model_os = sm.Logit(os_train_label, over_samp_feat)
lr_result_os = logit_model_os.fit(method = 'bfgs')
print(lr_result_os.summary2())

In [None]:
# Coefficient table from the statsmodels fit.
sum_df = lr_result_os.summary2().tables[1]

# INFLUENCE = per-feature standard deviation of the oversampled data * fitted coefficient.
sum_df["INFLUENCE"] = (np.std(over_samp_feat, 0)*sum_df["Coef."])

# Sort once and reuse the result for both the displayed table and the feature list.
# NOTE(review): the sort is on the signed value, so features with a large
# negative influence never reach the top 20 -- confirm that is intended.
sort_sum_df = sum_df.sort_values(by = "INFLUENCE", axis = 0, ascending = False).head(20)

# display() is needed here: a bare expression is only rendered by Jupyter when
# it is the last statement of the cell, so the table was previously never shown.
display(sort_sum_df[["Coef.", "P>|z|", "INFLUENCE"]])

dtr_lr_import_feats = sort_sum_df.index.values

print(dtr_lr_import_feats)

#%store dtr_lr_import_feats - V1 Code

# Random Forest on SMOTE Sample

##### TOP Hyper Parameters: n_estimators = 135, max_depth = 35

In [None]:
# Random forest fitted on the oversampled training data.
# NOTE(review): the heading above this cell lists n_estimators = 135 and
# max_depth = 35 as the top hyper parameters, while this cell uses 250 / 30 --
# confirm which setting is current.
rf_model = RandomForestClassifier(
    n_estimators = 250,
    max_depth = 30,
    random_state = 5,
    n_jobs = -1,
)

rf_model_fit, rf_model_summary = generate_model_summary(
    df_std,
    rf_model,
    over_samp_feat,
    os_train_label,
    test_features,
    test_label,
    is_tree_model = True,
)

# Pull the individual results out of the summary (position 5 is not used here).
rf_model_score = rf_model_summary[0]
rf_model_confusion_list = rf_model_summary[1]
rf_model_metrics = rf_model_summary[2]
rf_fpr = rf_model_summary[3]
rf_tpr = rf_model_summary[4]
rf_roc_score = rf_model_summary[6]

print(
    "Model Accuracy Score: ", rf_model_score,
    "\nModel Confusion List (TN, FP, FN, TP): ", rf_model_confusion_list,
    "\nModel Summary: \n", rf_model_metrics,
    "\nROC Score: ", rf_roc_score,
)

In [None]:
# Pair every training feature name with the importance the fitted forest assigned to it.
rf_import_df = pd.DataFrame({'FEATURE_NAME': train_features.columns
                             , 'FEATURE_IMPORTANCE': rf_model_fit.feature_importances_})

# Twenty most important features, highest first; shown as the cell output.
rf_top_features = rf_import_df.sort_values(by = ['FEATURE_IMPORTANCE'], axis = 0, ascending = False).head(20)
rf_import_cols = list(rf_top_features['FEATURE_NAME'])
rf_top_features

# XGBoost Model on SMOTE Sample

In [None]:
# Gradient-boosted trees fitted on the oversampled training data.
xgb_model = xgb.XGBClassifier(
    n_estimators = 1000,
    max_depth = 6,
    learning_rate = 0.05,
    random_state = 5,
    n_jobs = 4,
)

xgb_model_fit, xgb_model_summary = generate_model_summary(
    df_std,
    xgb_model,
    #train_features,
    #train_label,
    over_samp_feat,
    os_train_label,
    test_features,
    test_label,
    is_tree_model = True,
)

# Pull the individual results out of the summary (position 5 is not used here).
xgb_model_score = xgb_model_summary[0]
xgb_model_confusion_list = xgb_model_summary[1]
xgb_model_metrics = xgb_model_summary[2]
xgb_fpr = xgb_model_summary[3]
xgb_tpr = xgb_model_summary[4]
xgb_roc_score = xgb_model_summary[6]

print(
    "Model Accuracy Score: ", xgb_model_score,
    "\nModel Confusion List (TN, FP, FN, TP): ", xgb_model_confusion_list,
    "\nModel Summary: \n", xgb_model_metrics,
    "\nROC Score: ", xgb_roc_score,
)

In [None]:
# Pair every training feature name with the importance reported by the fitted XGBoost model.
xgb_import_df = pd.DataFrame({'FEATURE_NAME': train_features.columns
                              , 'FEATURE_IMPORTANCE': xgb_model_fit.feature_importances_})

# Thirty most important features, highest first; shown as the cell output.
xgb_top_features = xgb_import_df.sort_values(by = ['FEATURE_IMPORTANCE'], axis = 0, ascending = False).head(30)
xgb_import_cols = list(xgb_top_features['FEATURE_NAME'])
xgb_top_features

# Compare results of all models

In [None]:
print("Logistic Regression SMOTE Score: " + str(lr_model_score) + "\n"
      , "Random Forest SMOTE Score: " + str(rf_model_score) + "\n"
      , "XGBoost SMOTE Score: " + str(xgb_model_score) + "\n"
     )

In [None]:
print("Confusion Matrix: \nTrue Negative, False Positive, False Negative, True Positive \n\n"
      , "Logistic Regression SMOTE: " + str(lr_model_confusion_list) + "\n"
      , "Random Forest SMOTE: " + str(rf_model_confusion_list) + "\n"
      , "XGBoost SMOTE: " + str(xgb_model_confusion_list) + "\n"
     )

In [None]:
print("\nLogistic Regression SMOTE Model Metrics: \n", lr_model_metrics
      , "\nRandom Forest SMOTE Model Metrics: \n", rf_model_metrics
      , "\nXGBoost SMOTE Model Metrics: \n", xgb_model_metrics
     )

In [None]:
print("Logistic Regression SMOTE ROC Score: ", lr_roc_score
      , "\nRandom Forest SMOTE ROC Score: ", rf_roc_score
      , "\nXGBoost SMOTE ROC Score: ", xgb_roc_score
     )

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize = (20,18))

# One entry per model: false-positive rates, true-positive rates, line colour, legend text.
roc_curves = [
    (lr_fpr, lr_tpr, 'red', 'LR ROC Curve (area = %0.2f)' %lr_roc_score),
    (rf_fpr, rf_tpr, 'blue', 'RF ROC Curve (area = %0.2f)' %rf_roc_score),
    (xgb_fpr, xgb_tpr, 'green', 'XGBoost ROC Curve (area = %0.2f)' %xgb_roc_score),
]
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color = curve_color, lw = 2, label = curve_label)

# Dashed diagonal as the reference line.
plt.plot([0, 1], [0, 1], color = 'black', lw = 2, linestyle = '--')
plt.legend(loc = 'lower right')
plt.show()

# Serialize the data model for future use
#### Update the version number if you are retraining the model. Use versioning on serialized models for future comparisons of model effectiveness over time.

In [None]:
# Create the output folder if it is missing; joblib.dump does not create
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs('./Model', exist_ok = True)

# NOTE(review): the models are written as v2.0 while the encoder and scaler are
# still written as v1.0, so retraining overwrites the v1.0 encoder/scaler
# files -- confirm whether their version numbers should move with the models.
joblib.dump(lr_model_fit, './Model/dtr_logistic_regression_v2.0.pkl')
joblib.dump(rf_model_fit, './Model/dtr_random_forest_v2.0.pkl')
joblib.dump(xgb_model_fit, './Model/dtr_xgboost_v2.0.pkl')
joblib.dump(encoder, './Model/encoder_v1.0.pkl')
joblib.dump(scaler_fit, './Model/scaler_v1.0.pkl')

# Score Validation Dataset

In [None]:
query = """
         
        """

valid_df = pd.read_sql(query, cursor.connection)

In [None]:
valid_df.head()

In [None]:
valid_df_tr = valid_df.copy()
valid_df_tr = valid_df_tr.drop(['column'], axis = 1)

### Replace all missing values with 'Unknown' or 0 depending on the Data Type of column

In [None]:
#dtype_dict_value = replace_values(valid_df_tr)
valid_df_tr = valid_df_tr.fillna(value = dtype_dict_value)

In [None]:
# Object (string) columns of the raw training frame.  The name is assigned
# before it is displayed so the notebook runs top to bottom without a NameError.
string_col_list = list(df.select_dtypes(include = ['object']).columns)
string_col_list

In [None]:
string_col_list = list(df.select_dtypes(include = ['object']).columns)
string_col_list

### Convert all columns that are Factor Levels or Flag columns into Category data types

In [None]:
# Object (string) columns of the validation frame; these are passed through the
# encoder that was fitted on the training data.
string_col_list = list(valid_df_tr.select_dtypes(include = ['object']).columns)

# scikit-learn renamed get_feature_names to get_feature_names_out (the old name
# was removed in 1.2); use whichever one the installed encoder provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_col_names = encoder.get_feature_names_out(string_col_list)
else:
    encoded_col_names = encoder.get_feature_names(string_col_list)

# Carry the validation index onto the encoded frame so the index-based merge
# below lines rows up even if valid_df_tr does not have a default 0..n-1 index.
encode_df = pd.DataFrame(encoder.transform(valid_df_tr[string_col_list]).toarray()
                         , columns = encoded_col_names
                         , index = valid_df_tr.index)

# Built before the string columns are dropped, since the helper is given the un-encoded frame.
encode_col_dict = create_encode_col_dict(valid_df_tr, encoder)

# Swap the raw string columns for their encoded columns.
valid_df_tr = valid_df_tr.merge(encode_df, left_index = True, right_index = True)
valid_df_tr = valid_df_tr.drop(string_col_list,  axis = 1)

valid_df_tr = valid_df_tr.rename(columns = encode_col_dict)

valid_df_tr.head()

## Drop Highly Correlated Values

In [None]:
valid_df_tr = valid_df_tr.drop(dtr_unique_corr_cols, axis = 1)
valid_df_tr.shape

In [None]:
valid_df_tr.head()

### Standardize columns using Scaler Fit

In [None]:
# Set the label aside, scale the remaining columns with the scaler fitted during
# training, then re-attach the label by index.
valid_feature_df = valid_df_tr.drop(LABEL_VAL, axis = 1)
column_headers = valid_feature_df.columns
label = valid_df_tr[LABEL_VAL]

scaled_values = scaler_fit.transform(valid_feature_df)
valid_df_std = pd.DataFrame(scaled_values, columns = column_headers)
valid_df_std = pd.DataFrame(label).merge(valid_df_std, left_index = True, right_index = True)

In [None]:
dtr_xgb_col_order = list(over_samp_feat.columns)

%store dtr_xgb_col_order

xgb_features = valid_df_std.reindex(columns = dtr_xgb_col_order)

xgb_features.head()

##### Create probability and scored df

In [None]:
# Select the feature columns in the exact order the models were trained on.
# This keeps the logistic regression and random forest inputs consistent with
# the XGBoost input (xgb_features), and it also excludes the label and any
# PRED_* columns added by an earlier run of this cell.
features = valid_df_std.reindex(columns = list(over_samp_feat.columns))
valid_df_std['PRED_LABEL_LR'] = lr_model_fit.predict(features)
valid_df_std['PRED_LABEL_RF'] = rf_model_fit.predict(features)
valid_df_std['PRED_LABEL_XGB'] = xgb_model_fit.predict(xgb_features)

In [None]:
# Score each model once and reuse the result for both probability columns;
# calling predict_proba separately per column doubled the inference work.
lr_proba = lr_model_fit.predict_proba(features)
rf_proba = rf_model_fit.predict_proba(features)
xgb_proba = xgb_model_fit.predict_proba(xgb_features)

# Column 0 / column 1 of predict_proba are stored as the ZERO / ONE probabilities
# (assumes the fitted classes are ordered [0, 1] -- TODO confirm).
prob_df = pd.DataFrame({'LR_PROB_ZERO': lr_proba[:, 0]
                        , 'LR_PROB_ONE': lr_proba[:, 1]
                        , 'RF_PROB_ZERO': rf_proba[:, 0]
                        , 'RF_PROB_ONE': rf_proba[:, 1]
                        , 'XGB_PROB_ZERO': xgb_proba[:, 0]
                        , 'XGB_PROB_ONE': xgb_proba[:, 1]})

valid_df_std = valid_df_std.join(prob_df)
#valid_df_std.rename({0: 'PROB_ZERO', 1: 'PROB_ONE'}, axis = 1, inplace = True)
valid_df_std.head()

In [None]:
# Average the logistic regression and random forest probabilities.
# NOTE(review): the XGBoost probabilities are not part of this average --
# confirm whether that is intentional.
valid_df_std["AVG_PROB_ZERO"] = (valid_df_std["LR_PROB_ZERO"] + valid_df_std["RF_PROB_ZERO"])/2
valid_df_std["AVG_PROB_ONE"] = (valid_df_std["LR_PROB_ONE"] + valid_df_std["RF_PROB_ONE"])/2

# Actual label next to each model's prediction and probabilities.  The label
# column is referenced through LABEL_VAL, like the rest of the notebook, rather
# than a hard-coded column name.
valid_df_std[[LABEL_VAL
              , 'PRED_LABEL_LR'
              , 'LR_PROB_ZERO'
              , 'LR_PROB_ONE'
              , 'PRED_LABEL_RF'
              , 'RF_PROB_ZERO'
              , 'RF_PROB_ONE'
              , 'PRED_LABEL_XGB'
              , 'XGB_PROB_ZERO'
              , 'XGB_PROB_ONE'
              , 'AVG_PROB_ZERO'
              , 'AVG_PROB_ONE']]

# Deprecated Code

In [None]:
# Deprecated

#dtr_low_mean_col_list = low_mean_cols(df_tr, .001)
#print(dtr_low_mean_col_list)
#df_tr = df_tr.drop(dtr_low_mean_col_list, axis = 1)

#%store dtr_low_mean_col_list

#### Remove all potential outliers

In [None]:
# Deprecated

#outlier_idx_list = potential_outliers(df_tr)
#print(outlier_idx_list)

#df_tr = df_tr.drop(outlier_idx_list, axis = 0)

##### Only scaling the non flag columns

In [None]:
# Deprecated

# Example for how to pass specific values to be standardized

#column_headers = list(df_tr.columns)
#flag_col_list = [col for col in column_headers if '_FLG' in col]
#column_headers = [col for col in column_headers if col not in flag_col_list]

#df_std, scaler_fit = standardize_cols(df_tr, 'DATORAMA_FLG', column_headers)

# Run K-fold - V1 Code

In [None]:
#k_fold_rec, k_fold_prec, k_fold_f_score = run_k_fold(df_std, 'DATORAMA_FLG', lr_model, 2)
#print(k_fold_rec, k_fold_prec, k_fold_f_score)

##### Scale only on the non flag columns in DF

In [None]:
# Deprecated
#column_headers = list(valid_df_tr.columns)
#flag_col_list = [col for col in column_headers if '_FLG' in col]
#column_headers = [col for col in column_headers if col not in flag_col_list]

#valid_df_std, valid_scaler_fit = standardize_cols(valid_df_tr, 'DATORAMA_FLG', column_headers)