# Logistic Regression Score Data
#### Note: If you are not retraining the model, you only need to execute the Score Notebook on the refreshed data. If you are retraining the model, ensure that the version number is updated before serializing the model to disk, so that models are versioned over time for comparisons.
#### If rescoring data, refresh the following tables prior to rescore: 
#### After rescoring data refresh the following tables prior to creating target list: 

### Load all required modules including Oracle connection and Data Processing Functions.

In [None]:
import os 
import cx_Oracle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import statsmodels.api as sm
import joblib
import datetime
import shap

from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold
from statistics import mean
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from itertools import zip_longest

# Import the project-local helper modules (db_connection, general_functions).
# They live outside the notebook directory, so this cell changes into their
# folders to import them and then returns to the directory it started in.

# Remember the notebook's working directory before moving anywhere.
# os.getcwd() is plain, portable Python (no `!cd` / `!pwd` shell magics) and
# guarantees `state` exists even when the OS is not recognised below.
# `state` stays a one-element list so `state[0]` keeps its original meaning.
state = [os.getcwd()]

try:
    # Update path to where function file resides
    if os.name == 'nt':
        # Load DB Connection File from Windows Machine
        os.chdir(r'directory name')
        from db_connection import oracle_connection

        # Load function file from Windows Machine
        os.chdir(r'directory name')
        from general_functions import *
    elif os.name == 'posix':
        # Load DB Connection File from Mac Machine
        os.chdir('directry name')
        from db_connection import oracle_connection

        # Load function file from Mac Machine
        os.chdir('directory name')
        from general_functions import *
    else:
        print('No OS!')
finally:
    # Change directory back to working Jupyter Notebook Directory after importing
    # connection module -- done in `finally` so a failed chdir/import does not
    # leave the notebook stranded in a helper folder.
    os.chdir(state[0])

# Date stamp for today's run, formatted as YYYYMMDD.
todays_date = f"{datetime.date.today():%Y%m%d}"

# Lift pandas' display limits so every column and every row is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Create DB Connection String

In [None]:
# Open the Oracle connection (`db`) and cursor (`cursor`) used by the rest of
# the notebook. Windows ('nt') and Mac/Linux ('posix') follow the same steps;
# the only difference is that the posix connection passes encoding = 'UTF-8'.
if os.name in ('nt', 'posix'):
    # Update path to where config file resides
    db_creds = os.path.expanduser('~') + 'directory name'
    creds = oracle_connection(db_creds)

    # Connection target in the form host:port/database
    url = creds['host'] + ":" + creds['port'] + "/" + creds['database']

    # Extra keyword arguments that apply on posix only.
    connect_options = {'encoding': 'UTF-8'} if os.name == 'posix' else {}
    db = cx_Oracle.connect(creds['user'], creds['password'], url, **connect_options)
    cursor = db.cursor()
else:
    print('No OS!')

# Score Accounts without Target product

In [None]:
# SQL text for the scoring extract (left blank here; fill in before running).
query = """
        
        """

# Run the query over the cursor's connection and load the result into a DataFrame.
score_df = pd.read_sql(sql = query, con = cursor.connection)

In [None]:
# Preview the first rows of the scoring extract.
score_df.head()

In [None]:
# (row count, column count) of the scoring extract.
score_df.shape

In [None]:
# Load the fitted models, the encoder and the scaler from the ./Model folder.
# NOTE(review): joblib.load unpickles these files, and unpickling can execute
# arbitrary code -- only load artifacts that come from a trusted location.
# NOTE(review): the model files are tagged v2.0 while the encoder and scaler
# are tagged v1.0 -- confirm these versions belong together.
(lr_model_fit,
 rf_model_fit,
 xgb_model_fit,
 encoder,
 scaler_fit) = (
    joblib.load(artifact_path)
    for artifact_path in (
        './Model/dtr_logistic_regression_v2.0.pkl',
        './Model/dtr_random_forest_v2.0.pkl',
        './Model/dtr_xgboost_v2.0.pkl',
        './Model/encoder_v1.0.pkl',
        './Model/scaler_v1.0.pkl',
    )
)

In [None]:
# Name of the label column.
LABEL_VAL = 'target column'

# Build the working frame without 'column'. drop() returns a new DataFrame,
# so score_df itself keeps the untouched extract.
score_df_tr = score_df.drop(columns = ['column'])

### Replace all missing values with 'Unknown' or 0 depending on the Data Type of column

In [None]:
# Ask the project helper for the fill values (presumably a column -> value
# mapping; string columns use 'Unknown' -- confirm in general_functions).
dtype_dict_value = replace_values(score_df_tr, char_value = 'Unknown')

# Fill the missing values and keep the filled frame under the same name.
score_df_tr = score_df_tr.fillna(value = dtype_dict_value)

### One-hot encode all string columns (Factor Levels or Flag columns) using the fitted encoder

In [None]:
# Names of the string (object dtype) columns that go through the fitted encoder.
string_col_list = list(score_df_tr.select_dtypes(include = ['object']).columns)

# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out. Prefer the new method and
# fall back to the old one so the cell runs on either side of that change.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_col_names = encoder.get_feature_names_out(string_col_list)
else:
    encoded_col_names = encoder.get_feature_names(string_col_list)

# Reuse score_df_tr's index so the index-on-index merge below pairs each
# encoded row with its source row even if that index is not the default 0..n-1.
encode_df = pd.DataFrame(encoder.transform(score_df_tr[string_col_list]).toarray()
                         , columns = encoded_col_names
                         , index = score_df_tr.index)

# Rename mapping for the encoded columns, built by the project helper.
encode_col_dict = create_encode_col_dict(score_df_tr, encoder)

# Attach the encoded columns, then remove the original string columns.
score_df_tr = score_df_tr.merge(encode_df, left_index = True, right_index = True)
score_df_tr = score_df_tr.drop(string_col_list,  axis = 1)

score_df_tr = score_df_tr.rename(columns = encode_col_dict)

score_df_tr.head()

## Drop Highly Correlated Columns

In [None]:
%store -r dtr_unique_corr_cols

score_df_tr = score_df_tr.drop(dtr_unique_corr_cols, axis = 1)

### Standardize columns using Scaler Fit

In [None]:
# Set the label aside; only the remaining columns go through the scaler.
label = score_df_tr[LABEL_VAL]
feature_frame = score_df_tr.drop(columns = LABEL_VAL)
column_headers = feature_frame.columns

# Apply the already-fitted scaler and put the column names back on the result.
scaled_values = scaler_fit.transform(feature_frame)
score_df_std = pd.DataFrame(scaled_values, columns = column_headers)

# Re-attach the label by index so it becomes the first column.
score_df_std = label.to_frame().merge(score_df_std, left_index = True, right_index = True)

score_df_std.head()

### Return column order for XGB Array

In [None]:
%store -r xgb_col_order

xgb_features = score_df_std.reindex(columns = xgb_col_order)

xgb_features.head()

# Create the final dataframe with scored labels and respective probabilities

In [None]:
# Feature matrix for the LR and RF models: every column except the label.
# (The XGB model uses xgb_features, which carries its own column order.)
features = score_df_std.drop([LABEL_VAL], axis = 1)

# Predicted labels from each model.
score_df_std['PRED_LABEL_LR'] = lr_model_fit.predict(features)
score_df_std['PRED_LABEL_RF'] = rf_model_fit.predict(features)
score_df_std['PRED_LABEL_XGB'] = xgb_model_fit.predict(xgb_features)

# Run predict_proba once per model and slice both columns from that single
# result, instead of scoring the full data set twice per model.
lr_proba = lr_model_fit.predict_proba(features)
rf_proba = rf_model_fit.predict_proba(features)
xgb_proba = xgb_model_fit.predict_proba(xgb_features)

# Column 0 / column 1 are labelled ZERO / ONE.
# NOTE(review): this assumes each model's classes_ are ordered [0, 1] -- confirm.
# The frame shares the feature index so the join below aligns row for row.
score_prob_df = pd.DataFrame({'LR_PROB_ZERO': lr_proba[:, 0]
                              , 'LR_PROB_ONE': lr_proba[:, 1]
                              , 'RF_PROB_ZERO': rf_proba[:, 0]
                              , 'RF_PROB_ONE': rf_proba[:, 1]
                              , 'XGB_PROB_ZERO': xgb_proba[:, 0]
                              , 'XGB_PROB_ONE': xgb_proba[:, 1]}
                             , index = features.index)

score_df_std = score_df_std.join(score_prob_df)

# Average of the LR and RF probabilities.
# NOTE(review): XGB is not part of this average -- confirm that is intended.
score_df_std['AVG_PROB_ZERO'] = (score_df_std['LR_PROB_ZERO'] + score_df_std['RF_PROB_ZERO'])/2
score_df_std['AVG_PROB_ONE'] = (score_df_std['LR_PROB_ONE'] + score_df_std['RF_PROB_ONE'])/2

# Preview the scored labels and probabilities for the first 100 rows.
score_df_std[['B2C_CMMRC_FLG'
              , 'PRED_LABEL_LR', 'LR_PROB_ZERO', 'LR_PROB_ONE'
              , 'PRED_LABEL_RF', 'RF_PROB_ZERO', 'RF_PROB_ONE'
              , 'AVG_PROB_ZERO', 'AVG_PROB_ONE', 'PRED_LABEL_XGB'
              , 'XGB_PROB_ZERO', 'XGB_PROB_ONE']].head(100)

#### Return list of column names from Random Forest Feature Importance

In [None]:
rf_import_df = pd.DataFrame()
rf_import_df['FEATURE_NAME'], rf_import_df['FEATURE_IMPORTANCE'] = features.columns, rf_model_fit.feature_importances_
rf_import_cols = list(rf_import_df.sort_values(by = ['FEATURE_IMPORTANCE'], axis = 0, ascending = False).head(20)['FEATURE_NAME'])
rf_import_df.sort_values(by = ['FEATURE_IMPORTANCE'], axis = 0, ascending = False).head(20)

%store -r dtr_lr_import_feats

rf_import_cols.extend(dtr_lr_import_feats)

import_cols = list(set(rf_import_cols))

exclude_list = ['COLUMN NAME']
import_cols = [col for col in import_cols if col not in exclude_list]

In [None]:
# Print how many rows each model assigned to each predicted label. Every
# count table is followed by a newline string, all passed to a single print.
pred_label_cols = ('PRED_LABEL_LR', 'PRED_LABEL_RF', 'PRED_LABEL_XGB')
print(*(piece
        for pred_col in pred_label_cols
        for piece in (score_df_std[pred_col].value_counts(), "\n")))

##### Generate Final Dataframe with Summary SHAP value output column that has friendly names

In [None]:
# Query for the database-column -> friendly-name pairs.
friendly_col_names_sql = """
                            SELECT
                                DB_COLUMN_NM
                                , FRIENDLY_NM
                            FROM MHUFFER.LM_FRIENDLY_NAMES
"""

friendly_col_df = pd.read_sql(sql = friendly_col_names_sql, con = cursor.connection)

# Build the lookup dict: DB_COLUMN_NM value -> FRIENDLY_NM value.
friendly_column_name = dict(zip(friendly_col_df['DB_COLUMN_NM'], friendly_col_df['FRIENDLY_NM']))

In [None]:
# Build the per-row SHAP summary with the project helper shap_summary, using
# the XGB model, its feature frame, the friendly-name lookup and the scaler.
output_summary = shap_summary(
    xgb_model_fit,
    xgb_features,
    friendly_column_dict = friendly_column_name,
    standardized_df = True,
    scaler_fit = scaler_fit,
)

# Store the summary alongside the scores.
score_df_std['SUMMARY_OUTPUT'] = output_summary

In [None]:
# Join the raw extract with the scored frame on the row index.
score_final_df = score_df.merge(score_df_std, left_index = True, right_index = True)
check_cols = score_final_df.columns

# Columns that make up the final output.
keep_cols = [
    'ACCT_ID',
    'PRED_LABEL_LR', 'LR_PROB_ZERO', 'LR_PROB_ONE',
    'PRED_LABEL_RF', 'RF_PROB_ZERO', 'RF_PROB_ONE',
    'AVG_PROB_ZERO', 'AVG_PROB_ONE',
    'PRED_LABEL_XGB', 'XGB_PROB_ZERO', 'XGB_PROB_ONE',
    'SUMMARY_OUTPUT',
]

# Keep only those columns. The surviving columns stay in the merged frame's
# own order (not the order of keep_cols), which matters for any positional
# use of the rows later on.
score_final_df = score_final_df.loc[:, score_final_df.columns.isin(keep_cols)].copy()

score_final_df.head()

## Write scored data back to Oracle Database

In [None]:
# SQL text for the drop-table step (statement body is blank here; fill in
# before running).
drop_table_sql = """
"""

# Execute the statement on the open Oracle cursor.
cursor.execute(drop_table_sql)

In [None]:
# SQL text for the create-table step.
# NOTE(review): only a closing parenthesis remains in this statement -- the
# full CREATE TABLE text must be supplied before running.
create_table_sql = """
    )
"""

# Execute the statement on the open Oracle cursor.
cursor.execute(create_table_sql)

In [None]:
# One tuple per scored row, with values in score_final_df's column order.
records = list(map(tuple, score_final_df.to_numpy()))

# Send all rows in one executemany call (statement text is blank here; fill in
# before running), then commit the transaction.
cursor.executemany('''''', records)
db.commit()