In [29]:
from snowflake.snowpark import Session
import configparser

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
%matplotlib inline
matplotlib.rcParams.update({'font.size': 20})

# Code to establish connection and read data from Snowflake

In [31]:
config = configparser.ConfigParser()
config.read("snowflake_connection.ini")

['snowflake_connection.ini']

In [32]:
connection_parameters = {
    "user": f'{config["Snowflake"]["user"]}',
    "password": f'{config["Snowflake"]["password"]}',
    #"password": os.getenv('snowflake_password'),
    "account": f'{config["Snowflake"]["account"]}',
    #"account": os.getenv('snowflake_account'),
    "WAREHOUSE": f'{config["Snowflake"]["WAREHOUSE"]}',
    "DATABASE": f'{config["Snowflake"]["DATABASE"]}',
    "SCHEMA": f'{config["Snowflake"]["SCHEMA"]}'
}

In [33]:
def snowflake_connector(conn):
    try:
        session = Session.builder.configs(conn).create()
        print("connection successful!")
    except:
        raise ValueError("error while connecting with db")
    return session

session = snowflake_connector(connection_parameters)

connection successful!


In [34]:
application_train_sf  = session.table("CRA_APPLICATION_TRAIN_DETAILS")
application_test_sf  = session.table("CRA_APPLICATION_TEST_DETAILS")
bureau_sf  = session.table("CRA_BUREAU_DETAILS")
bureau_balance_sf  = session.table("CRA_BUREAU_BALANCE_DETAILS")
credit_card_balance_sf  = session.table("CRA_CREDIT_CARD_BALANCE_DETAILS")
installments_payments_sf  = session.table("CRA_INSTALLMENTS_PAYMENTS_DETAILS")
previous_application_sf  = session.table("CRA_PREVIOUS_APPLICATION_DETAILS")
POS_CASH_balance_sf  = session.table("CRA_POS_CASH_BALANCE_DETAILS")

# Convert Snowflake data into Pandas dataframes

In [35]:
df = application_train_sf.to_pandas()
df_bureau = bureau_sf.to_pandas()
df_previous_app = previous_application_sf.to_pandas()
df_installments = installments_payments_sf.to_pandas()

In [36]:
key = 'SK_ID_CURR'
bureau_cols = ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT']
bureau_cols_max = ['BUREAU_MAX_' + c for c in bureau_cols]

df = pd.merge(
    left=df,
    right=df_bureau[[key] + bureau_cols].groupby(key).max().rename(
        columns=dict(zip(bureau_cols, bureau_cols_max))),
    left_on=key,
    right_index=True, 
    how='left'
)

# Example: sample of 3 loans
df[[key] + bureau_cols_max].sample(3)

Unnamed: 0,SK_ID_CURR,BUREAU_MAX_DAYS_CREDIT,BUREAU_MAX_DAYS_CREDIT_ENDDATE,BUREAU_MAX_DAYS_ENDDATE_FACT
111545,401114,,,
107554,396466,-58.0,1768.0,-56.0
49559,216583,-2081.0,-1529.0,-1533.0


In [37]:
key_prev = 'SK_ID_PREV'
payment_cols = ['AMT_PAYMENT']

# Min payment for all previous loans
df_previous_app = pd.merge(
    left=df_previous_app,
    right=df_installments[[key_prev] + payment_cols].groupby(key_prev).min(),
    left_on=key_prev,
    right_index=True,
    how='left'
)

# Example: SK_ID_CURR #365597
df_previous_app[[key] + [key_prev] + payment_cols][df_previous_app.SK_ID_CURR == 365597]

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,AMT_PAYMENT
1531913,365597,2027447,14459.94
1616475,365597,1459607,5489.73


In [38]:
df_previous_app.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY',
       'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
       'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY',
       'RATE_INTEREST_PRIVILEGED', 'NAME_CASH_LOAN_PURPOSE',
       'NAME_CONTRACT_STATUS', 'DAYS_DECISION', 'NAME_PAYMENT_TYPE',
       'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
       'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
       'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY',
       'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
       'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
       'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL',
       'CREATED_BY', 'CREATED_AT', 'AMT_PAYMENT'],
      dtype='object')

In [39]:
key = 'SK_ID_CURR'
prev_agg_cols = ['PREV_SUM_MIN_AMT_PAYMENT', 'PREV_MEAN_MIN_AMT_PAYMENT']

# Sum and mean of minimum payments across all previous loans
df_prev_agg = df_previous_app[[key] + payment_cols].groupby(key).agg(['sum', 'mean']);
df_prev_agg.columns = prev_agg_cols

df = pd.merge(
    left=df,
    right=df_prev_agg,
    left_on=key,
    right_index=True,
    how='left'
)

# Example: SK_ID_CURR #365597
df[[key] + prev_agg_cols][df.SK_ID_CURR == 365597]

Unnamed: 0,SK_ID_CURR,PREV_SUM_MIN_AMT_PAYMENT,PREV_MEAN_MIN_AMT_PAYMENT
211021,365597,19949.67,9974.835


In [40]:
base_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                'DAYS_BIRTH', 'AMT_CREDIT', 'AMT_ANNUITY',
                'DAYS_EMPLOYED', 'AMT_GOODS_PRICE', 'DAYS_ID_PUBLISH',
                'OWN_CAR_AGE'
               ]
feature_cols = base_cols + bureau_cols_max + prev_agg_cols
y = df.TARGET
X = df[feature_cols]
X = X.fillna(value=X.mean())

# Example: SK_ID_CURR #365597
X[df.SK_ID_CURR == 365597].transpose()

Unnamed: 0,211021
EXT_SOURCE_1,0.58934
EXT_SOURCE_2,0.507737
EXT_SOURCE_3,0.710674
DAYS_BIRTH,-10240.0
AMT_CREDIT,152820.0
AMT_ANNUITY,15241.5
DAYS_EMPLOYED,-543.0
AMT_GOODS_PRICE,135000.0
DAYS_ID_PUBLISH,-355.0
OWN_CAR_AGE,3.0


In [42]:
clf = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=0,
    verbose=0,
    warm_start=False,
    class_weight='balanced'
)

In [43]:
def plot_roc_curve(fprs, tprs):
    """Plot the Receiver Operating Characteristic from a list
    of true positive rates and false positive rates."""
    
    # Initialize useful lists + the plot axes.
    tprs_interp = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    f, ax = plt.subplots(figsize=(14,10))
    
    # Plot ROC for each K-Fold + compute AUC scores.
    for i, (fpr, tpr) in enumerate(zip(fprs, tprs)):
        tprs_interp.append(np.interp(mean_fpr, fpr, tpr))
        tprs_interp[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        ax.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        
    # Plot the luck line.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)
    
    # Plot the mean ROC.
    mean_tpr = np.mean(tprs_interp, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    
    # Plot the standard deviation around the mean ROC.
    std_tpr = np.std(tprs_interp, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    
    # Fine tune and show the plot.
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    plt.show()
    return (f, ax)

def compute_roc_auc(index):
    y_predict = clf.predict_proba(X.iloc[index])[:,1]
    fpr, tpr, thresholds = roc_curve(y.iloc[index], y_predict)
    auc_score = auc(fpr, tpr)
    return fpr, tpr, auc_score

In [48]:
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []
    
for (train, test), i in zip(cv.split(X, y), range(5)):
    clf.fit(X.loc[train], y.loc[train])
    _, _, auc_score_train = compute_roc_auc(train)
    fpr, tpr, auc_score = compute_roc_auc(test)
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.