In [1]:
# path variables
import sys
project_path = '/Users/naresh/Downloads/DS/growth/nsl_v2/nsl_v2_final/'
sys.path.insert(0, project_path+'config')
from config import SQLQuery

# core libraries
import pickle
import warnings
warnings.filterwarnings("ignore")
import datetime
import pandas as pd
import numpy as np
from datetime import date

from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
from model_evaluations import model_metrics, cross_validation
from model_building import tune_hyperparameters

from sklearn.metrics import roc_curve, precision_recall_curve, precision_score, recall_score, roc_auc_score
from matplotlib import pyplot

In [2]:
%load_ext autoreload
%autoreload 2
from stability_monitoring import *

#### Load data

In [3]:
# North-star customer definition thresholds.
# NOTE(review): presumably a 90-day transaction window and a credit-amount cutoff;
# neither value is referenced by the visible cells — confirm where they are consumed.
txn_days, txn_credit_amount = 90, 15000

# from_date = date(2022,10,1)
# to_date = date(2023,2,28)

In [4]:
# Query helper from the project-local `config` module, created for the 'snowflake' connection.
# In the visible cells it is only called from commented-out queries.
q = SQLQuery('snowflake')

In [5]:
# # query to fetch the required data
# df_raw_app = q("""with 


# ALLOY_PERSONS as -- get all the alloy persons data (max of application_versions_id in case of duplicates)
# (
#   select * from (
# select * ,
#        row_number() over (partition by APPLICATION_ID order by APPLICATION_VERSION_ID desc) as rank
# from "PROD_DB"."DATA"."ALLOY_EVALUATIONS_PERSONS") 
# where rank=1
# ),

# APPLICATIONS as -- get all entries from applications table
# (
#   select *
#   from "PROD_DB"."DATA"."APPLICATIONS"
#   where application_id is not null
#   and date(APPLICATION_COMPLETE_DATETIME) <= '2020-12-31'
# )

#     select distinct
    
#     a.application_id,
#     a.estimated_monthly_revenue, a.incoming_ach_payments, a.outgoing_ach_and_checks,
#     a.check_deposit_amount, a.outgoing_wire_transfers, a.incoming_wire_transfer,
#     a.business_type, a.email_domain, a.current_bank, a.industry_category_name,
#     b.iovation_device_type, b.iovation_device_timezone, b.carrier, b.socure_sigma, b.socure_phonerisk,
#     b.socure_emailrisk, b.socure_reason_code, b.socure_phonerisk_reason_code, b.socure_emailrisk_reason_code
    
#     from APPLICATIONS a
#     left join ALLOY_PERSONS b
#     on a.application_id = b.application_id
#     """)

# df_raw_app.shape

In [6]:
# # drop null application ids
# df_raw_app = df_raw_app.dropna(subset='application_id')

In [7]:
# # drop duplicate columns
# df_raw_app = df_raw_app.loc[:,~df_raw_app.columns.duplicated()].copy()
# df_raw_app.shape

In [8]:
# raw data
# Cached application-level extract; the commented lines below are how the cache was written.
# NOTE: `file` and `path` are reused (overwritten) by the segment-data loading cell further down.
file = 'all_apps_till_2020_12_31.pkl'
path = project_path + 'data/'
# df_raw_app.reset_index(inplace=True)
# df_raw_app.to_pickle(path+file)


# NOTE: read_pickle can execute code embedded in the file — only load trusted artefacts.
df_raw_app = pd.read_pickle(path+file)

In [9]:
# segment_raw_data = q(
# """
# with 
# APPLICATIONS as -- get all entries from applications table
# (
#   select *
#   from "PROD_DB"."DATA"."APPLICATIONS"
#   where application_id is not null
#   and date(APPLICATION_COMPLETE_DATETIME) <= '2020-12-31'
# )

# ,segment_all as (
# select b.application_id, a.USER_ID, a.anonymous_id, a.CONTEXT_IP, a.OWNER_ID, context_page_path, screen_width, screen_height, timezone, sent_at, received_at
# from APPLICATIONS b
# left join SEGMENT_DB.ONBOARDING_PROD.PAGES a
# on a.application_id=b.application_id
# order by a.application_id, received_at asc
# )

# -- Pull all the records which crossed the 13th question  
# ,segment_till_incoming as (select a.application_id, a.context_page_path, a.received_at from 
# (select a.application_id, a.context_page_path, a.received_at, rank() over(partition by a.application_id order by a.received_at asc) as rk
# from segment_all a
# where a.context_page_path='/app/business-questions/incoming'
# ) a where rk=1 )

# -- Pull customer visited pages till the 13th question
# ,final as (select
# a.application_id, a.USER_ID, a.anonymous_id, a.CONTEXT_IP, a.OWNER_ID, a.context_page_path, a.screen_width, a.screen_height, a.timezone, a.sent_at, a.received_at 
# from segment_all a
# inner join segment_till_incoming b
# on a.application_id=b.application_id and a.received_at <= b.received_at
# order by a.application_id, a.received_at asc
# )

# select * from final order by application_id, received_at asc

# """
# )


In [10]:
def segment_features_1(df:pd.DataFrame, cols_list:list, app_level_df:pd.DataFrame, training:bool=True):
    """Add per-application distinct-value counts and screen-size statistics.

    Args:
        df: segment raw data; must contain 'application_id', every column in
            ``cols_list``, 'screen_height' and 'screen_width'.
        cols_list: columns for which the number of distinct values per
            application is computed (output column ``<col>_count``).
        app_level_df: application level data with an 'application_id' column;
            every new feature is left-joined onto it.
        training: when True, the median of each new feature is computed and
            pickled under ``project_path + 'models/'``; when False the stored
            tables are only read back (their values are not used here).

    Returns:
        ``app_level_df`` with ``<col>_count`` for every col in ``cols_list`` and
        ``screen_{width,height}_{max,min,mean,median}`` appended. Nothing is
        imputed in this function; applications without segment rows keep NaN.
    """

    def _persist_medians(frame, feature_cols, file_name):
        # One (feature, impute_value) row per feature, in that column order.
        target = project_path + 'models/' + file_name
        if training:
            medians = frame[feature_cols].median()
            medians.rename_axis('feature').reset_index(name='impute_value').to_pickle(target)
        else:
            pd.read_pickle(target)  # Load the impute values (fails early if the artefact is missing)

    # Distinct values of each column per application
    count_cols = []
    for col in cols_list:
        count_col = col + '_count'
        distinct_pairs = df[['application_id', col]].drop_duplicates(subset=['application_id', col], keep='first')
        # groupby().size() yields the same counts as value_counts() but its column
        # names do not depend on the pandas version (value_counts().reset_index()
        # changed from index/application_id to application_id/count in pandas 2.0).
        counts = distinct_pairs.groupby('application_id').size().rename(count_col).reset_index()

        # Merging with app level df
        app_level_df = pd.merge(app_level_df, counts, on='application_id', how='left')
        count_cols.append(count_col)

    _persist_medians(app_level_df, count_cols, 'df_impute_segment.pkl')

    # screen height and width features, computed over distinct (height, width) pairs per application
    screen_dims = df[['application_id', 'screen_height', 'screen_width']].drop_duplicates(
        subset=['application_id', 'screen_height', 'screen_width'], keep='first')

    stat_cols = []
    for value_type in ['max', 'min', 'mean', 'median']:
        # width first, then height, to keep the original output column order
        for dim in ('screen_width', 'screen_height'):
            stat_col = dim + '_' + value_type
            stat = (
                screen_dims.groupby('application_id')[dim]
                .agg(value_type)
                .astype('float')
                .rename(stat_col)
                .reset_index()
            )
            app_level_df = pd.merge(app_level_df, stat, on='application_id', how='left')
            stat_cols.append(stat_col)

    _persist_medians(app_level_df, stat_cols, 'df_impute_segment2.pkl')

    return app_level_df


In [11]:
def segment_features_2(df:pd.DataFrame, app_level_df:pd.DataFrame, training:bool=True):
    """Add per-application page-visit counts and a distinct-page count.

    Args:
        df: segment raw data with 'application_id' and 'context_page_path'.
            A 'context_page_path_clean' column is written onto it in place.
        app_level_df: application level data with an 'application_id' column.
        training: when True, medians of the new features are pickled to
            ``project_path + 'models/df_impute_segment3.pkl'``; when False the
            stored table is only read back (its values are not used here).

    Returns:
        ``app_level_df`` left-joined with one visit-count column per common page
        and 'page_count' (number of distinct pages visited, ignoring the
        excluded pages). Applications without segment rows keep NaN.
    """
    strip_chars = """'|[| |"|,|]"""

    # Normalise a page path: strip quoting/bracket characters from each '/'-separated
    # segment and drop empty segments and segments of 36+ characters
    # (presumably embedded ids — TODO confirm).
    def clean_string(x):
        # Missing paths stay missing so they do not crash the join below or
        # become a fake page called 'nan' / '<NA>'.
        if x is None or x is pd.NA or (isinstance(x, float) and x != x):
            return None
        segments = (val.strip(strip_chars) for val in str(x).split('/'))
        return '/'.join(seg for seg in segments if seg != '' and len(seg) < 36)

    df['context_page_path_clean'] = df['context_page_path'].apply(clean_string)
    # One-hot encoding the unique page paths
    col = 'context_page_path_clean'
    page_dummies = df[col].str.get_dummies()

    # Common pages that every applicant must visit
    common_pages = ['verify-email-otp','welcome','app/applicant/personal-info','app/applicant/phone'
    ,'app/applicant/otp-verify','app/applicant/address','app/applicant/dob-ssn','app/business/business-type'
    ,'app/business/address','app/business/other-info','app/business-questions/about-business'
    ,'app/business-questions/incoming']

    # A batch in which nobody visited one of the common pages has no dummy column
    # for it; add it as all-zero instead of failing with a KeyError below.
    for page in common_pages:
        if page not in page_dummies.columns:
            page_dummies[page] = 0
    page_dummies['application_id'] = df.application_id

    # no.of visits per page (all pages, one row per application)
    visits = page_dummies.groupby(['application_id']).sum()
    result = pd.merge(app_level_df, visits[common_pages].reset_index(), on='application_id', how='left')

    # no.of unique pages per user, ignoring pages that should not be counted
    excluded_pages = {'404','app/applicant','app/business','application-denied','forgot-password','signup','status','undefined'}
    counted_pages = [page for page in visits.columns if page not in excluded_pages]
    # Computed on the application_id-indexed frame, so no positional alignment is involved.
    page_count = (visits[counted_pages] >= 1).sum(axis=1).rename('page_count').reset_index()
    result = pd.merge(result, page_count, on='application_id', how='left')

    target = project_path+'models/df_impute_segment3.pkl'
    if training:
        medians = result[common_pages+['page_count']].median()
        # Save the impute values as a (feature, impute_value) table
        medians.rename_axis('feature').reset_index(name='impute_value').to_pickle(target)
    else:
        pd.read_pickle(target) # Load the impute values (fails early if the artefact is missing)

    return result


In [12]:
def segment_features_3(app_level_df:pd.DataFrame):
    """Impute the segment features with the stored medians and derive ratios.

    Reads the three impute tables pickled under ``project_path + 'models/'``
    (df_impute_segment.pkl, df_impute_segment2.pkl, df_impute_segment3.pkl).

        app_level_df: application level data carrying the columns listed in
            those tables; it is modified in place and also returned.

    Columns added: 'sh_sw_ratio_count' and 'sh_sw_ratio_{max,min,mean,median}'.
    The count columns of the first table are turned into 1/0 flags (1 when the
    count equals exactly 1).
    """
    count_medians = pd.read_pickle(project_path+'models/df_impute_segment.pkl')
    screen_medians = pd.read_pickle(project_path+'models/df_impute_segment2.pkl')
    page_medians = pd.read_pickle(project_path+'models/df_impute_segment3.pkl')

    # Apply `transform(column, impute_value)` to every column of `frame` that the
    # impute table lists; each listed feature is handled at most once.
    def _apply_impute(table, frame, transform):
        lookup = dict(table.values)
        remaining = table['feature'].to_list()
        for name in frame.columns.to_list():
            if name not in remaining:
                continue
            frame[name] = transform(frame[name], lookup[name])
            remaining.remove(name)
        return frame

    nulls_to_median = lambda column, value: column.fillna(value)
    zeros_to_median = lambda column, value: np.where(column==0, value, column)

    ############# PART-1 #############
    # Distinct-value counts: fill nulls, keep the height/width count ratio, then binarise
    app_level_df = _apply_impute(count_medians, app_level_df, nulls_to_median)
    count_ratio = app_level_df['screen_height_count']/app_level_df['screen_width_count']
    app_level_df['sh_sw_ratio_count'] = count_ratio.astype('float')

    for name in count_medians.feature.to_list():
        app_level_df[name] = np.where(app_level_df[name]==1,1,0)

    ############# PART-2 #############
    # Screen statistics: nulls first, then zero sizes, both replaced by the median
    app_level_df = _apply_impute(screen_medians, app_level_df, nulls_to_median)
    app_level_df = _apply_impute(screen_medians, app_level_df, zeros_to_median)

    # Height / width ratio for every statistic
    for stat in ('max','min','mean','median'):
        height = app_level_df['screen_height'+'_'+stat]
        width = app_level_df['screen_width'+'_'+stat]
        app_level_df['sh_sw_ratio_'+stat] = height/width

    ############# PART-3 #############
    # Page-visit features: fill nulls with the median
    app_level_df = _apply_impute(page_medians, app_level_df, nulls_to_median)

    return app_level_df

In [14]:
# segment_pages_oot = segment_raw_data.copy()
# # Filter only the existing applications in segment data
# segment_pages_oot = segment_pages_oot[~segment_pages_oot.received_at.isnull()]
# segment_pages_oot.reset_index(drop=True, inplace=True)
# segment_pages_oot = segment_pages_oot.sort_values(by=['application_id','received_at'])

# # Change data type
# cols = ['application_id','user_id','owner_id','anonymous_id','context_page_path','timezone']
# segment_pages_oot[cols] = segment_pages_oot[cols].astype('string')
# segment_pages_oot[['screen_width','screen_height']] = segment_pages_oot[['screen_width','screen_height']].astype('int')

# for col in segment_pages_oot.columns:
#     if col != 'user_id':
#         idx = segment_pages_oot.index[segment_pages_oot[col].isnull()].tolist()
#         idx.extend(segment_pages_oot.index[segment_pages_oot[col].isna()].tolist())
#         idx.extend(segment_pages_oot.index[segment_pages_oot[col] == ''].tolist())
#         idx.extend(segment_pages_oot.index[segment_pages_oot[col] == '[]'].tolist())
#         idx = list(set(idx))
#         segment_pages_oot.loc[idx, col] = None    


# df_oot = df_raw_app.copy()

# df_oot_tmp = pd.merge(segment_pages_oot, df_oot[['application_id']], on='application_id', how='inner')
# x_oot = df_oot_tmp.reset_index(drop=True)

# cols_list = ['timezone','user_id','owner_id','anonymous_id','context_ip','screen_width','screen_height']
# # Creating df with all apps
# app_level_data = pd.DataFrame()
# app_level_data['application_id'] = x_oot.application_id.unique()

# app_level_data = segment_features_1(df=x_oot, cols_list=cols_list, app_level_df=app_level_data, training=False)
# app_level_data = segment_features_2(df=x_oot, app_level_df=app_level_data, training=False)

# df_oot = pd.merge(df_oot[['application_id']], app_level_data, on='application_id', how='left')
# x_oot = df_oot.reset_index(drop=True)
# app_level_data_oot = segment_features_3(app_level_df=x_oot)


In [15]:
# app_level_data.shape[0], segment_raw_data.application_id.nunique()

In [16]:
# save the oot dataset
file = 'all_apps_segment_till_2020_12_31.pkl'
path = project_path + 'data/'
# app_level_data_oot.reset_index(inplace=True)
# app_level_data_oot.to_pickle(path+file)

app_level_data_oot = pd.read_pickle(path+file)

#### Feature creation

In [17]:
def convert_nulls_to_one_format(df:pd.DataFrame):
    """Normalise every "empty" cell of ``df`` to a single missing-value marker.

    A cell counts as empty when it is null/NaN, the empty string '' or the
    literal string '[]'. Such cells are overwritten with None.

    The frame is modified in place and also returned. Rows are selected with a
    positional boolean mask, so a non-unique index cannot cause non-empty rows
    that merely share an index label with an empty row to be blanked.
    """
    for col in df.columns:
        values = df[col]
        empty = values.isna() | (values == '') | (values == '[]')
        empty = empty.to_numpy(dtype=bool)
        # Skip untouched columns so their dtype is left exactly as it was.
        if empty.any():
            df.loc[empty, col] = None
    return df


def fill_null_values(df_impute:pd.DataFrame, data_df:pd.DataFrame,):
    """Fill nulls in ``data_df`` with the per-feature values from ``df_impute``.

    ``df_impute`` is read as (feature, value) pairs row by row, and its
    'feature' column names the columns to fill. Columns of ``data_df`` that are
    not listed are left alone. ``data_df`` is modified in place and returned.
    """
    fill_lookup = dict(df_impute.values)
    pending = df_impute['feature'].to_list()
    for name in data_df.columns.to_list():
        if name not in pending:
            continue
        data_df[name] = data_df[name].fillna(fill_lookup[name])
        # each listed feature is filled once only
        pending.remove(name)
    return data_df

In [18]:
def feature_engineering_alloy(df:pd.DataFrame, training:bool=True):
    """Bucket the Alloy categorical columns and one-hot encode Socure reason codes.

    Args:
        df: frame with 'iovation_device_type', 'iovation_device_timezone',
            'carrier' and the three socure ``*_reason_code`` columns. The three
            categorical columns are overwritten in place on ``df``.
        training: accepted for interface compatibility; not used by this function.

    Returns:
        A new frame: ``df`` plus one 0/1 column for every name in the 'feature'
        column of ``models/socure_reason_codes_columns.pkl`` (0 when the code
        does not occur in this batch). Rows keep the index of ``df``.
    """
    # feature 1
    col = 'iovation_device_type'
    df[col] = df[col].str.lower()
    df[col] = np.where(df[col].isin(['windows','iphone','mac','android']),df[col],'other')

    # feature 3
    df['iovation_device_timezone'] = np.where(df['iovation_device_timezone'].isin(['300','360','480']),
                                                   df['iovation_device_timezone'], 'other')
    # feature 6
    # na=False: without it a missing carrier yields NaN from the string test, NaN is
    # truthy inside np.where, and the row would be labelled 'verizon'. Missing
    # carriers now fall through to 'other'.
    col = 'carrier'
    df[col] = df[col].str.lower()
    df[col] = np.where(df[col].str.contains('verizon', na=False),'verizon',df[col])
    df[col] = np.where(df[col].str.contains('at&t', na=False),'att',df[col])
    df[col] = np.where(df[col].str.match('att', na=False),'att',df[col])
    # NOTE(review): regex — the trailing '*' makes the final 'e' optional; pattern kept as is.
    df[col] = np.where(df[col].str.contains('t-mobile*', na=False),'tmobile',df[col])
    df[col] = np.where(df[col].isin(['att','tmobile','verizon']),df[col],'other')

    # Socure reason codes
    # Cleaning the list of reason codes: one code per line, stripped of quoting/bracket characters
    strip_chars = """'|[| |"|,|]"""
    def clean_string(x):
        if not isinstance(x, str):
            return []
        return [val.strip(strip_chars) for val in x.split('\n') if val.strip(strip_chars) != '']

    socure_cols = ['socure_phonerisk_reason_code','socure_emailrisk_reason_code','socure_reason_code']
    # Work on a copy so the raw reason-code columns of `df` are not modified.
    df_socure = df[socure_cols].copy()

    # Collecting the cleaned reason codes
    for col in socure_cols:
        df_socure[col] = df_socure[col].astype('str').str.lower().apply(clean_string)

    # Replace each raw column by one indicator column per code, named '<raw column>_<code>'.
    # `dict_cols` keeps the FIRST name given to a code: a code already seen under an
    # earlier raw column is renamed to that earlier name again, which creates a
    # duplicate column that is dropped after the loop (first occurrence kept).
    # NOTE(review): this reproduces the existing behaviour, which the stored feature
    # list was presumably built with — confirm before changing.
    dict_cols = {}
    for col in socure_cols:
        code_dummies = df_socure[col].str.join('|').str.get_dummies()
        df_socure = df_socure.drop(columns=col).join(code_dummies)
        named_cols = df_socure.columns[df_socure.columns.str.startswith('socure')].to_list()
        true_cols = list(set(df_socure.columns.to_list()) - set(named_cols))
        new_names = {code: col+'_'+code for code in true_cols}
        dict_cols = {**new_names, **dict_cols}
        df_socure = df_socure.rename(columns=dict_cols)

    df_socure = df_socure.loc[:, ~df_socure.columns.duplicated(keep='first')]
    df_socure = df_socure.astype('int')

    socure_reason_codes_columns = pd.read_pickle(project_path+'models/socure_reason_codes_columns.pkl')
    # Align to the stored feature list on df's own index (unknown codes dropped,
    # absent codes filled with 0) so rows stay matched even when the index is not 0..n-1.
    df_tmp = df_socure.reindex(columns=socure_reason_codes_columns['feature'].to_list(), fill_value=0)
    df_tmp = df_tmp.astype('int')
    df = pd.concat([df,df_tmp], axis=1)

    return df


In [19]:
def feature_engineering_app(df:pd.DataFrame):
    """Derive the binary application-form features.

    Adds 'business_group', 'email_domain_bucket' and 'current_bank_group' and
    overwrites the six estimated-volume columns with 1/0 flags. ``df`` is
    modified in place and returned. Reads the email-domain list from
    ``project_path + 'models/email_domain_group.pkl'``.
    """
    # business type: 0 for sole proprietorships, 1 for everything else
    is_sole_prop = df['business_type'] == 'sole_proprietorship'
    df['business_group'] = np.where(is_sole_prop, 0, 1)

    # email domain: 0 when the domain is in the stored list, 1 otherwise
    listed_domains = pd.read_pickle(project_path+'models/email_domain_group.pkl')['email_domain'].to_list()
    in_listed_domains = df['email_domain'].isin(listed_domains)
    df['email_domain_bucket'] = np.where(in_listed_domains, 0, 1)

    # estimated business numbers: 1 for the two answers below, 0 for any other answer
    flagged_answers = ['$5k +', '$50k +']
    for name in ('estimated_monthly_revenue',
                 'incoming_ach_payments',
                 'check_deposit_amount',
                 'incoming_wire_transfer',
                 'outgoing_ach_and_checks',
                 'outgoing_wire_transfers'):
        answer = df[name].str.lower()
        df[name] = np.where(answer.isin(flagged_answers), 1, 0)

    # current bank: 1 when the bank is one of the listed values
    # NOTE(review): 'td-ank' looks like a typo for 'td-bank' — confirm against the data
    # the model was trained on before changing it.
    listed_banks = ['bluevine', 'other-national-bank', 'td-ank', 'chase', 'usaa']
    uses_listed_bank = df['current_bank'].isin(listed_banks)
    df['current_bank_group'] = np.where(uses_listed_bank, 1, 0)

    return df
    

#### Filter for raw features

In [20]:
# Application-form columns
raw_features_app = (
    ['email_domain']
    # estimated business numbers
    + [
        'estimated_monthly_revenue',
        'incoming_ach_payments',
        'check_deposit_amount',
        'incoming_wire_transfer',
        'outgoing_ach_and_checks',
        'outgoing_wire_transfers',
    ]
    + ['current_bank', 'industry_category_name', 'business_type']
)

# Alloy Features
num_cols = [                      # Numerical
    'socure_emailrisk',
    'socure_phonerisk',
    'socure_sigma',
]
cat_columns = [                   # Categorical
    'iovation_device_type',
    'iovation_device_timezone',
    'carrier',
]
socure_reason_cols = [            # Reason codes - Socure
    'socure_reason_code',
    'socure_emailrisk_reason_code',
    'socure_phonerisk_reason_code',
]

# Segment features
raw_features_segment = [
    'screen_width_mean',
    'sh_sw_ratio_mean',
]

In [21]:
# All Alloy inputs, then the full raw column list (application + Alloy + segment, in that order)
raw_features_alloy = [*num_cols, *cat_columns, *socure_reason_cols]
raw_features = [*raw_features_app, *raw_features_alloy, *raw_features_segment]

#### Independent variables

In [22]:
# Application-form model inputs: the columns written by feature_engineering_app plus the
# raw 'industry_category_name', which is one-hot encoded in a later cell.
independent_features_app = ['estimated_monthly_revenue', 
                            'incoming_ach_payments', 
                            'check_deposit_amount', 
                            'incoming_wire_transfer',
                            'outgoing_ach_and_checks', 
                            'outgoing_wire_transfers',
                            'business_group', 
                            'email_domain_bucket', 
                            'industry_category_name',
                            'current_bank_group'
                           ]

# Alloy Features
# Names of the reason-code indicator columns, read from the 'feature' column of the stored table
# (the same file feature_engineering_alloy aligns its output to).
reason_codes_cols = pd.read_pickle(project_path+'models/socure_reason_codes_columns.pkl'
                                            )['feature'].to_list()
# Final alloy columns
# "_tmp" because the categorical columns in this list are still un-encoded at this point.
independent_features_alloy_tmp = num_cols + cat_columns + reason_codes_cols


### OOT Data

In [31]:
# Work on a copy so the cached raw frame is left untouched.
oot_df = df_raw_app.copy()

# Adding segment test data
# Inner join: applications without a row in the segment features are dropped here.
segment_oot = app_level_data_oot.copy()
oot_df = pd.merge(oot_df, segment_oot, on='application_id', how='inner')
# Explicit copy: later cells assign columns into x_oot, and writing into a
# column-slice of oot_df is chained assignment (SettingWithCopyWarning).
x_oot = oot_df[raw_features].copy()

In [32]:
# convert all string features to lowercase
# Covers the free-text application columns plus the Alloy categorical and reason-code columns,
# so the exact-match lookups in the feature-engineering functions see lowercase values.
string_features = ['email_domain',
 'current_bank',
 'industry_category_name',
 'business_type']+cat_columns+socure_reason_cols

# NOTE: overwrites the columns of x_oot in place.
for col in string_features:
    x_oot[col] = x_oot[col].str.lower()

In [33]:
x_oot.shape

(72026, 21)

In [34]:
# Preprocessing pipeline for the scoring frame.
# NOTE: this cell is not idempotent — it drops and replaces columns of x_oot, so re-running it
# without re-running the cells that build x_oot fails on the missing columns.

# 1. Normalise empty markers ('' / '[]' / NaN) to None, then fill nulls from the stored impute table.
x_oot = convert_nulls_to_one_format(df=x_oot)
df_impute = pd.read_pickle(project_path+'models/df_impute.pkl')
x_oot = fill_null_values(df_impute, x_oot)

# 2. Derived application-form and Alloy features (training=False: scoring run).
x_oot = feature_engineering_app(df=x_oot)
x_oot = feature_engineering_alloy(df=x_oot, training=False)

# Removing reason code features as these are already encoded
x_oot.drop(columns=socure_reason_cols, inplace=True)

# Encoding the categories
# Object-typed Alloy columns plus the industry name are one-hot encoded.
x_object_cols = x_oot[independent_features_alloy_tmp].select_dtypes(include=['object']).columns.to_list()
x_object_cols = x_object_cols+['industry_category_name']

x_object_onehot = pd.get_dummies(x_oot[x_object_cols]) # create dummies
x_object_onehot = x_object_onehot.astype('int')
# Swap the raw categorical columns for their dummy columns.
x_oot = pd.concat([x_oot.drop(columns=x_object_cols), x_object_onehot], axis=1)
x_oot.columns= x_oot.columns.str.lower() # convert column names to lower case

In [35]:
# Filter the final independent features
# independent_features_alloy = num_cols + x_object_onehot.columns.to_list() + reason_codes_cols
# independent_features = independent_features_app + independent_features_alloy + raw_features_segment

# x_oot = x_oot[independent_features]

# train_cols = pd.read_pickle(project_path+'models/train_data_columns.pkl')['feature'].to_list()
# Columns the model is scored on, in the order used below.
train_cols = [
 'estimated_monthly_revenue',
 'incoming_ach_payments',
 'sh_sw_ratio_mean',
 'screen_width_mean',
 'industry_category_name_professional, scientific, and technical services',
 'business_group',
 'outgoing_ach_and_checks',
 'socure_sigma',
 'iovation_device_type_mac',
 'industry_category_name_real estate rental and leasing',
 'socure_emailrisk',
 'socure_emailrisk_reason_code_i566',
 'socure_phonerisk',
 'industry_category_name_retail trade',
 'socure_emailrisk_reason_code_i553',
 'iovation_device_type_android',
 'outgoing_wire_transfers',
 'socure_emailrisk_reason_code_r561',
 'check_deposit_amount',
 'socure_phonerisk_reason_code_i630',
 'socure_reason_code_r207',
 'socure_phonerisk_reason_code_i614',
 'iovation_device_timezone_480',
 'industry_category_name_administrative and support and waste management and remediation services',
 'socure_phonerisk_reason_code_r616',
 'email_domain_bucket',
 'incoming_wire_transfer',
 'industry_category_name_health care and social assistance',
 'socure_phonerisk_reason_code_r639',
 'carrier_tmobile'
]

# Align the scoring frame to the training schema:
#   * keep only `train_cols`, in that order;
#   * a column that does not occur in this batch (e.g. an unseen dummy) becomes all-zero;
#   * remaining NaNs become 0 — the previous DataFrame.update-based code never copied
#     NaNs over its zero default either.
# reindex keeps the numeric dtypes of x_oot instead of building an empty object frame
# and relying on fillna() to silently downcast it.
x_oot = (
    x_oot.reset_index(drop=True)
    .reindex(columns=train_cols, fill_value=0)
    .fillna(0)
)
x_oot.shape

(72026, 30)

#### Load trained model and score

In [36]:
# ###################################
# depth = 4
# weight = 6
# file_name = 'nsql_model_depth_'+str(depth)+'_weight_'+str(weight)+'.pkl'
# path = project_path + 'models/'
# xgb_model = pickle.load(open(project_path+file_name, "rb"))

# Load the already-trained classifier (nothing is fitted in this notebook).
# The context manager closes the file handle, which the bare open() call left open.
# NOTE: pickle.load executes code embedded in the file — only load trusted artefacts.
# NOTE(review): the path is relative to the kernel's working directory, unlike the
# project_path-based paths used elsewhere in this notebook.
with open('../../nsl_v2_model_deployment/models/nsql_model_v2.pkl', "rb") as model_file:
    xgb_model = pickle.load(model_file)

# Feature columns passed to the model, in this order (same names and order as `train_cols`).
top_features = [
 'estimated_monthly_revenue',
 'incoming_ach_payments',
 'sh_sw_ratio_mean',
 'screen_width_mean',
 'industry_category_name_professional, scientific, and technical services',
 'business_group',
 'outgoing_ach_and_checks',
 'socure_sigma',
 'iovation_device_type_mac',
 'industry_category_name_real estate rental and leasing',
 'socure_emailrisk',
 'socure_emailrisk_reason_code_i566',
 'socure_phonerisk',
 'industry_category_name_retail trade',
 'socure_emailrisk_reason_code_i553',
 'iovation_device_type_android',
 'outgoing_wire_transfers',
 'socure_emailrisk_reason_code_r561',
 'check_deposit_amount',
 'socure_phonerisk_reason_code_i630',
 'socure_reason_code_r207',
 'socure_phonerisk_reason_code_i614',
 'iovation_device_timezone_480',
 'industry_category_name_administrative and support and waste management and remediation services',
 'socure_phonerisk_reason_code_r616',
 'email_domain_bucket',
 'incoming_wire_transfer',
 'industry_category_name_health care and social assistance',
 'socure_phonerisk_reason_code_r639',
 'carrier_tmobile'
]


In [37]:
# Score the aligned frame. Only `predicted_probas` is used by the later cells;
# `y_pred` (the model's own class labels) is not referenced again in the visible notebook.
y_pred = xgb_model.predict(x_oot[top_features])
predicted_probas = xgb_model.predict_proba(x_oot[top_features])

In [38]:
# Second column of predict_proba is kept as the score.
x_oot['nsl_v2_prob'] = predicted_probas[:,1]
# Flag = 1 when the score exceeds a threshold that depends on business_group:
#   business_group == 1 (not a sole proprietorship, see feature_engineering_app): score > 0.3
#   business_group == 0 (sole proprietorship):                                    score > 0.5
# Any other case is 0.
# NOTE(review): the two thresholds are hard-coded; their origin is not visible in this notebook.
x_oot['nsl_v2_flag'] = np.where((x_oot['nsl_v2_prob']>0.3) & (x_oot.business_group==1), 1, 
                               np.where((x_oot['nsl_v2_prob']>0.5) & (x_oot.business_group==0), 1, 0)
                               )

In [39]:
x_oot.shape

(72026, 32)

In [40]:
# Attach the scores to the raw frame BY application_id.
# x_oot carries no application_id and its rows come from `oot_df` (an inner merge of the
# raw frame with the segment features), so assigning x_oot columns straight onto df_raw_app
# matched rows by index position only: any application dropped or duplicated by that merge
# shifted every following score onto the wrong application.
# Row i of x_oot corresponds to row i of oot_df, which still has the id.
assert len(oot_df) == len(x_oot), "oot_df and x_oot must describe the same rows"
scores_by_app = (
    pd.DataFrame({
        'application_id': oot_df['application_id'].to_numpy(),
        'nsl_v2_prob': x_oot['nsl_v2_prob'].to_numpy(),
        'nsl_v2_flag': x_oot['nsl_v2_flag'].to_numpy(),
    })
    .drop_duplicates(subset='application_id', keep='first')
    .set_index('application_id')
)
# Applications that were not scored get NaN.
df_raw_app['nsl_v2_prob'] = df_raw_app['application_id'].map(scores_by_app['nsl_v2_prob'])
df_raw_app['nsl_v2_flag'] = df_raw_app['application_id'].map(scores_by_app['nsl_v2_flag'])

In [43]:
# Output table: one row per row of df_raw_app with its id, score and flag
# (this is the frame the commented-out pickle / to_sql export cells below would write).
df_write = df_raw_app[['application_id','nsl_v2_prob','nsl_v2_flag']]

In [44]:
# df_write.to_pickle(project_path+'results/part1_2020_12_31.pkl')

In [None]:
# from sqlalchemy.types import NVARCHAR
# from conf.config import SQLQuery
# q = SQLQuery('snowflake')

# df_write.to_sql(name='nsl_v2_scores_part1',
#                  con=q.engine, 
#                  schema='prod_db.adhoc',
#                  if_exists='append', 
#                  index=False, 
#                  chunksize=16000, 
#                  method='multi',
#                  dtype={col_name: NVARCHAR for col_name in df_write})