### Load Modules

In [1]:
# path variables
import sys
project_path = '/Users/naresh/Downloads/ds_models/onboarding_fraud_model_v2/'
sys.path.insert(0, project_path+'config')
from config import SQLQuery

# core libraries
import pickle
import warnings
warnings.filterwarnings("ignore")
import datetime
import pandas as pd
import numpy as np
import textstat
import nltk
import json
import re
import validators
import requests

from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
from model_evaluations import model_metrics, cross_validation
from model_building import tune_hyperparameters
from sklearn.metrics import roc_curve, precision_recall_curve, precision_score, recall_score, roc_auc_score
from matplotlib import pyplot
from collections import Counter
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from fuzzywuzzy import fuzz
from itertools import permutations
from nltk.util import ngrams
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, BigramCollocationFinder, TrigramCollocationFinder


WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


Custom DS Modules

In [2]:
%load_ext autoreload
%autoreload 2
from stability_monitoring import *

### Load the Dataset

In [3]:
# raw data - apps and alloy sources
year = '2022_2023'
file = 'apps_raw_dataset_2022.pkl'
path = project_path + 'data/'
df_raw = pd.read_pickle(path + file)
df_raw.shape

(89792, 62)

In [4]:
# converting uppercase column names to lowercase column names
cols_lower = []
for col in df_raw.columns:
    cols_lower.append(col.lower())
    df_raw[col.lower()]=df_raw[col]

df_raw = df_raw[cols_lower] # selecting only lowercase columns

In [5]:
# Separating train and test data
x_oot = df_raw


### Filter for raw features

In [6]:
x_oot[['ein_ssn','has_international_business']] = x_oot[['ein_ssn','has_international_business']].astype('bool')

In [7]:
num_features = ['person_fraud_score', 'person_kyc_score', 'sentilink_abuse_score', 'sentilink_first_party_synthetic_score'
            ,'sentilink_third_party_synthetic_score', 'sentilink_id_theft_score', 'socure_sigma', 'socure_emailrisk', 'socure_phonerisk'
            ,'socure_addressrisk','number_of_employees']
num_features2 = ['business_address_zip', 'phone']
bool_features = ['ein_ssn','has_international_business']
cat_features = ['iovation_device_type', 'estimated_monthly_revenue', 'incoming_ach_payments', 'check_deposit_amount'
               , 'incoming_wire_transfer', 'outgoing_ach_and_checks', 'outgoing_wire_transfers', 'line_type', 'industry_category_name']
list_features = ['person_fraud_tags', 'person_kyc_tags', 'socure_reason_code', 'socure_kyc_field_validations'
                ,'socure_kyc_reason_code', 'socure_emailrisk_reason_code', 'socure_phonerisk_reason_code'
                ,'socure_addressrisk_reason_code', 'purpose_of_account', 'touch_point_emails', 'owner_list']
high_cardinality_features = ['iovation_device_timezone', 'iovation_device_ip', 'iovation_device_ip_isp', 'iovation_device_ip_org' 
                            ,'iovation_device_ip_city', 'iovation_device_ip_region', 'carrier', 'email', 'email_domain', 'industry_name'
                            , 'website', 'business_address_city', 'business_address_state', 'industry_category_from_pitch'
                            , 'company_name']
text_features = ['business_pitch']
id_features = ['application_id', 'fraud_score', 'deposit_score']
datetime_features = ['application_start_datetime', 'application_complete_datetime', 'application_resubmitted_datetime']

In [8]:
raw_features = num_features + num_features2 + bool_features + cat_features + list_features + high_cardinality_features + text_features + id_features + datetime_features
x_oot = x_oot[raw_features]
x_oot.shape

(89792, 57)

### Feature creation

In [9]:
df = x_oot

In [10]:
# # convert all string features to lowercase
# string_features = cat_features + list_features + high_cardinality_features + text_features

# for col in string_features:
#     print(col)
#     x_train[col] = x_train[col].str.lower()

In [11]:
# Converts all type of nulls to one null format
def convert_nulls_to_one_format(df:pd.DataFrame):
    for col in df.columns:
        idx = df.index[df[col].isnull()].tolist()
        idx.extend(df.index[df[col].isna()].tolist())
        idx.extend(df.index[df[col] == ''].tolist())
        idx.extend(df.index[df[col] == '[]'].tolist())
        idx = list(set(idx))
        df.loc[idx, col] = None
    return df

In [12]:
# # Saving the imputing values of each feature

# # Numerical
# df_impute_numerical = df[num_features].median()
# df_impute_numerical = pd.DataFrame(df_impute_numerical, columns=['impute_value']).reset_index().rename(columns={'index':'feature'})


# data_impute_list = []
# for col in cat_features+high_cardinality_features+list_features+bool_features:
#     if col not in ['email','touch_point_emails','owner_list','website','company_name']:
#         mode_val = df[col].mode()
#         if len(mode_val)==1:
#             data_impute_list.append([col, mode_val[0]])
#         else:
#             data_impute_list.append([col, mode_val[0]])
# # Categorical
# df_impute_categorical = pd.DataFrame(data_impute_list, columns=['impute_value', 'feature'])

# df_impute_custom = pd.DataFrame([['email','na'], ['touch_point_emails','na'], ['owner_list','na'], ['website','na'],
#                                 ['company_name','na']], columns=['feature','impute_value'])
# # Combining all imputes
# df_impute = pd.concat([df_impute_numerical,df_impute_categorical, df_impute_custom], axis=0)
# df_impute.reset_index(drop=True, inplace=True)

# df_impute.to_pickle(project_path+'models/df_impute.pkl') # Save the impute values as df

In [13]:
# Load impute df
df_impute = pd.read_pickle(project_path+'models/df_impute_'+year+'.pkl')

In [14]:
# Impute all the nulls
def fill_null_values(df_impute:pd.DataFrame, data_df:pd.DataFrame,):
    df_dict = dict(df_impute.values)
    impute_cols = df_impute['feature'].to_list()
    for col in data_df.columns.to_list():
        if col in impute_cols:
            data_df[col] = data_df[col].fillna(df_dict[col])
            impute_cols.remove(col)
    return data_df

In [15]:

# Convert all nulls to one format
df = convert_nulls_to_one_format(df=df)
# Data imputing
df = fill_null_values(df_impute, df)

#### Boolean Features Encoding

In [16]:
df['ein_ssn'] = df['ein_ssn']*1
df['ein_ssn'] = df['ein_ssn'].astype(int)
df['has_international_business'] = df['has_international_business']*1
df['has_international_business'] = df['has_international_business'].astype(int)

### Datetime Fetaures Engineering

In [17]:
# Filling nulls with the start date because the difference between start date and complete date remains less than 24 hours
df['application_complete_datetime'] = np.where(df['application_complete_datetime'].isnull(), df['application_start_datetime'], 
                                               df['application_complete_datetime'])
df['application_start_datetime'] = pd.to_datetime(df['application_start_datetime'])
df['application_complete_datetime'] = pd.to_datetime(df['application_complete_datetime'])

In [18]:
# Function to convert day of month to the week of month
def weekofmonth(val):
    if val<=7:
        return 1
    elif val<=14:
        return 2
    elif val<=21:
        return 3
    else:
        return 4

# Correcting the features values to standard format
replace_weekdaytonumber = {'sunday':0, 'monday':1, 'tuesday':2 , 'wednesday':3, 'thursday':4, 'friday':5, 'saturday':6}

# App start date features
# df['app_start_monthofyear'] = df['application_start_datetime'].dt.month
df['app_start_dateofmonth'] = df['application_start_datetime'].dt.day
df['app_start_weekofmonth'] = df['app_start_dateofmonth'].apply(weekofmonth)
df['app_start_dayofweek'] = df['application_start_datetime'].dt.day_name()
# df['app_start_dayofweek'] = df['app_start_dayofweek'].astype('string').apply(weekdaytonumber)
df['app_start_dayofweek'] = df['app_start_dayofweek'].replace(replace_weekdaytonumber)
df['app_start_hourofday'] = df['application_start_datetime'].dt.hour

# App complete date features
# df['app_complete_monthofyear'] = df['application_complete_datetime'].dt.month
df['app_complete_dateofmonth'] = df['application_complete_datetime'].dt.day
df['app_complete_weekofmonth'] = df['app_complete_dateofmonth'].apply(weekofmonth)
df['app_complete_dayofweek'] = df['application_complete_datetime'].dt.day_name()
# df['app_complete_dayofweek'] = df['app_complete_dayofweek'].astype('string').apply(weekdaytonumber)
df['app_complete_dayofweek'] = df['app_complete_dayofweek'].replace(replace_weekdaytonumber)
df['app_complete_hourofday'] = df['application_complete_datetime'].dt.hour


### Categorical Feature Engineering

In [19]:
col = 'iovation_device_type'
df[col] = df[col].str.lower()
df[col] = np.where(df[col].isin(['iphone', 'mac','ipad', 'ipod']), 'apple', df[col])
df[col] = np.where(df[col].isin(['chromeos', 'linux', 'handheld_other']), 'other', df[col])

estimated_features

In [20]:
estimated_features = ['estimated_monthly_revenue','incoming_ach_payments','check_deposit_amount','incoming_wire_transfer',
                      'outgoing_ach_and_checks','outgoing_wire_transfers']

# Correcting the features values to standard format
estimated_features_replace_values1 = {'$0':'0k', 
                                      '$0 - $1k':'1k', '<$1k':'1k', '$1 - $1k':'1k', 
                                      '$1k - $5k':'1k_plus', '$1k +':'1k_plus', 
                                      '$5k - $20k':'5k_plus', '$5k +':'5k_plus', '$20k - $50k':'5k_plus', 
                                      '$50k +':'50k_plus' }
for col in estimated_features:
    df[col] = df[col].str.lower()
df[estimated_features] = df[estimated_features].replace(estimated_features_replace_values1)

In [21]:
estimated_features_interaction = []
for value in ['0k','1k','1k_plus','5k_plus','50k_plus']:
    estimated_features_interaction.append('estimated_features_'+value+'_count')
    df['estimated_features_'+value+'_count'] = df[estimated_features][df[estimated_features]==value].count(axis=1)

df['estimated_features_same_value_flag'] = df[df[estimated_features_interaction]==6].any(axis=1)*1

In [22]:
estimated_features_outgoing = ['outgoing_ach_and_checks', 'outgoing_wire_transfers']
estimated_features_incoming = list(set(estimated_features)-set(estimated_features_outgoing))

In [23]:
# Replace the ordinal values into numerical ordinal values
estimated_features_replace_values2 = {'0k':0, '1k':500, '1k_plus':1000, '5k_plus':5000, '50k_plus':50000}
df[estimated_features] = df[estimated_features].replace(estimated_features_replace_values2)

In [24]:
# Deriving estimated interaction features
df['total_incoming_vol'] = df[estimated_features_incoming].sum(axis=1)
df['total_outgoing_vol'] = df[estimated_features_outgoing].sum(axis=1)
df['total_txns_vol'] = df['total_incoming_vol'] + df['total_outgoing_vol']
df['cashflow'] = df['total_incoming_vol'] - df['total_outgoing_vol']

df['ach_mrdc_incoming_vol'] = df[['incoming_ach_payments','check_deposit_amount']].sum(axis=1) 
df['ach_mrdc_outgoing_vol'] = df['outgoing_ach_and_checks']
df['ach_mrdc_total_txns_vol'] = df['ach_mrdc_incoming_vol'] + df['ach_mrdc_outgoing_vol']
df['ach_mrdc_cashflow'] = df['ach_mrdc_incoming_vol'] - df['ach_mrdc_outgoing_vol']

df['wire_incoming_vol'] = df['incoming_wire_transfer']
df['wire_outgoing_vol'] = df['outgoing_wire_transfers']
df['wire_total_txns_vol'] = df['wire_incoming_vol'] + df['wire_outgoing_vol']
df['wire_cashflow'] = df['wire_incoming_vol'] - df['wire_outgoing_vol']


# Ratio's
replace_dict = {np.nan:0, np.inf:0, -np.inf:0}
df['outgoing_to_incoming_ratio'] = df['total_outgoing_vol'].div(df['total_incoming_vol']).replace(replace_dict)
df['cashflow_to_total_txns_vol_ratio'] = df['cashflow'].div(df['total_txns_vol']).replace(replace_dict)
df['wire_outgoing_to_incoming_ratio'] = df['outgoing_wire_transfers'].div(df['total_incoming_vol']).replace(replace_dict)
df['ach_mrdc_outgoing_to_incoming_ratio'] = df['outgoing_ach_and_checks'].div(df[['incoming_ach_payments','check_deposit_amount'
                                                                                 ]].sum(axis=1)).replace(replace_dict)
df['incoming_ach_to_revenue_ratio'] = df['incoming_ach_payments'].div(df['estimated_monthly_revenue']).replace(replace_dict)
df['incoming_ach_mrdc_to_revenue_ratio'] = df[['incoming_ach_payments','check_deposit_amount']].sum(axis=1).div(df['estimated_monthly_revenue']).replace(replace_dict)

df['ach_mrdc_total_txns_vol_to_total_txns_vol_ratio'] = (df['ach_mrdc_incoming_vol']+ df['ach_mrdc_outgoing_vol'])/df['total_txns_vol']
df['ach_mrdc_incoming_to_total_incoming_vol_ratio'] = df['ach_mrdc_incoming_vol']/df['total_incoming_vol']
df['ach_mrdc_outgoing_to_total_outgoing_vol_ratio'] = df['ach_mrdc_outgoing_vol']/df['total_outgoing_vol']

df['wire_total_txns_vol_to_total_txns_vol_ratio'] = (df['wire_incoming_vol']+ df['wire_outgoing_vol'])/df['total_txns_vol']
df['wire_incoming_to_total_incoming_vol_ratio'] = df['wire_incoming_vol']/df['total_incoming_vol']
df['wire_outgoing_to_total_outgoing_vol_ratio'] = df['wire_outgoing_vol']/df['total_outgoing_vol']

# Ratios of number of employees
df['total_incoming_vol_to_employees_ratio'] = df['total_incoming_vol'].div(df['number_of_employees']).replace(replace_dict)
df['total_outgoing_vol_to_employees_ratio'] = df['total_outgoing_vol'].div(df['number_of_employees']).replace(replace_dict)
df['total_txns_vol_to_employees_ratio'] = df['total_txns_vol'].div(df['number_of_employees']).replace(replace_dict)
df['cashflow_to_employees_ratio'] = df['cashflow'].div(df['number_of_employees']).replace(replace_dict)

df['wire_incoming_vol_to_employees_ratio'] = df['wire_incoming_vol'].div(df['number_of_employees']).replace(replace_dict)
df['wire_outgoing_vol_to_employees_ratio'] = df['wire_outgoing_vol'].div(df['number_of_employees']).replace(replace_dict)
df['wire_total_txns_vol_to_employees_ratio'] = df['wire_total_txns_vol'].div(df['number_of_employees']).replace(replace_dict)
df['wire_cashflow_to_employees_ratio'] = df['wire_cashflow'].div(df['number_of_employees']).replace(replace_dict)

df['ach_mrdc_incoming_vol_to_employees_ratio'] = df['ach_mrdc_incoming_vol'].div(df['number_of_employees']).replace(replace_dict)
df['ach_mrdc_outgoing_vol_to_employees_ratio'] = df['ach_mrdc_outgoing_vol'].div(df['number_of_employees']).replace(replace_dict)
df['ach_mrdc_total_txns_vol_to_employees_ratio'] = df['ach_mrdc_total_txns_vol'].div(df['number_of_employees']).replace(replace_dict)
df['ach_mrdc_cashflow_to_employees_ratio'] = df['ach_mrdc_cashflow'].div(df['number_of_employees']).replace(replace_dict)

df['estimated_monthly_revenue_to_employees_ratio'] = df['estimated_monthly_revenue'].div(df['number_of_employees']).replace(replace_dict)


In [25]:
# tot = 0
# tot_fraud = 0
# for col in estimated_features_interaction:
#     print(col, ':',df[df[col]==6].shape[0], df[df[col]==6].target.mean())
#     tot = tot+df[df[col]==6].shape[0]
#     tot_fraud = tot_fraud + df[df[col]==6].target.sum()
# tot, tot_fraud, tot_fraud/tot, tot/df.shape[0], df.target.sum(), (df.target.sum()-tot_fraud)/(df.shape[0]-tot)

In [26]:

# Grouping the features values to the observed variance
def estimated_feature_engg(value):
    if value == None:
        return None
    value = value.lower()
    if value in ['$0', '$0 - $1k', '<$1k']:
        # return 'upto_1k'
        return 0
    elif value in ['$1k +', '$5k +', '$50k +']:
        # return '1k_plus'
        return 1
    else:
        return value

# Grouping the features values to the observed variance
def estimated_feature_engg(value):
    if value == None:
        return None
    if value<=500:
        return 0
    elif value>=1000:
        return 1
    else:
        return value

for col in estimated_features:
    df[col] = df[col].apply(estimated_feature_engg)

In [27]:
col = 'industry_category_name'
df[col] = df[col].str.lower()
df[col] = np.where(df[col].isin(['utilities','mining', 'agriculture, forestry, fishing and hunting', 'wholesale trade', 
                                 'accommodation and food services', 'administrative and support and waste management and remediation services',
                                 'construction', 'finance and insurance', 'mining', 'other services', 'health care and social assistance',
                                'manufacturing', 'public administration']), 'cat1', 'cat2')

### High Cardinality Features

In [28]:
col = 'iovation_device_timezone'
df[col] = df[col].astype('float64')
df[col] = np.where(df[col].isin([300,360,480]), df[col], 'other')

In [29]:
col = 'email_domain'
l1 = ['gmail.com']
l2 = ['yahoo.com','outlook.com','icloud.com','hotmail.com','aol.com','protonmail.com']
df[col] = np.where(df[col].isin(l1), 'l1', np.where(df[col].isin(l2), 'l2', 'other'))

In [30]:
col = 'iovation_device_ip_isp'
df[col] = df[col].str.lower()
df[col] = np.where(df[col].str.contains('t-mobile*'),'tmobile',df[col])
df[col] = np.where(df[col].str.contains('verizon'),'verizon',df[col])
df[col] = np.where(df[col].str.contains('at&t'),'att',df[col])
df[col] = np.where(df[col].str.match('att'),'att',df[col])
df[col] = np.where(df[col].str.match('charter [b|c]'),'charter',df[col])
df[col] = np.where(df[col].str.match('comcast'),'comcast',df[col])
df[col] = np.where(df[col].isin(['tmobile','att','charter','verizon','comcast']),df[col],'other')

In [31]:
col = 'iovation_device_ip_org'
df[col] = df[col].str.lower()
df[col] = np.where(df[col].str.contains('t-mobile*'),'tmobile',df[col])
df[col] = np.where(df[col].str.contains('verizon'),'verizon',df[col])
df[col] = np.where(df[col].str.contains('at&t'),'att',df[col])
df[col] = np.where(df[col].str.match('att'),'att',df[col])
df[col] = np.where(df[col].str.match('charter [b|c]'),'charter',df[col])
df[col] = np.where(df[col].str.match('comcast'),'comcast',df[col])
df[col] = np.where(df[col].isin(['tmobile','att','charter','verizon','comcast']),df[col],'other')

In [32]:
col = 'carrier'
df[col] = df[col].str.lower()
df[col] = np.where(df[col].str.contains('t-mobile*'),'tmobile',df[col])
df[col] = np.where(df[col].str.contains('verizon'),'verizon',df[col])
df[col] = np.where(df[col].str.contains('at&t'),'att',df[col])
df[col] = np.where(df[col].str.match('att'),'att',df[col])
df[col] = np.where(df[col].isin(['tmobile','att','verizon']),df[col],'other')

In [33]:
high_cardinality_features

['iovation_device_timezone',
 'iovation_device_ip',
 'iovation_device_ip_isp',
 'iovation_device_ip_org',
 'iovation_device_ip_city',
 'iovation_device_ip_region',
 'carrier',
 'email',
 'email_domain',
 'industry_name',
 'website',
 'business_address_city',
 'business_address_state',
 'industry_category_from_pitch',
 'company_name']

In [34]:
def ip_address_class(val:str):
    if val==None or val=='':
        return None
    else:
        if len(val)<=15:
            val_split = val.split('.')
            if 0<=int(val_split[0])<=127:
                class_val = 'a'
            elif 128<=int(val_split[0])<=191:
                class_val = 'b'
            elif 192<=int(val_split[0])<=223:
                class_val = 'c'
            elif 224<=int(val_split[0])<=239:
                class_val = 'd'
            elif 240<=int(val_split[0])<=255:
                class_val = 'e'
            else:
                class_val = 'na'
            return class_val
        else:
            return None

col = 'iovation_device_ip'
df['ip_class'] = df[col].apply(ip_address_class)

business_pitch

In [35]:
def extract_readability_features(text):
    if text==None or text=='':
        return_features = {'flesch_reading_ease': 0, 'smog_index': 0, 'flesch_kincaid_grade': 0, 'coleman_liau_index': 0, 
                           'automated_readability_index': 0, 'dale_chall_readability_score': 0, 'difficult_words': 0,
                           'linsear_write_formula': 0, 'gunning_fog': 0, 'text_standard': '-1th and 0th grade'}
        return_features = tuple(return_val.values())
    
    else:
        return_features = {
            'flesch_reading_ease': textstat.flesch_reading_ease(text),
            'smog_index': textstat.smog_index(text),
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
            'coleman_liau_index': textstat.coleman_liau_index(text),
            'automated_readability_index': textstat.automated_readability_index(text),
            'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
            'difficult_words': textstat.difficult_words(text),
            'linsear_write_formula': textstat.linsear_write_formula(text),
            'gunning_fog': textstat.gunning_fog(text),
            'text_standard': textstat.text_standard(text)
        }
        return_features = tuple(return_features.values())
        
    return return_features


In [36]:
df.shape

(89792, 109)

In [37]:
col = 'business_pitch'
readability_columns = ['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index', 'automated_readability_index',
                       'dale_chall_readability_score', 'difficult_words', 'linsear_write_formula', 'gunning_fog', 'text_standard']
df[readability_columns] = pd.DataFrame(df[col].apply(extract_readability_features).tolist(),index=df.index)


In [38]:
# engg text_standard feature
# text_standard_vals = list(set(df['text_standard'].values))
# text_standard_vals.sort()
# text_standard_vals_replace = [0, -1, 1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 3, 31, 32, 326, 
#                               33, 34, 35, 36, 37, 39, 4, 41, 43, 5, 54, 6, 68, 7, 8, 9, 10]
# text_standard_vals_replace = [x + 1 for x in text_standard_vals_replace]
# df['text_standard_ordinals'] = df['text_standard'].replace(text_standard_vals, text_standard_vals_replace)

# text_standard feature grouping
df['text_standard_levels'] = np.where(df['text_standard'].isin(['6th and 7th grade','7th and 8th grade','8th and 9th grade', '9th and 10th grade']
                                               ), 'l2', (np.where(df['text_standard'].isin(['-1th and 0th grade','0th and 1st grade','1st and 2nd grade','2nd and 3rd grade','3rd and 4th grade',
                                                                                            '4th and 5th grade','5th and 6th grade']), 'l1', 'l3'))
                      )

In [39]:
# Linguistic features
def extract_linguistic_features(text):
   
    if text==None or text=='':
        return (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
    else:
        # print(text,'\n\n')
        words = nltk.word_tokenize(text)
        total_tokens = len(words)
        sentences = nltk.sent_tokenize(text)
        syllable_count = sum(textstat.syllable_count(word) for word in words)
        polysyllable_count = len([word for word in words if textstat.syllable_count(word) >= 3])
        pos_tags = nltk.pos_tag(words)
        pos_counts = Counter(tag for word, tag in pos_tags)

        # POS Tag Ratios
        pos_ratios = {tag: count / total_tokens for tag, count in pos_counts.items()}
        pos_ratios = dict(zip(['pos_ratio_'+key for key in pos_ratios.keys()], pos_ratios.values())) # adding prefix to the keys

        # # POS Tag Sequences
        # pos_sequence = ' '.join(tag for word, tag in pos_tags)
        # words_pos = nltk.word_tokenize(pos_sequence)

        # # POS Tag N-grams
        # bigrams = list(ngrams([tag for word, tag in pos_tags], 2))
        # trigrams = list(ngrams([tag for word, tag in pos_tags], 3))
        # # Count bigrams and trigrams
        # bigram_counts = Counter(bigrams)
        # trigram_counts = Counter(trigrams)

        # # Find important bigrams using mutual information
        # bigram_measures = BigramAssocMeasures()
        # bigram_finder = BigramCollocationFinder.from_words(words)
        # important_bigrams = bigram_finder.nbest(bigram_measures.pmi, 3)
        
        # # Find important trigrams using mutual information
        # trigram_measures = TrigramAssocMeasures()
        # trigram_finder = TrigramCollocationFinder.from_words(words)
        # important_trigrams = trigram_finder.nbest(trigram_measures.pmi, 3)

        # POS Diversity
        pos_diversity = len(pos_counts) / total_tokens

        # Custom POS Tag Features (e.g., Noun to Verb Ratio)
        noun_count = sum(count for tag, count in pos_counts.items() if tag.startswith('NN'))
        verb_count = sum(count for tag, count in pos_counts.items() if tag.startswith('VB'))
        noun_verb_ratio = noun_count / (verb_count + 1)
        
        blob = TextBlob(text)
        analyzer = SentimentIntensityAnalyzer()
        vader_sentiment = analyzer.polarity_scores(text)

        pos_counts = dict(zip(['pos_count_'+key for key in pos_counts.keys()], pos_counts.values())) # adding prefix to the keys
        linguistic_features = {
            'total_word_count': total_tokens,
            'sentence_count': len(sentences),
            'average_sentence_length': total_tokens / len(sentences),
            'average_word_length': sum(len(word) for word in words) / total_tokens,
            'syllable_count': syllable_count,
            'polysyllable_count': polysyllable_count,
            'unique_words_count': len(set(words)),
            'type_token_ratio': len(set(words)) / total_tokens,
            'lexical_density': sum(1 for tag in pos_tags if tag[1] in ['NN', 'VB', 'JJ', 'RB']) / total_tokens,
            'pos_counts': pos_counts,
            'textblob_polarity': blob.sentiment.polarity, 
            'textblob_subjectivity': blob.sentiment.subjectivity,
            'vader_neg': vader_sentiment['neg'],
            'vader_neu': vader_sentiment['neu'],
            'vader_pos': vader_sentiment['pos'],
            'vader_compound': vader_sentiment['compound'],
            'pos_diversity': pos_diversity,
            'noun_to_verb_ratio': noun_verb_ratio,
            'pos_ratios':pos_ratios

        }
        linguistic_features = tuple(linguistic_features.values())
        
        return linguistic_features
        

In [40]:
col = 'business_pitch'
linguistic_columns = ['total_word_count','sentence_count','average_sentence_length','average_word_length','syllable_count',
                      'polysyllable_count','unique_words_count','type_token_ratio','lexical_density','pos_counts','textblob_polarity',
                      'textblob_subjectivity','vader_neg','vader_neu','vader_pos','vader_compound','pos_diversity','noun_to_verb_ratio',
                      'pos_ratios']
df[linguistic_columns] = pd.DataFrame(df[col].apply(extract_linguistic_features).tolist(),index=df.index)

# exploding POS tags count features and creating important columns
pos_tags_count_features = ['pos_count_NN','pos_count_NNS','pos_count_NNP','pos_count_NNPS','pos_count_VB','pos_count_VBP','pos_count_VBG', 
                           'pos_count_VBN','pos_count_VBZ','pos_count_VBD','pos_count_CC','pos_count_PRP','pos_count_JJ','pos_count_TO',
                           'pos_count_IN','pos_count_RB']
df = pd.concat([df,pd.json_normalize(df['pos_counts'])[pos_tags_count_features].fillna(0)], axis=1)

# exploding POS tags ratio features and creating important columns
pos_tags_ratio_features = ['pos_ratio_NN','pos_ratio_NNS','pos_ratio_NNP','pos_ratio_NNPS','pos_ratio_VB','pos_ratio_VBP','pos_ratio_VBG', 
                           'pos_ratio_VBN','pos_ratio_VBZ','pos_ratio_VBD','pos_ratio_CC','pos_ratio_PRP','pos_ratio_JJ','pos_ratio_TO', 
                           'pos_ratio_IN','pos_ratio_RB']
df = pd.concat([df,pd.json_normalize(df['pos_ratios'])[pos_tags_ratio_features].fillna(0)], axis=1)

In [41]:
linguistic_columns2 = list(set(linguistic_columns)-set(['pos_counts','pos_ratios']))
df[linguistic_columns2] = df[linguistic_columns2].fillna(0)

### List Values Feature Engineering

In [42]:
socure_cols = ['socure_phonerisk_reason_code','socure_addressrisk_reason_code','socure_emailrisk_reason_code'
              ,'socure_reason_code', 'socure_kyc_reason_code']
df_socure = df[['application_id'] + socure_cols]

In [43]:
# Cleaning the list of reason codes
def clean_socure_strings(x):
    if x != None:
        x = [val.strip("""'|[| |"|,|]""") for val in x.split('\n') if not val.strip("""'|[| |"|,|]""") in ['',None]]
    return x

socure_cols = ['socure_phonerisk_reason_code','socure_addressrisk_reason_code','socure_emailrisk_reason_code','socure_reason_code','socure_kyc_reason_code']
df_socure = df[['application_id'] + socure_cols]

# Collecting the cleaned reason codes
for col in socure_cols:
    df_socure[col] = df_socure[col].str.lower()
    df_socure[col] = df_socure[col].apply(clean_socure_strings)
    

In [44]:
socure_cols = ['socure_phonerisk_reason_code','socure_addressrisk_reason_code','socure_emailrisk_reason_code','socure_reason_code','socure_kyc_reason_code']

dict_cols = {}
for col in socure_cols:
    df_socure = df_socure.drop(col, 1).join(df_socure[col].str.join('|').str.get_dummies())
    socure_cols = df_socure.columns[df_socure.columns.str.startswith('socure')].to_list()
    true_cols = list(set(df_socure.columns.to_list()) - set(socure_cols) - set(['application_id','target']))
    new_cols = []
    for col2 in true_cols:
        new_cols.append(col+'_'+col2)
    dict_cols = dict(zip(true_cols, new_cols)) | dict_cols
    df_socure.rename(columns=dict_cols, inplace=True)

df_socure = df_socure.T
df_socure = df_socure[~df_socure.index.duplicated(keep='first')].T

In [45]:
socure_phone_cols = df_socure.columns[df_socure.columns.str.startswith('socure_phone')].to_list()
socure_address_cols = df_socure.columns[df_socure.columns.str.startswith('socure_address')].to_list()
socure_email_cols = df_socure.columns[df_socure.columns.str.startswith('socure_email')].to_list()
socure_reason_cols = df_socure.columns[df_socure.columns.str.startswith('socure_reason')].to_list()
socure_kyc_cols = df_socure.columns[df_socure.columns.str.startswith('socure_kyc')].to_list()

df_socure['socure_phonerisk_code_count'] = df_socure[socure_phone_cols].sum(axis=1)
df_socure['socure_addressrisk_code_count'] = df_socure[socure_address_cols].sum(axis=1)
df_socure['socure_emailrisk_code_count'] = df_socure[socure_email_cols].sum(axis=1)
df_socure['socure_reason_code_count'] = df_socure[socure_reason_cols].sum(axis=1)
df_socure['socure_kyc_code_count'] = df_socure[socure_kyc_cols].sum(axis=1)
df_socure['socure_all_codes_count'] = df_socure[socure_phone_cols+socure_address_cols+socure_email_cols+socure_reason_cols+socure_kyc_cols].sum(axis=1)


In [46]:
# Loading only the trained columns and adding to the test data
socure_cols_derived = pd.read_pickle(project_path+'models/socure_reason_codes_columns_'+year+'.pkl')
df_tmp = pd.DataFrame(index=range(df.shape[0]),columns=socure_cols_derived['feature'].to_list())
df_tmp = df_tmp.fillna(0)

df_tmp.update(df_socure)
df_tmp = df_tmp.astype('int')
df = pd.concat([df,df_tmp], axis=1)

socure_kyc_field_validations

In [47]:
def socure_kyc_field_extraction(val):
    if val==None or val==[] or val=={} or val=='':
        return (None, None, None, None, None, None, None, None, None)
    else:
        val = json.loads(val.lower())
        city = val['city']
        dob = val['dob']
        firstname = val['firstname']
        mobilenumber = val['mobilenumber']
        ssn = val['ssn']
        state = val['state']
        streetaddress = val['streetaddress']
        surname = val['surname']
        zip_val = val['zip']

        return (surname, firstname, dob, mobilenumber, ssn, state, city, streetaddress, zip_val)


In [48]:
socure_kyc_fields_cols = ['socure_kyc_surname_score', 'socure_kyc_firstname_score', 'socure_kyc_dob_score', 'socure_kyc_mobilenumber_score', 
                         'socure_kyc_ssn_score', 'socure_kyc_state_score', 'socure_kyc_city_score', 'socure_kyc_streetaddress_score', 'socure_kyc_zip_score']
df[socure_kyc_fields_cols] = pd.DataFrame(df.socure_kyc_field_validations.apply(socure_kyc_field_extraction).tolist(),index=df.index)


purpose_of_account

In [49]:
col = 'purpose_of_account'

# There are 5 unique purpose of accounts categories
unique_vals = ['payroll','accounting','operating','travel expenses','business expenses']
# Creating dummies for each unique string
purpose_of_accounts_derived_cols = []
for val in unique_vals:
    new_col = 'purpose_'+val.replace(" ", "_")
    purpose_of_accounts_derived_cols.append(new_col)
    df[new_col] = 0
    idx = df.index[df[col].str.contains(val)==True].tolist()
    df.loc[idx, new_col] = 1

df['purpose_of_account_count'] = df[purpose_of_accounts_derived_cols].sum(axis=1)

In [50]:
# unique_cols = ['payroll','accounting','operating','travel_expenses','business_expenses']
# for i in unique_cols:
#     print(df.groupby(['purpose_'+i])['target'].agg(['sum', 'mean']))

person_fraud_tags

In [51]:
col = 'person_fraud_tags'
df[col] = df[col].str.lower()
df[col+'_flag'] = np.where(df[col].str.contains('not')==True, 1, 0)

person_kyc_tags

In [52]:
col = 'person_kyc_tags'
df[col] = df[col].str.lower()
df[col+'_flag'] = np.where(df[col].str.contains('not')==True, 1, 0)

owner_list

In [53]:
# Cleaning the list of string and extracting owner name
def clean_owner_list_strings(x):
    if x == None:
        return ''
    x = x.strip("""'|[| |"|,|]""")
    if x != '' :
        try:
            x = [val.strip("""'|[| |"|,|]""") for val in x.split('\n') if not val.strip("""'|[| |"|,|]""") in ['',None,[]]]
            x = ''.join(x[0].split(',')[:-3])
            x = re.sub(r'\s+', ' ', x)
        except:
            return x
    return x

df['owner_name'] = df[col].apply(clean_owner_list_strings)

In [54]:
# Function to get a fuzzy match score of name against email
def name_matching_with_email(email_id: str, owner_name: str):
    if email_id == None or email_id == '' or owner_name.strip('[|]') == '':
        return (0,0,0)
    # print(email_id)
    email_id = email_id.lower()
    owner_name = owner_name.lower()
    
    email_str1 = re.sub('[0-9]', '',' '.join(email_id.split('@')[0].split('.')))
    if len(email_str1)>=3 :
        name_str1 = [val.strip(' ') for val in re.split(' |-',owner_name) if not val.strip(' ') in ['']]
        permute_list = list(permutations(name_str1))
        max_ratio = 0
        max_partial = 0
        max_token = 0
        name_match_score = 0
        
        for i in range(len(name_str1)):
    
            if len(name_str1[i])>=3 and name_str1[i] in email_id:
                name_match_score = name_match_score + (100/len(name_str1))
            
            permute_str1 = ' '.join(list(permute_list[i]))
            permute_str2 = ''.join(list(permute_list[i]))
            
            score_ratio1 = fuzz.ratio(email_str1, permute_str1)
            score_partial1 = fuzz.partial_ratio(email_str1, permute_str1)
            score_token1 = fuzz.token_sort_ratio(email_str1, permute_str1)
    
            score_ratio2 = fuzz.ratio(email_str1, permute_str2)
            score_partial2 = fuzz.partial_ratio(email_str1, permute_str2)
            score_token2 = fuzz.token_sort_ratio(email_str1, permute_str2)
    
            max_ratio = max([max_ratio, score_ratio1, score_ratio2])
            max_partial = max([max_partial, score_partial1, score_partial2])
            max_token = max([max_token, score_token1, score_token2])

    else:
        return (0,0,0)
        
    return (max_ratio, max_partial, name_match_score)

In [55]:
ownername_email_match = ['ownername_email_fuzzy_match_ratio','ownername_email_fuzzy_match', 'ownername_email_substring_match']
df[ownername_email_match] = pd.DataFrame(df.apply(lambda x: name_matching_with_email(x['email'], x['owner_name']), axis=1
                                                     ).tolist(),index=df.index)

companyname_email_match = ['companyname_email_fuzzy_match_ratio','companyname_email_fuzzy_match', 'companyname_email_substring_match']
df[companyname_email_match] = pd.DataFrame(df.apply(lambda x: name_matching_with_email(x['email'], x['company_name']), axis=1
                                                     ).tolist(),index=df.index)

In [56]:
df['company_name_email_match_flag'] = np.where(((df.companyname_email_fuzzy_match_ratio>=75)|(df.companyname_email_substring_match>=20)) |
((df.companyname_email_fuzzy_match_ratio>=70) & (df.companyname_email_fuzzy_match>=80)), 1, 0)

df['owner_name_email_match_flag'] = np.where(((df.ownername_email_fuzzy_match_ratio>=75)|(df.ownername_email_substring_match>=20)) |
((df.ownername_email_fuzzy_match_ratio>=70) & (df.ownername_email_fuzzy_match>=80)), 1, 0)

In [57]:
df.shape

(89792, 417)

In [58]:
df.to_pickle(project_path+'data/all_apps_engg_data_'+year+'.pkl')

In [59]:
independent_features = pd.read_pickle(project_path+'data/train_independent_features_'+year+'.pkl')['feature'].to_list()
df = df[independent_features]

In [60]:
data_type_dict = df.dtypes.astype(str).to_dict()
data_type_mapping = {'float64':np.float64,'object':object,'bool':bool,'Int64':np.int64}
data_type_dict = {k: data_type_mapping[v] for k, v in data_type_dict.items() if v in data_type_mapping}

df = df.astype(data_type_dict)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89792 entries, 0 to 89791
Columns: 374 entries, socure_phonerisk_reason_code_r662 to dale_chall_readability_score
dtypes: float64(90), int64(273), object(11)
memory usage: 256.2+ MB


In [62]:
# Removing reason code features as these are already encoded

# Encoding the categorical features
x_object_cols = df.select_dtypes(include=['object']).columns.to_list()
x_object_onehot = pd.get_dummies(df[x_object_cols]) # create dummies
x_object_onehot = x_object_onehot.astype('int')
df = pd.concat([df.drop(columns=x_object_cols), x_object_onehot], axis=1)
df.columns = df.columns.str.lower() # convert column names to lower case


In [63]:
# Filter only the final independent features used in the training data
train_cols = pd.read_pickle(project_path+'models/train_data_columns_'+year+'.pkl')['feature'].to_list()
df_tmp = pd.DataFrame(index=range(df.shape[0]),columns=train_cols)
df_tmp = df_tmp.fillna(0) # creating a dummy df using the columns from train dataset
df_tmp.update(df) # update the dummy df with test df values
df = df_tmp.copy()
df.shape

(89792, 412)

In [70]:
# Load the PCA model
with open(path+'pca_model.pkl', 'rb') as file:
    pca_loaded = pickle.load(file)
df = pd.DataFrame(pca_loaded.transform(df))

In [71]:
file_name = 'catboost_model_pca_'+year+'.pkl'
path = project_path + 'models/'
model = pickle.load(open(path+file_name, "rb"))


In [72]:
all_apps_pred = model.predict_proba(df)[:,1:].flatten()

CatBoostError: /Users/zomb-ml-platform-msk/go-agent-21.2.0/pipelines/BuildMaster/catboost.git/catboost/libs/data/model_dataset_compatibility.cpp:81: At position 0 should be feature with name pca_1 (found 0).

In [66]:
df_raw['prob'] = all_apps_pred

In [67]:
df_raw[df_raw.prob>=0.6].shape[0]/df_raw.shape[0]

0.08426140413399857

In [68]:
datetime.datetime.now()

datetime.datetime(2024, 6, 11, 11, 27, 45, 28935)