In [1]:
# path variables
import sys
project_path = '/Users/naresh/Downloads/DS/growth/nsl_v2/nsl_v2_final/'

# core libraries
import datetime
import pandas as pd
from IPython.display import display
import numpy as np
import re
import nltk
import spacy
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
stop_words = stopwords.words('english')
nlp = spacy.load("en_core_web_lg")
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# load custom modules
from model_building import split_test_train

In [2]:
# raw data
file = 'nsl_raw_dataset_2023-05-30.pkl'
path = project_path + 'data/'
df_raw = pd.read_pickle(path + file)
df_raw.shape

(124703, 163)

In [3]:
# train test flag
file = 'nsl_train_test_flag_2023-05-30.pkl'
path = project_path + 'data/'
df_train_test = pd.read_pickle(path + file)
df_train_test.shape

(124703, 6)

In [4]:
# processed business pitch
file = 'nsl_bp_processed_dataset_2023-05-19.pkl'
path = project_path + 'data/'
df_bp = pd.read_pickle(path + file)
df_raw = pd.merge(df_raw, df_bp[['application_id','business_pitch_lema_spacy']], on='application_id', how='left')

In [5]:
df = pd.merge(df_train_test[['application_id', 'train_flag', 'test_flag']],df_raw, on='application_id', how='left')
x_train = df[df['train_flag'] == 1]
y_train = df[df['train_flag'] == 1]['ns_flag']
x_test = df[df['test_flag'] == 1]
y_test = df[df['test_flag'] == 1]['ns_flag']

In [6]:
df = x_train.copy(deep=True)
df.shape

(87292, 166)

In [7]:
cat_columns = ['number_of_employees', 'ein_ssn', 'estimated_monthly_revenue', 'incoming_ach_payments', 'check_deposit_amount', 'incoming_wire_transfer'
               , 'outgoing_ach_and_checks', 'outgoing_wire_transfers', 'email_domain', 'purpose_of_account', 'current_bank', 'industry_category_name', 'business_type',
               'business_address_city', 'business_address_state', 'website']

In [8]:
# Categorical
df_impute_apps = df[cat_columns].mode().T
df_impute_apps = df_impute_apps.reset_index().rename(columns={'index':'feature', 0:'impute_value'})
df_impute_apps.to_pickle(project_path+'models/df_impute_apps.pkl') # Save the impute values as df


#### Raw features

In [9]:
# convert all string features to lowercase
string_features = ['email_domain',
 'purpose_of_account',
 'current_bank',
 'industry_category_name',
 'business_type',
 'business_address_city',
 'business_address_state',
 'website']

for col in string_features:
    df[col] = df[col].str.lower()

#### Business type

In [10]:
df['business_group'] = np.where(df['business_type'] == 'sole_proprietorship', 'sp', 'non-sp')
df['business_group'].value_counts()

non-sp    59318
sp        27974
Name: business_group, dtype: int64

In [11]:
pd.crosstab(df['business_group'], df['ns_flag'], normalize='index')

ns_flag,0,1
business_group,Unnamed: 1_level_1,Unnamed: 2_level_1
non-sp,0.896743,0.103257
sp,0.952098,0.047902


#### Email Domain

In [12]:
df['email_domain_bucket'] = np.where(df['email_domain'].isin(list(df.groupby("email_domain").filter(lambda x: (len(x) >= 10))['email_domain'].unique())), 'major_domain', 'custom_domain')
                            # np.where(df['email_domain'].isin(list(df.groupby("email_domain").filter(lambda x: ((len(x) > 1) & (len(x) < 10)))['email_domain'].unique())), 'franchise_domain', 'personal_domain'))
df['email_domain_bucket'].value_counts()

major_domain     60868
custom_domain    26424
Name: email_domain_bucket, dtype: int64

In [13]:
pd.DataFrame(list(df.groupby("email_domain").filter(lambda x: (len(x) >= 10))['email_domain'].unique()), columns=['email_domain']).to_pickle(project_path+'models/email_domain_group.pkl')


In [14]:
pd.crosstab(df['email_domain_bucket'], df['ns_flag'], normalize='index')

ns_flag,0,1
email_domain_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
custom_domain,0.881963,0.118037
major_domain,0.9286,0.0714


#### Estimated Numbers

In [15]:
estimated_cols = ['estimated_monthly_revenue',
                  'incoming_ach_payments',
                  'check_deposit_amount',
                  'incoming_wire_transfer',
                  'outgoing_ach_and_checks',
                  'outgoing_wire_transfers']

# grouping all responses into 5K+ and 5K-
for col in estimated_cols:
    df[col] = df[col].str.lower()
    df[col] = np.where(df[col].isin(['$5k +', '$50k +']), '$5K+', '$5K-')
    print(df[col].value_counts())

$5K-    54091
$5K+    33201
Name: estimated_monthly_revenue, dtype: int64
$5K-    64093
$5K+    23199
Name: incoming_ach_payments, dtype: int64
$5K-    68905
$5K+    18387
Name: check_deposit_amount, dtype: int64
$5K-    73694
$5K+    13598
Name: incoming_wire_transfer, dtype: int64
$5K-    72097
$5K+    15195
Name: outgoing_ach_and_checks, dtype: int64
$5K-    77108
$5K+    10184
Name: outgoing_wire_transfers, dtype: int64


In [16]:
for col in estimated_cols:
    display(pd.crosstab(df[col], df['ns_flag'], normalize='index'))

ns_flag,0,1
estimated_monthly_revenue,Unnamed: 1_level_1,Unnamed: 2_level_1
$5K+,0.847746,0.152254
$5K-,0.955445,0.044555


ns_flag,0,1
incoming_ach_payments,Unnamed: 1_level_1,Unnamed: 2_level_1
$5K+,0.836071,0.163929
$5K-,0.942864,0.057136


ns_flag,0,1
check_deposit_amount,Unnamed: 1_level_1,Unnamed: 2_level_1
$5K+,0.864034,0.135966
$5K-,0.927944,0.072056


ns_flag,0,1
incoming_wire_transfer,Unnamed: 1_level_1,Unnamed: 2_level_1
$5K+,0.869613,0.130387
$5K-,0.922762,0.077238


ns_flag,0,1
outgoing_ach_and_checks,Unnamed: 1_level_1,Unnamed: 2_level_1
$5K+,0.825403,0.174597
$5K-,0.933257,0.066743


ns_flag,0,1
outgoing_wire_transfers,Unnamed: 1_level_1,Unnamed: 2_level_1
$5K+,0.870679,0.129321
$5K-,0.920268,0.079732


#### Number of employees

In [17]:
df["number_of_employees"] = pd.to_numeric(df["number_of_employees"])
df['number_of_employees_bin'] = df["number_of_employees"]
df['number_of_employees_bin'] = np.where((df['number_of_employees']>=2) & (df['number_of_employees']<=5), '2-5', 
                                np.where((df['number_of_employees']>5), '5+', df['number_of_employees_bin']))
df['number_of_employees_bin'].value_counts()

1      57621
2-5    26459
5+      3212
Name: number_of_employees_bin, dtype: int64

In [18]:
pd.crosstab(df['number_of_employees_bin'], df['ns_flag'], normalize='index')

ns_flag,0,1
number_of_employees_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.917096,0.082904
2-5,0.912695,0.087305
5+,0.882316,0.117684


#### Purpose of account

In [19]:
# Purpose of account - options selected
df['purpose_of_account_options_selected'] = df['purpose_of_account'].str.count(',')+1
df['purpose_of_account_options_selected'] = np.where(df['purpose_of_account_options_selected']<=3, '3-', '3+')
df['purpose_of_account_options_selected'].value_counts()

3-    63904
3+    23388
Name: purpose_of_account_options_selected, dtype: int64

In [20]:
pd.crosstab(df['purpose_of_account_options_selected'], df['ns_flag'], normalize='index')

ns_flag,0,1
purpose_of_account_options_selected,Unnamed: 1_level_1,Unnamed: 2_level_1
3+,0.903241,0.096759
3-,0.918597,0.081403


#### Industry type

In [21]:
# industry_df = pd.crosstab(df['industry_category_name'], df['ns_flag'], normalize='index')
# hdi_group = industry_df[industry_df[1] > df['ns_flag'].value_counts(normalize=True)[1]].index.to_list()
# df['industry_group'] = np.where(df['industry_category_name'].isin(hdi_group), 'hdi_group', 'ldi_group')
# df['industry_group'].value_counts()

# pd.DataFrame(hdi_group, columns=['value']).to_pickle(project_path+'models/hdi_group.pkl')
# pd.crosstab(df['industry_group'], df['ns_flag'], normalize='index')

In [22]:
# col = 'industry_category_name'
# df[col] = df[col].str.lower()
# tmp1 = pd.DataFrame(df[col].value_counts()).reset_index().rename(columns={col:'volume'})
# tmp2 = pd.DataFrame(df.groupby([col]).ns_flag.mean()).reset_index().rename(columns={'ns_flag':'target_rate'})

# tmp = pd.merge(tmp1, tmp2, left_on='index',right_on=col, how='inner')

# tmp['vol_percent'] = tmp['volume'] * 100/df.shape[0]
# tmp['target'] = np.round(tmp['volume'] * tmp['target_rate'] , 0)
# tmp = tmp[[col]+['volume','vol_percent','target_rate','target']].sort_values(by=['vol_percent'], ascending=False)

# high_ns_values = tmp[(tmp.vol_percent>4) & (tmp.target_rate>=df.ns_flag.mean())][col].to_list()
# low_ns_values = tmp[(tmp.vol_percent>4) & (tmp.target_rate<df.ns_flag.mean())][col].to_list()


#### Business Age

In [23]:
df['date_of_establishment_dt'] = pd.to_datetime(df['date_of_establishment'], format='%Y-%m', errors='coerce').dropna()
df['business_age'] = round((df['application_start_datetime'] - df['date_of_establishment_dt']) / np.timedelta64(1, 'Y'),2)
df['business_age'] = df['business_age'].fillna(0)
df['business_age_bucket'] = np.where(df['business_age'] <= 1, '1', 
                                    np.where((df['business_age']>1) & (df['business_age']<=3), '1-3', '3+'))
df['business_age_bucket'].value_counts().sort_values()

3+      6660
1-3    10655
1      69977
Name: business_age_bucket, dtype: int64

In [24]:
pd.crosstab(df['business_age_bucket'], df['ns_flag'], normalize='index')

ns_flag,0,1
business_age_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.921674,0.078326
1-3,0.912435,0.087565
3+,0.842192,0.157808


#### Website

In [25]:
df['website_flag'] = np.where(df['website'].isna(), 0, 1)
df['website_flag'].value_counts()

0    55520
1    31772
Name: website_flag, dtype: int64

In [26]:
pd.crosstab(df['website_flag'], df['ns_flag'], normalize='index')

ns_flag,0,1
website_flag,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.919561,0.080439
1,0.905609,0.094391


#### State

In [27]:
# state_df = pd.crosstab(df['business_address_state'], df['ns_flag'], normalize='index')
# hds_group = state_df[state_df[1] > df['ns_flag'].value_counts(normalize=True)[1]].index.to_list()
# df['state_group'] = np.where(df['business_address_state'].isin(hds_group), 'hds_group', 'lds_group')
# df['state_group'].value_counts()

In [28]:
col = 'business_address_state'
df[col] = df[col].str.lower()
tmp1 = pd.DataFrame(df[col].value_counts()).reset_index().rename(columns={col:'volume'})
tmp2 = pd.DataFrame(df.groupby([col]).ns_flag.mean()).reset_index().rename(columns={'ns_flag':'target_rate'})

tmp = pd.merge(tmp1, tmp2, left_on='index',right_on=col, how='inner')

tmp['vol_percent'] = tmp['volume'] * 100/df.shape[0]
tmp['target'] = np.round(tmp['volume'] * tmp['target_rate'] , 0)
tmp = tmp[[col]+['volume','vol_percent','target_rate','target']].sort_values(by=['vol_percent'], ascending=False)

In [29]:
high_ns_values = tmp[(tmp.vol_percent>2) & (tmp.target_rate>=df.ns_flag.mean())][col].to_list()
low_ns_values = tmp[(tmp.vol_percent>2) & (tmp.target_rate<df.ns_flag.mean())][col].to_list()

df[col] = df[col].str.lower()
idx_1 = df.index[df[col].isin(high_ns_values)].tolist() 
idx_2 = df.index[df[col].isin(low_ns_values)].tolist() 

# nulls
idx_nulls = df.index[df[col].isnull()].tolist()
idx_nulls.extend(df.index[df[col].isna()].tolist())
idx_nulls.extend(df.index[df[col]==''].tolist())
idx_nulls = list(set(idx_nulls))
idx_full = df.index.to_list()
# other
idx_other = list(set(idx_full) - set(idx_1+idx_2) - set(idx_nulls))

# Encoding into categories
df.loc[idx_1, col] = 'high_ns'
df.loc[idx_2, col] = 'low_ns'
df.loc[idx_other, col] = 'other'

In [30]:
pd.DataFrame(high_ns_values, columns=[col]).to_pickle(project_path+'models/high_ns_busi_state.pkl')
pd.DataFrame(low_ns_values, columns=[col]).to_pickle(project_path+'models/low_ns_busi_state.pkl')

In [31]:
df[df.business_address_state.isin(high_ns_values)].ns_flag.mean(), df[df.business_address_state.isin(low_ns_values)].ns_flag.mean(), df[~(df.business_address_state.isin(high_ns_values+low_ns_values))].ns_flag.mean()


(nan, nan, 0.08551757320258443)

In [32]:
tmp[tmp[col].isin(high_ns_values)].vol_percent.sum(), tmp[tmp[col].isin(high_ns_values)].target.sum()

(33.951564862759476, 3079.0)

In [33]:
tmp[~(tmp[col].isin(high_ns_values+low_ns_values))].vol_percent.sum(), tmp[~(tmp[col].isin(high_ns_values+low_ns_values))].target.sum()


(26.645053384044356, 2078.0)

In [34]:
tmp[tmp[col].isin(low_ns_values)].vol_percent.sum(), tmp[tmp[col].isin(low_ns_values)].target.sum()

(39.40338175319617, 2308.0)

In [35]:
# pd.DataFrame(hds_group, columns=['value']).to_pickle(project_path+'models/hds_group.pkl')

In [36]:
pd.crosstab(df[col], df['ns_flag'], normalize='index')

ns_flag,0,1
business_address_state,Unnamed: 1_level_1,Unnamed: 2_level_1
high_ns,0.89611,0.10389
low_ns,0.932899,0.067101
other,0.910658,0.089342


#### City

In [37]:
# city_df = pd.crosstab(df['business_address_city'], df['ns_flag'], normalize='index')
# hdc_group = city_df[city_df[1] > df['ns_flag'].value_counts(normalize=True)[1]].index.to_list()
# df['city_group'] = np.where(df['business_address_city'].isin(hdc_group), 'hdc_group', 'ldc_group')
# df['city_group'].value_counts()

In [38]:
col = 'business_address_city'
df[col] = df[col].str.lower()
tmp1 = pd.DataFrame(df[col].value_counts()).reset_index().rename(columns={col:'volume'})
tmp2 = pd.DataFrame(df.groupby([col]).ns_flag.mean()).reset_index().rename(columns={'ns_flag':'target_rate'})

tmp = pd.merge(tmp1, tmp2, left_on='index',right_on=col, how='inner')

tmp['vol_percent'] = tmp['volume'] * 100/df.shape[0]
tmp['target'] = np.round(tmp['volume'] * tmp['target_rate'] , 0)
tmp = tmp[[col]+['volume','vol_percent','target_rate','target']].sort_values(by=['vol_percent'], ascending=False)

In [39]:
high_ns_values = tmp[(tmp.vol_percent>1) & (tmp.target_rate>=df.ns_flag.mean())][col].to_list()
low_ns_values = tmp[(tmp.vol_percent>1) & (tmp.target_rate<df.ns_flag.mean())][col].to_list()

df[col] = df[col].str.lower()
idx_1 = df.index[df[col].isin(high_ns_values)].tolist() 
idx_2 = df.index[df[col].isin(low_ns_values)].tolist() 

# nulls
idx_nulls = df.index[df[col].isnull()].tolist()
idx_nulls.extend(df.index[df[col].isna()].tolist())
idx_nulls.extend(df.index[df[col]==''].tolist())
idx_nulls = list(set(idx_nulls))
idx_full = df.index.to_list()
# other
idx_other = list(set(idx_full) - set(idx_1+idx_2) - set(idx_nulls))

# Encoding into categories
df.loc[idx_1, col] = 'high_ns'
df.loc[idx_2, col] = 'low_ns'
df.loc[idx_other, col] = 'other'

In [40]:
pd.DataFrame(high_ns_values, columns=[col]).to_pickle(project_path+'models/high_ns_busi_city.pkl')
pd.DataFrame(low_ns_values, columns=[col]).to_pickle(project_path+'models/low_ns_busi_city.pkl')

In [41]:
df[df.business_address_city.isin(high_ns_values)].ns_flag.mean(), df[~(df.business_address_city.isin(high_ns_values+low_ns_values))].ns_flag.mean(), df[df.business_address_city.isin(low_ns_values)].ns_flag.mean()


(nan, 0.08551757320258443, nan)

In [42]:
# # save the city group
# file = 'hdc_group_list.pkl'
# path = project_path + 'data/'
# with open(path + file, 'wb') as f:
#     pickle.dump(hdc_group, f)

In [43]:
pd.crosstab(df[col], df['ns_flag'], normalize='index')

ns_flag,0,1
business_address_city,Unnamed: 1_level_1,Unnamed: 2_level_1
high_ns,0.86758,0.13242
low_ns,0.928956,0.071044
other,0.914453,0.085547


#### Current bank

In [44]:
hdb_group = ['bluevine', 'other-national-bank', 'td-ank', 'chase', 'usaa']
df['current_bank_group'] = np.where(df['current_bank'].isin(hdb_group), 'hdb_group', 'ldb_group')
df['current_bank_group'].value_counts()

ldb_group    80327
hdb_group     6965
Name: current_bank_group, dtype: int64

In [45]:
pd.crosstab(df['current_bank_group'], df['ns_flag'], normalize='index')

ns_flag,0,1
current_bank_group,Unnamed: 1_level_1,Unnamed: 2_level_1
hdb_group,0.862312,0.137688
ldb_group,0.919006,0.080994


#### Website is similar to Email Domain

In [46]:
df['website_processed'] = df['website'].fillna('')
df['website_email_domain_match'] = df.apply(lambda x: x.email_domain in x.website_processed, axis=1)
df['website_email_domain_match'].value_counts()

False    74204
True     13088
Name: website_email_domain_match, dtype: int64

In [47]:
pd.crosstab(df['website_email_domain_match'], df['ns_flag'], normalize='index')

ns_flag,0,1
website_email_domain_match,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.920409,0.079591
True,0.880883,0.119117


#### Business Pitch

In [48]:
# REPLACE_BY_SPACE_RE = re.compile('[/()]{}\[\]\|@,:!*]')
# BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')

# def cleaning_gen(data):
#     review = re.sub('[^a-zA-z]',' ',data)
#     review = review.lower()
#     review = BAD_SYMBOLS_RE.sub('',review)
#     review = REPLACE_BY_SPACE_RE.sub(' ', review)
#     doc = nlp(review)
#     tokens = []
#     for token in doc:
#         tokens.append(token)
#     tokens = [token.lemma_ for token in doc if token.pos_ in ['VERB','NOUN']]
#     tokens = [c for c in tokens if c not in stop_words]
#     tokens = [c for c in tokens if len(c)>1]
#     tokens = ' '.join(tokens)
#     return (tokens)

# df['business_pitch_lema_spacy'] = df['business_pitch'].apply(cleaning_gen)

In [49]:
# Define ngram and cutoff
ngram = 1
cutoff = 800

# NS
words = ' '.join([i for i in df[df.ns_flag == 1]['business_pitch_lema_spacy']]).split()
ngram_high_df = pd.Series(nltk.ngrams(words, ngram)).value_counts().rename_axis('ngrams').reset_index(name='high_dep_freq')

# NonNs
words = ' '.join([i for i in df[df.ns_flag == 0]['business_pitch_lema_spacy']]).split()
ngram_low_df = pd.Series(nltk.ngrams(words, ngram)).value_counts().rename_axis('ngrams').reset_index(name='low_dep_freq')

# Ngram Ratio 
ngram_df = ngram_high_df.merge(ngram_low_df, on='ngrams')
ngram_df['total_freq'] = ngram_df['high_dep_freq'] + ngram_df['low_dep_freq']
ngram_df['high_ratio'] = ngram_df['high_dep_freq']/ngram_df['total_freq']
ngram_df = ngram_df[ngram_df['total_freq'] > cutoff].sort_values(by='high_ratio', ascending=False)
ngram_tuple = ngram_df[ngram_df['high_ratio']>0.12]['ngrams']
bag_of_words = '|'.join(str(v) for v in ngram_tuple)
bag_of_words = bag_of_words.replace('(', '').replace(')', '').replace(',', '').replace("'", "")
bag_of_words

'strategy|firm|technology|datum|software|consulting|project|consult|film|practice|development|lease|investment|team|term|system|industry|manage|hold|security|management|hire|growth|basis|contract|partner|agency|value|operation|solution|estate|web|drive|organization|property|application|production|develop|contractor|content|process|rental|marketing|relate|therapy|space|rent|insurance|advertising'

In [50]:
# Selected BOW by eliminating few usual/common words
bag_of_words = ['software', 'firm', 'technology', 'consulting', 'strategy', 'consult', 'project', 'datum', 'practice'
                , 'investment', 'film', 'lease', 'development', 'hire', 'industry', 'management', 'research'
                , 'operation', 'term', 'system', 'contract', 'growth', 'partner', 'agency', 'estate', 'production'
                , 'therapy', 'property', 'contractor', 'web', 'marketing', 'insurance', 'lead', 'content', 'program']

In [51]:
df['bow_flag'] = np.where(df['business_pitch_lema_spacy'].str.contains('|'.join(bag_of_words)), 1, 0)
df['bow_flag'].value_counts()

0    50148
1    37144
Name: bow_flag, dtype: int64

In [52]:
pd.crosstab(df['bow_flag'], df['ns_flag'], normalize='index')

ns_flag,0,1
bow_flag,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.943946,0.056054
1,0.874704,0.125296


In [53]:
df['bow_flag'].value_counts()

0    50148
1    37144
Name: bow_flag, dtype: int64

In [54]:
df['bow_flag'].value_counts(normalize=True)

0    0.574486
1    0.425514
Name: bow_flag, dtype: float64

In [55]:
pd.DataFrame(bag_of_words, columns=['bow']).to_pickle(project_path+'models/business_pitch_bow.pkl')