In [None]:
import pickle
import warnings

import boto3
import numpy as np
import pandas as pd
import seaborn as sns
import snowflake.connector as sf
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import os
from dotenv import load_dotenv

load_dotenv("../../.env")
from common import configuration

In [None]:
# Notebook-wide preferences: pandas display, warning policy, seaborn theme.

pd.set_option('display.max_columns', None)  # never truncate wide frames
pd.options.display.float_format = lambda value: '%.3f' % value  # three decimals
warnings.filterwarnings(action='once')  # show each distinct warning a single time
sns.set(rc={'axes.facecolor': 'white', 'figure.facecolor': 'white'})
sns.set_style("darkgrid")

In [None]:
# Open the Snowflake session; all credentials are read from environment variables.
# NOTE(review): os.getenv returns None for a missing variable — confirm the .env file is loaded.
conn = sf.connect(
    account=os.getenv('SNOWFLAKE_HOST'),
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
)


def run_query(connection, query):
    """Execute a statement whose result set is not needed.

    Args:
        connection: DB-API style connection exposing ``cursor()``.
        query: SQL text to execute.

    Returns:
        None.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor is closed first.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query)
    finally:
        # Close even when execute() fails so the cursor is never leaked.
        cursor.close()


# NOTE(review): this cursor is not used or closed in the visible cells; the
# name is kept only in case a cell outside this view relies on it.
snowflakecursor = conn.cursor()

# Best-effort warehouse resume: a failure is reported instead of being
# silently swallowed, and the notebook still continues.
try:
    sql = 'alter warehouse {} resume'.format(os.getenv('SNOWFLAKE_WAREHOUSE'))
    run_query(conn, sql)
except Exception as resume_error:
    warnings.warn('could not resume warehouse: {}'.format(resume_error))

# Select the working database for all following queries.
sql = 'use database {}'.format(os.getenv('SNOWFLAKE_DATABASE'))
run_query(conn, sql)


In [None]:
# Load the headerless CSV (account number, prediction) and name its columns.
data = pd.read_csv("../../../data.csv", header=None)
data = data.rename(columns={0: 'BANK_ACCOUNT_NUMBER', 1: 'PREDICT'})
data = data[['BANK_ACCOUNT_NUMBER', 'PREDICT']]

# Accounts whose maximum PRODUCT_ID is 2502.
prod_id_cols = '''
select BANK_ACCOUNT_NUMBER, max(PRODUCT_ID) as PROD_ID from "LILI_ANALYTICS"."ODS"."MYSQL_DW_CUSTOMER_MONTHLY_NEW"
group by BANK_ACCOUNT_NUMBER
having PROD_ID = 2502 '''


prod_id = pd.read_sql(prod_id_cols, conn)
print(prod_id.dtypes)
print(data.dtypes)
# Align the join key dtype with the integer column read from the CSV.
prod_id['BANK_ACCOUNT_NUMBER'] = prod_id['BANK_ACCOUNT_NUMBER'].astype(str).astype(int)
data = data.merge(prod_id, on=['BANK_ACCOUNT_NUMBER'], how='left')
# reset_index returns a new frame, so the result has to be assigned
# (the original call discarded it and left the filtered index in place).
checker = data[data['PROD_ID'] == 2502].reset_index(drop=True)
checker

In [None]:
# Start of the feature window; the queries in this cell derive their date ranges from it.
date_from = '2021-05-01 00:00:00'


# Login count per bank account for [date_from, date_from + 1 month]; accounts
# with no logins get 0 through the LEFT JOIN + Zeroifnull.
# NOTE(review): the concatenation leaves a space on each side of the timestamp
# inside the SQL quotes — presumably tolerated by Snowflake, confirm.
customers_logins = '''
SELECT bank_account_id,
    SUM(Zeroifnull(login_count)) AS login_count
    FROM   (SELECT bank_account_id,
    customer_id
    FROM   dwh.fact_mysql_customer_monthly
    GROUP  BY 1,
            2) bank_to_customer
    LEFT JOIN (SELECT customer_id,
    COUNT(DISTINCT id) AS login_count
    FROM   "LILI_ANALYTICS_DEV"."ODS"."MYSQL_CUSTOMER_LOGIN"
    WHERE  create_time >= ' ''' + date_from + ''' '
    AND create_time <= Dateadd(month, 1,' ''' + date_from + ''' ')
    GROUP  BY customer_id) customers_login
    ON bank_to_customer.customer_id = customers_login.customer_id 
    GROUP BY 1
    '''
customer_login_per_date_range = pd.read_sql(customers_logins, conn)

# Target: per-account sum of positive 'PM'/'AD' transaction amounts dated in
# (date_from + 30 days, date_from + 60 days]. Exposed as TOTAL_MONEY_IN1 and
# renamed to LABEL further down.
second_table_cols = '''
        select BANK_ACCOUNT_ID, sum(TRANSACTION_AMOUNT) as total_money_in1
        from "LILI_ANALYTICS"."DWH"."FACT_MYSQL_ACCOUNT_TRANSACTION_ALL"
        where transaction_date>dateadd(day, 30, ' ''' + date_from + ''' ') and transaction_date<=dateadd(day, 60,' ''' + date_from + ''' ')
        and act_type in ('PM','AD') and transaction_amount>0
        group by BANK_ACCOUNT_ID
    '''
label = pd.read_sql(second_table_cols, conn)

# One row per bank_account_id with spend / money-in aggregates over the
# transactions LEFT JOINed for [date_from, date_from + 1 month].
# The did_* columns are count(distinct ... then 1), i.e. 0/1 flags.
# NOTE(review): period_end is date_from + 1 day although transactions run to
# date_from + 1 month — confirm this is intended (the TIME_FROM_* features
# are measured from period_end).
# NOTE(review): in Total_money_in / Total_money_in_Count, AND binds tighter
# than OR, so `transaction_amount > 1` only constrains the act_type='AD'
# branch — confirm.
financial_data_query = '''
    SELECT
        dw.bank_account_id,
        max(dw.account_active) as account_active,
        min(signup_date) as signup_date,
        dateadd(day, 1, ' ''' + date_from + ''' ') as period_end,
        max(transaction_date) as last_transaction_in_time,
        count(case when ata.rolling_balance < 0 then 1 else null end) as had_negative_balance,
        max(case when act_type='PM' then transaction_date else null end) as last_transaction_money_in_in_time,
        abs(sum(case when act_type in ('ST','VS','IS','VI', 'DB', 'SD', 'MP') and ata.type = 'W' then ata.transaction_amount else 0 end)) as ATM_sum,
        abs(sum(case when act_type in ('ST','VS','IS','VI', 'DB', 'SD', 'MP') and ata.type <> 'W' then ata.transaction_amount else 0 end)) as Swipe_sum,
        abs(sum(case when act_type in ('ST','VS','IS','VI', 'DB', 'SD', 'MP') then ata.transaction_amount else 0 end)) as Spend_sum,
        sum(case when act_type='PM' and ata.type='FM' then abs(ata.transaction_amount) else 0 end) - sum(case when (ata.type='FM' and act_type='AD' and details='Direct Deposit Return') then abs(ata.transaction_amount) else 0 end) as direct_deposit_sum,
        sum(case when act_type='PM' and (ata.type in ('VT','VA','VH','MX')) then abs(ata.transaction_amount) else 0 end) as direct_pay_sum,
        abs(sum(case when (act_type='PM' and ata.type='AC') and ata.transaction_amount > 0 then ata.transaction_amount else 0 end)) as ACH_sum,
        sum(case when act_type='PM' and (ata.type='OR') then abs(ata.transaction_amount) else 0 end) as Check_sum,
        sum(case when act_type='PM' and (ata.type in ('GT', 'GO', 'CE')) then abs(ata.transaction_amount) else 0 end) as Greendot_sum,
        sum(case when (act_type='AD' and details='Debit Card transfer') then abs(ata.transaction_amount) else 0 end) as Card_Deposit_sum,
        sum(case when (act_type='PM' and ata.type<>'C2') or (act_type='AD' and (details='Debit Card transfer' or ata.type='FM')) and transaction_amount > 1 then transaction_amount else null end) - sum(case when (ata.type='FM' and act_type='AD' and details='Direct Deposit Return') then abs(ata.transaction_amount) else 0 end) as Total_money_in,
        count(case when (act_type='PM' and ata.type<>'C2') or (act_type='AD' and (details='Debit Card transfer' or ata.type='FM')) and transaction_amount > 1 then transaction_amount  else null end) - count(case when (ata.type='FM' and act_type='AD' and details='Direct Deposit Return') then abs(ata.transaction_amount) else null end) as Total_money_in_Count,
        avg(ata.rolling_balance) as average_balance,
        count(distinct case when dds.type='PAYROLL' then 1 else null end) as did_payroll,
        count(distinct case when dds.type='Marketplace' then 1 else null end) as did_marketplace,
        count(distinct case when dds.type='FINANCIAL INSTITUTION' then 1 else null end) as did_financial_institution,
        count(distinct case when dds.type='Unemployment' then 1 else null end) as did_unemployment,
        count(distinct case when dds.type='Tax Refund' then 1 else null end) as did_tax_refund,
        max(pro_customer) as pro_customer
    FROM (SELECT bank_account_id, signup_date, account_active, max(pro_customer) as pro_customer 
            FROM DWH.fact_mysql_customer_monthly 
            GROUP BY 1,2,3) dw
    LEFT JOIN DWH.fact_mysql_account_transaction_all ata 
        ON dw.bank_account_id=ata.bank_account_id and ata.transaction_date >= ' ''' + date_from + ''' ' and ata.transaction_date <= dateadd(month, 1,  ' ''' + date_from + ''' ' ) 
    LEFT JOIN ODS.mysql_direct_deposit_sources dds 
        ON dds.merchant=ata.details
    GROUP BY 1
    ORDER by 1'''

# Per BANK_ACCOUNT_ID: the maximum account number plus a 0/1 flag that is 1
# when any row of the account has PRODUCT_ID = 20643.
added_bank_account_num_cols = '''
    SELECT BANK_ACCOUNT_ID, 
            MAX(BANK_ACCOUNT_NUMBER) as BANK_ACCOUNT_NUMBER, 
            MAX((CASE WHEN PRODUCT_ID = 20643 THEN 1 ELSE 0 END)) AS IS_PRO_SINCE_SIGNUP 
    FROM "LILI_ANALYTICS"."ODS"."MYSQL_DW_CUSTOMER_MONTHLY_NEW"
    GROUP BY BANK_ACCOUNT_ID
    '''

# CURRENT_BALANCE at each account's latest VALID_DATE that is not after
# date_from + 1 month (self-join on the per-account max date).
negative_balance_cols = '''
    select t.BANK_ACCOUNT_ID, t.CURRENT_BALANCE
        from "LILI_ANALYTICS_DEV"."ODS"."MYSQL_CUSTOMER_BALANCE_HISTORY" t
        inner join (
        select BANK_ACCOUNT_ID, max(case when VALID_DATE <= Dateadd(month, 1, ' ''' + date_from + ''' ') then VALID_DATE end) as MAXDATE
        from "LILI_ANALYTICS_DEV"."ODS"."MYSQL_CUSTOMER_BALANCE_HISTORY" tm
        group by BANK_ACCOUNT_ID
        ) tm on t.BANK_ACCOUNT_ID = tm.BANK_ACCOUNT_ID and t.VALID_DATE = tm.MAXDATE
        '''

# Run the remaining queries and assemble the per-account feature table.
# NOTE(review): negative_balance is loaded but not used in the visible cells.
negative_balance = pd.read_sql(negative_balance_cols, conn)
added_bank_account_num = pd.read_sql(added_bank_account_num_cols, conn)
financial_data = pd.read_sql(financial_data_query, conn)
table_all = financial_data.merge(customer_login_per_date_range, on=['BANK_ACCOUNT_ID'], how='left')
table_all = table_all.merge(added_bank_account_num, on=['BANK_ACCOUNT_ID'], how='left')

# DID_NONE is 1 only when none of the deposit-source flags is set.
deposit_source_flags = ['DID_PAYROLL', 'DID_MARKETPLACE', 'DID_FINANCIAL_INSTITUTION',
                        'DID_UNEMPLOYMENT', 'DID_TAX_REFUND']
table_all['DID_NONE'] = 1 - table_all[deposit_source_flags].max(axis=1)

# Whole days between the period end and each reference timestamp.
# The original passed format="%Y%m" / "%Y%m%" (the latter is malformed) and
# used .astype('timedelta64[D]'), which pandas >= 2 rejects; .dt.days gives
# the same floor-to-days value on every pandas version.
period_end = pd.to_datetime(table_all['PERIOD_END'])
for days_col, stamp_col in (('TIME_FROM_LAST_TRANSACTION', 'LAST_TRANSACTION_IN_TIME'),
                            ('TIME_FROM_LAST_MONEY_IN', 'LAST_TRANSACTION_MONEY_IN_IN_TIME'),
                            ('TIME_FROM_SIGNUP_DATE', 'SIGNUP_DATE')):
    elapsed = period_end - pd.to_datetime(table_all[stamp_col])
    table_all[days_col] = elapsed.dt.days.astype('float64')  # NaT -> NaN, zero-filled below

table_all = table_all.drop(['SIGNUP_DATE', 'PERIOD_END', 'LAST_TRANSACTION_IN_TIME'], axis=1)
table_all['AVG_MONEY_IN'] = np.where(table_all['TOTAL_MONEY_IN'] == 0.00, 0.00,
                                     table_all['TOTAL_MONEY_IN'] / table_all['TOTAL_MONEY_IN_COUNT'])

# Attach the target and zero-fill everything that is missing (LABEL included).
table_all = table_all.merge(label, on=['BANK_ACCOUNT_ID'], how='left')
table_all = table_all.rename(columns={'TOTAL_MONEY_IN1': 'LABEL'})
table_all = table_all.fillna(0)
table_all = table_all.reset_index(drop=True)
table_all = table_all[
        ['BANK_ACCOUNT_NUMBER', 'ACCOUNT_ACTIVE', 'PRO_CUSTOMER', 'TIME_FROM_SIGNUP_DATE', 'IS_PRO_SINCE_SIGNUP',
         'HAD_NEGATIVE_BALANCE', 'ATM_SUM', 'SWIPE_SUM',
         'SPEND_SUM', 'DIRECT_DEPOSIT_SUM', 'DIRECT_PAY_SUM',
         'ACH_SUM', 'CHECK_SUM', 'GREENDOT_SUM', 'CARD_DEPOSIT_SUM', 'TOTAL_MONEY_IN', 'TOTAL_MONEY_IN_COUNT',
         'AVERAGE_BALANCE',
         'DID_PAYROLL', 'DID_MARKETPLACE', 'DID_FINANCIAL_INSTITUTION', 'DID_UNEMPLOYMENT', 'DID_TAX_REFUND',
         'LOGIN_COUNT', 'DID_NONE', 'TIME_FROM_LAST_TRANSACTION', 'TIME_FROM_LAST_MONEY_IN', 'AVG_MONEY_IN', 'LABEL']]
table_all

In [None]:
# Cast the join key to int, then keep exactly the accounts listed in
# `checker` (right join), carrying their feature columns along.
account_number_text = table_all['BANK_ACCOUNT_NUMBER'].astype(str)
table_all['BANK_ACCOUNT_NUMBER'] = account_number_text.astype(int)
merger = pd.merge(table_all, checker, how='right', on=['BANK_ACCOUNT_NUMBER'])
merger

#Loading the necessary data

# From here on the window starts in June; the statements below rebuild
# customer_login_per_date_range for it and later reassign table_all.
date_from = '2021-06-01 00:00:00'


# Logins per bank account in [date_from, date_from + 30 days]. Inner join:
# accounts without logins are absent here and are zero-filled after the merge.
first_table_cols = '''
        select bank_account_id, count(distinct cl.id) as login_count
        from (select bank_account_id, customer_id from DWH.fact_mysql_customer_monthly group by 1,2) dw
        join "LILI_ANALYTICS_DEV"."ODS"."MYSQL_CUSTOMER_LOGIN" cl on cl.customer_id=dw.customer_id
        where create_time>=' ''' + date_from + ''' ' and create_time<=dateadd(day, 30,' '''+ date_from + ''' ')
        group by bank_account_id
'''
customer_login_per_date_range = pd.read_sql(first_table_cols, conn)

# Target: positive 'PM'/'AD' amounts per account in
# (date_from + 30 days, date_from + 60 days].
second_table_cols = '''
        select BANK_ACCOUNT_ID, sum(TRANSACTION_AMOUNT) as total_money_in1
        from "LILI_ANALYTICS"."DWH"."FACT_MYSQL_ACCOUNT_TRANSACTION_ALL"
        where transaction_date>dateadd(day, 30, ' ''' + date_from + ''' ') and transaction_date<=dateadd(day, 60,' ''' + date_from + ''' ')
        and act_type in ('PM','AD') and transaction_amount>0
        group by BANK_ACCOUNT_ID
    '''
bank_account_did_transactions = pd.read_sql(second_table_cols, conn)
    
    
# One row per (customer_id, bank_account_id) for active accounts, aggregating
# transactions in (date_from, date_from + 30 days]. The inner join drops
# accounts without transactions; HAVING keeps only Total_money_in > 500.
# NOTE(review): in Total_money_in / Total_money_in_Count, AND binds tighter
# than OR, so `transaction_amount > 1` only constrains the act_type='AD'
# branch — confirm.
third_table_cols = '''
    select
    dw.customer_id,
    dw.bank_account_id,
    min(signup_date) as signup_date,
    dateadd(day, 30,' '''+ date_from + ''' ') as period_end,
    max(transaction_date) as last_transaction_in_time,
    count(case when ata.rolling_balance < 0 then 1 else null end) as had_negative_balance,
    max(case when act_type='PM' then transaction_date else null end) as last_transaction_money_in_in_time,
    abs(sum(case when act_type in ('ST','VS','IS','VI', 'DB', 'SD', 'MP') and ata.type = 'W' then ata.transaction_amount else 0 end)) as ATM_sum,
    abs(sum(case when act_type in ('ST','VS','IS','VI', 'DB', 'SD', 'MP') and ata.type <> 'W' then ata.transaction_amount else 0 end)) as Swipe_sum,
    abs(sum(case when act_type in ('ST','VS','IS','VI', 'DB', 'SD', 'MP') then ata.transaction_amount else 0 end)) as Spend_sum,
    sum(case when act_type='PM' and ata.type='FM' then abs(ata.transaction_amount) else 0 end) - sum(case when (ata.type='FM' and act_type='AD' and details='Direct Deposit Return') then abs(ata.transaction_amount) else 0 end) as direct_deposit_sum,
    sum(case when act_type='PM' and (ata.type in ('VT','VA','VH','MX')) then abs(ata.transaction_amount) else 0 end) as direct_pay_sum,
    abs(sum(case when (act_type='PM' and ata.type='AC') and ata.transaction_amount > 0 then ata.transaction_amount else 0 end)) as ACH_sum,
    sum(case when act_type='PM' and (ata.type='OR') then abs(ata.transaction_amount) else 0 end) as Check_sum,
    sum(case when act_type='PM' and (ata.type in ('GT', 'GO', 'CE')) then abs(ata.transaction_amount) else 0 end) as Greendot_sum,
    sum(case when (act_type='AD' and details='Debit Card transfer') then abs(ata.transaction_amount) else 0 end) as Card_Deposit_sum,
    sum(case when (act_type='PM' and ata.type<>'C2') or (act_type='AD' and (details='Debit Card transfer' or ata.type='FM')) and transaction_amount > 1 then transaction_amount else null end) - sum(case when (ata.type='FM' and act_type='AD' and details='Direct Deposit Return') then abs(ata.transaction_amount) else 0 end) as Total_money_in,
    count(case when (act_type='PM' and ata.type<>'C2') or (act_type='AD' and (details='Debit Card transfer' or ata.type='FM')) and transaction_amount > 1 then transaction_amount  else null end) - count(case when (ata.type='FM' and act_type='AD' and details='Direct Deposit Return') then abs(ata.transaction_amount) else null end) as Total_money_in_Count,
    avg(ata.rolling_balance) as average_balance,
    count(distinct case when dds.type='PAYROLL' then 1 else null end) as did_payroll,
    count(distinct case when dds.type='Marketplace' then 1 else null end) as did_marketplace,
    count(distinct case when dds.type='FINANCIAL INSTITUTION' then 1 else null end) as did_financial_institution,
    count(distinct case when dds.type='Unemployment' then 1 else null end) as did_unemployment,
    count(distinct case when dds.type='Tax Refund' then 1 else null end) as did_tax_refund,
    max(pro_customer) as pro_customer
    from (select customer_id, bank_account_id, signup_date, account_active, max(pro_customer) as pro_customer from DWH.fact_mysql_customer_monthly group by 1,2,3,4) dw
    join DWH.fact_mysql_account_transaction_all ata on dw.bank_account_id=ata.bank_account_id and ata.transaction_date > ' ''' + date_from + ''' ' and ata.transaction_date <= dateadd(day, 30, ' ''' + date_from + ''' ') 
    left join ODS.mysql_direct_deposit_sources dds on dds.merchant=ata.details
    where account_active=1
    group by 1, 2
    having Total_money_in>500
    order by 1'''

#added_bank_account_num_cols = '''
#select BANK_ACCOUNT_ID, max(BANK_ACCOUNT_NUMBER) as BANK_ACCOUNT_NUMBER from "LILI_ANALYTICS"."ODS"."MYSQL_DW_CUSTOMER_MONTHLY_NEW"
#group by BANK_ACCOUNT_ID
#'''
#added_bank_account_num = pd.read_sql(added_bank_account_num_cols, conn)


# Build the June feature table: aggregates + login counts + target.
third_table = pd.read_sql(third_table_cols, conn)
first_join = third_table.merge(customer_login_per_date_range, on=['BANK_ACCOUNT_ID'], how='left')
table_all = first_join.merge(bank_account_did_transactions, on=['BANK_ACCOUNT_ID'], how='left')
#table_all = table_all.merge(added_bank_account_num, on=['BANK_ACCOUNT_ID'], how='left')
table_all = table_all.rename(columns={'TOTAL_MONEY_IN1': 'LABEL'})
for zero_filled_col in ('LABEL', 'LOGIN_COUNT', 'TOTAL_MONEY_IN'):
    table_all[zero_filled_col] = table_all[zero_filled_col].fillna(0)
table_all['DID_NONE'] = (1-table_all[['DID_PAYROLL','DID_MARKETPLACE','DID_FINANCIAL_INSTITUTION','DID_UNEMPLOYMENT', 'DID_TAX_REFUND']].max(axis=1))
table_all = table_all[table_all['PRO_CUSTOMER']==1]
table_all = table_all.reset_index(drop=True)

# Whole days from each reference timestamp to the period end.
# The original passed format="%Y%m" / "%Y%m%" (the latter is malformed) and
# used .astype('timedelta64[D]'), which pandas >= 2 rejects; .dt.days gives
# the same floor-to-days value on every pandas version.
period_end = pd.to_datetime(table_all['PERIOD_END'])
for days_col, stamp_col in (('TIME_FROM_LAST_TRANSACTION', 'LAST_TRANSACTION_IN_TIME'),
                            ('TIME_FROM_LAST_MONEY_IN', 'LAST_TRANSACTION_MONEY_IN_IN_TIME')):
    elapsed = period_end - pd.to_datetime(table_all[stamp_col])
    table_all[days_col] = elapsed.dt.days.astype('float64')

# table_all2 keeps the full frame (ids, raw dollar LABEL, and the two
# TIME_FROM_* columns added above) before the identifier columns are dropped.
table_all2 = table_all
table_all = table_all.drop(['SIGNUP_DATE','CUSTOMER_ID','BANK_ACCOUNT_ID', 'PERIOD_END', 'LAST_TRANSACTION_IN_TIME',
                            'LAST_TRANSACTION_MONEY_IN_IN_TIME', 'PRO_CUSTOMER'], axis=1)
table_all['AVG_MONEY_IN'] = np.where(table_all['TOTAL_MONEY_IN']==0.00, 0.00, table_all['TOTAL_MONEY_IN']/table_all['TOTAL_MONEY_IN_COUNT'])
table_all

# NOTE: this rebinds `checker` (previously the CSV-derived frame).
checker = third_table[third_table['PRO_CUSTOMER']==1]
checker.shape

In [None]:
# Bucket the raw LABEL amount into classes 0..4:
#   x < 20            -> 0
#   20   <= x < 1065  -> 1
#   1065 <= x < 2090  -> 2
#   2090 <= x < 3940  -> 3
#   x >= 3940         -> 4
# Missing labels stay NaN here and become class 0 through the fillna below.
LABEL_BIN_EDGES = [20, 1065, 2090, 3940]

# Work on a copy: the original aliased `merger` and rewrote LABEL in place
# with chained `.loc` assignments, so re-running the cell re-bucketed
# already-bucketed values.
table_ready = merger.copy()

raw_label = pd.to_numeric(table_ready['LABEL'], downcast="float")
label_class = np.digitize(raw_label, LABEL_BIN_EDGES)  # edge[i-1] <= x < edge[i]
table_ready['LABEL'] = np.where(raw_label.isna(), np.nan, label_class).astype('float32')

# Class sizes: total of labelled rows, then the shape of each class subset.
a, b, c, d, e = (table_ready[table_ready['LABEL'] == class_id] for class_id in range(5))

print(a.shape[0]+b.shape[0]+c.shape[0]+d.shape[0]+e.shape[0])
for class_frame in (a, b, c, d, e):
    print(class_frame.shape)


table_ready = table_ready.fillna(0)
# .copy() so the assignment below writes to an independent frame.
table_ready = table_ready[
        ['ACCOUNT_ACTIVE', 'PRO_CUSTOMER', 'TIME_FROM_SIGNUP_DATE', 'IS_PRO_SINCE_SIGNUP',
         'HAD_NEGATIVE_BALANCE', 'ATM_SUM', 'SWIPE_SUM',
         'SPEND_SUM', 'DIRECT_DEPOSIT_SUM', 'DIRECT_PAY_SUM',
         'ACH_SUM', 'CHECK_SUM', 'GREENDOT_SUM', 'CARD_DEPOSIT_SUM', 'TOTAL_MONEY_IN', 'TOTAL_MONEY_IN_COUNT',
         'AVERAGE_BALANCE',
         'DID_PAYROLL', 'DID_MARKETPLACE', 'DID_FINANCIAL_INSTITUTION', 'DID_UNEMPLOYMENT', 'DID_TAX_REFUND',
         'LOGIN_COUNT', 'DID_NONE', 'TIME_FROM_LAST_TRANSACTION', 'TIME_FROM_LAST_MONEY_IN', 'AVG_MONEY_IN', 'LABEL']].copy()


#NOTE! THIS SPECIFIC CHANGE IS ONLY FOR THIS CASE!!!
# Force the flag to 1 wherever it is 0.
table_ready.loc[table_ready['IS_PRO_SINCE_SIGNUP'] == 0, 'IS_PRO_SINCE_SIGNUP'] = 1

#-------------------------------

table_ready

In [None]:
#Let's modelize!

# Step one: separate the target from the predictors and move both to NumPy.

# Target vector: the values we want to predict.
labels = np.array(table_ready['LABEL'])
# Predictor columns: everything except LABEL.
feature_frame = table_ready.drop(columns='LABEL')
# Column names, kept for later reference.
feature_list = feature_frame.columns.tolist()
# Predictor matrix as a NumPy array.
features = np.array(feature_frame)

In [None]:
features.shape

# Identifiers handed to the project's Config object; no live Snowflake or S3
# connection is attached.
BUCKET = 'lili-ml-tests'
DAG = 'test-dag'
EXECUTION_DATE = '2020-07-11T07:19:58.478946+00:00'
config = configuration.Config(
    dag=DAG,
    execution_date=EXECUTION_DATE,
    stages_bucket=BUCKET,
    snowflake_connection=None,
    s3_connection=None,
)

In [None]:
# Load the pickled model. The context manager closes the file handle, which
# the original open(...) call left open.
# SECURITY: pickle.load executes arbitrary code from the file — only load
# dumps that come from a trusted source.
with open('../../model_dumps/ODmodel_2021-08-02T08:11:52.106906.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
type(loaded_model)

#making the classes more balanced using over-sampling (SMOTE)
# NOTE(review): `train_labels`, `train_features` and `SMOTE` are not defined
# or imported in the visible part of the notebook; this section fails on a
# fresh kernel unless they come from elsewhere — confirm or remove.


def _bucket_positions(label_seq):
    """Return five lists of positions: labels -1, -2, -3, -4, then all others."""
    positions = [[], [], [], [], []]
    for pos in range(0, len(label_seq)):
        if label_seq[pos] == -1:
            positions[0].append(pos)
        elif label_seq[pos] == -2:
            positions[1].append(pos)
        elif label_seq[pos] == -3:
            positions[2].append(pos)
        elif label_seq[pos] == -4:
            positions[3].append(pos)
        else:
            positions[4].append(pos)
    return positions


def _print_bucket_sizes(positions):
    """Print one 'bucket N has K' line per bucket."""
    for number, members in enumerate(positions, start=1):
        print("bucket " + str(number) + " has " + str(len(members)))


buck1, buck2, buck3, buck4, buck5 = _bucket_positions(train_labels)
_print_bucket_sizes([buck1, buck2, buck3, buck4, buck5])

print("after SMOTE:")

sm = SMOTE(random_state=42)
train_feat_res, train_lbl_res = sm.fit_resample(train_features, train_labels)

buck1, buck2, buck3, buck4, buck5 = _bucket_positions(train_lbl_res)
_print_bucket_sizes([buck1, buck2, buck3, buck4, buck5])


#making the classes more balanced using under-sampling

buck1, buck2, buck3, buck4, buck5 = _bucket_positions(train_labels)
_print_bucket_sizes([buck1, buck2, buck3, buck4, buck5])

# Down-sample the first and last bucket to at most UNDERSAMPLE_SIZE rows.
# Seeded so the subset is reproducible; capped so a bucket smaller than the
# target no longer raises.
UNDERSAMPLE_SIZE = 1600
undersampler = np.random.RandomState(42)
bucket_1_r = undersampler.choice(buck1, min(UNDERSAMPLE_SIZE, len(buck1)), replace=False)
bucket_5_r = undersampler.choice(buck5, min(UNDERSAMPLE_SIZE, len(buck5)), replace=False)

# astype(int): an empty bucket would otherwise turn the indices into floats.
train_label_indices = np.concatenate((bucket_1_r, buck2, buck3, buck4, bucket_5_r)).astype(int)
train_labels_und = [train_labels[pos] for pos in train_label_indices]
train_features_und = [train_features[pos] for pos in train_label_indices]

print("undersampling...")

buck1, buck2, buck3, buck4, buck5 = _bucket_positions(train_labels_und)
_print_bucket_sizes([buck1, buck2, buck3, buck4, buck5])


print(len(train_features_und))
print(len(train_labels_und))

In [None]:
# Score the feature matrix with the loaded model, then translate the first
# element of every prediction row into a class index 0..4.
predictions = loaded_model.predict(config, features)

# raw model output value -> class index
predictions_to_overdraft = dict(zip((0, 20, 40, 60, 100), range(5)))

preds = [predictions_to_overdraft[row[0]] for row in predictions]



In [None]:
# Class probabilities for the same feature rows. `clf` is never defined in
# this notebook; the loaded model is used instead, matching the later
# predict_proba calls.
preds_proba = loaded_model.predict_proba(features)
preds_proba

In [None]:
# Class shares among rows whose first-column probability exceeds 0.3 (shown),
# and class shares over all rows (`l`).
# NOTE(review): the +5 shift looks written for labels in -5..-1 — confirm.
high_first_prob = preds_proba[:, 0] > 0.3
t = labels[high_first_prob].astype(int) + 5
l = np.bincount(labels.astype(int) + 5) / len(labels)
np.bincount(t) / len(t)

In [None]:
# Number of scored rows.
lenth = len(predictions)
print(lenth)

In [None]:
print("Accuracy:",metrics.accuracy_score(preds, labels))

# Share of rows where the predicted class is >= the true class.
# np.mean of the boolean mask replaces counts[1]/(counts[0]+counts[1]), which
# raised IndexError whenever the mask was all True or all False.
comparush = (np.asarray(preds) >= labels)
unique, counts = np.unique(comparush, return_counts=True)
#the current algorithm gives 85.5% of clients overdraft less or equal to what they should
("the predictions that were at least equal or worse are " + str(np.mean(comparush)))


In [None]:
#checking improved accuracy after fine-tuning
preds_proba = loaded_model.predict_proba(features)
predictions = loaded_model.predict(features)
# Independent copy: the original `predictions2 = predictions` aliased the same
# object, so every adjustment also rewrote `predictions`.
predictions2 = np.array(predictions)
for i in range(0, len(preds_proba)):
    chosen_cat = np.argmax(preds_proba[i])-5
    prob       = max(preds_proba[i])
    # Low-confidence pick of the first class: fall back to the best of the rest.
    if prob<=0.55 and chosen_cat==-5:
        predictions2[i] = np.argmax(preds_proba[i][1:5])-4
#    if prob<=0.49 and chosen_cat==-4:
#        predictions2[i] = np.argmax([preds_proba[i][2], preds_proba[i][3], preds_proba[i][4]])-3
#    if prob<=0.36 and chosen_cat==-3:
#        predictions2[i] = np.argmax([preds_proba[i][3], preds_proba[i][4]])-2

# Same two-line report for the untouched and the adjusted predictions.
# np.mean(mask) stays defined when the mask is all True or all False.
for caption, scored in (("Original Accuracy:", predictions), ("Altered Accuracy:", predictions2)):
    print(caption, metrics.accuracy_score(scored, labels))
    comparush = (scored>=labels)
    unique, counts = np.unique(comparush, return_counts=True)
    print("the predictions that were at least equal or worse are " + str(np.mean(comparush)))


In [None]:
##### confusion matrix
# Heatmap of true labels against `preds`, followed by the per-class report.
overdraft_ticks = ['100','60','40','20','No']
preds_matrix = confusion_matrix(labels, preds)
ax = sns.heatmap(preds_matrix, annot=True, fmt=".2f", cmap="Blues")
ax.xaxis.set_ticklabels(overdraft_ticks)
ax.yaxis.set_ticklabels(overdraft_ticks)

print(classification_report(labels, preds, target_names=['100','60','40','20','NO']))

In [None]:
##### confusion matrix
# Same view for the adjusted predictions (`predictions2`).
overdraft_ticks = ['100','60','40','20','No']
adjusted_matrix = confusion_matrix(labels, predictions2)
ax = sns.heatmap(adjusted_matrix, annot=True, fmt=".2f", cmap="Blues")
ax.xaxis.set_ticklabels(overdraft_ticks)
ax.yaxis.set_ticklabels(overdraft_ticks)
np.histogram(preds_proba[:,3])  # result is not stored or displayed
print(classification_report(labels, predictions2, target_names=['100','60','40','20','NO']))

In [None]:
# Sorted raw LABEL amounts of the rows predicted as -1, then one hard-coded
# position of that sorted array.
labels2 = table_all2['LABEL']
numbers = np.sort([labels2[pos] for pos in range(0, len(predictions)) if predictions[pos] == -1])
numbers[2418]

In [None]:
# Count predictions per class code (-1 .. -5) and print counts, then shares.
prediction_values = np.asarray(predictions)
sum1, sum2, sum3, sum4, sum5 = (
    int(np.count_nonzero(prediction_values == class_code))
    for class_code in (-1, -2, -3, -4, -5)
)

# Captions paired with their counts; the fourth caption previously read
# "between 1400$ and 200$".
bucket_report = [
    ("under 500$ is ", sum1),
    ("between 500$ and 800$ is ", sum2),
    ("between 800$ and 1400$ is ", sum3),
    ("between 1400$ and 2000$ is ", sum4),
    ("more than 2000$ is ", sum5),
]

for caption, total in bucket_report:
    print(caption + str(total))


print("ratios")

for caption, total in bucket_report:
    print(caption + str(total/len(predictions)))



In [None]:
# Positions predicted as -1, how many there are, and the matching rows.
indices = [pos for pos in range(0, len(predictions)) if predictions[pos] == -1]
count = len(indices)

print(count)

table_all.iloc[indices]

In [None]:
# Distribution of the raw LABEL amounts: 50-bin histogram ...
table_all2['LABEL'].hist(bins=50)

# ... and summary statistics with percentiles in 2% steps (51 points on [0, 1]).
table_all2['LABEL'].describe(np.linspace(0,1,51))

In [None]:
# Share of labels strictly below -1.
# Uses a dedicated name: the original accumulated into `sum`, which shadowed
# the builtin for the rest of the kernel session.
share_below_minus_one = int(np.count_nonzero(np.asarray(labels) < -1)) / len(labels)

print(share_below_minus_one)

In [None]:
# Confusion matrix with `labels` as the first (row) argument and `predictions` as the second (column) argument.
cnf_mat = confusion_matrix(labels, predictions)

In [None]:
# Weighted penalty over confusion-matrix cells below the diagonal
# (row index > column index), reported as a negative number.
# Accumulates into `loss` rather than `sum`, so the builtin is not shadowed.
lower_cell_penalty = {
    (1, 0): 100,
    (2, 0): 150, (2, 1): 50,
    (3, 0): 180, (3, 1): 80, (3, 2): 30,
}
loss = 0
for (i, j), penalty in lower_cell_penalty.items():
    loss = loss - penalty*cnf_mat[i][j]

print("loss is " + str(loss))

In [None]:
# Weighted penalty over confusion-matrix cells above the diagonal
# (row index < column index), reported as a positive number.
# Accumulates into `loss` rather than `sum`, so the builtin is not shadowed.
upper_cell_penalty = {
    (0, 1): 100, (0, 2): 150, (0, 3): 180,
    (1, 2): 50, (1, 3): 80,
    (2, 3): 30,
}
loss = 0
for (i, j), penalty in upper_cell_penalty.items():
    loss = loss + penalty*cnf_mat[i][j]

print("loss is " + str(loss))

In [None]:
# Mean absolute distance between true and predicted class.
# np.sum replaces the manual accumulator that was named `sum` and shadowed
# the builtin.
diff = abs(labels - predictions)
mean_abs_error = float(np.sum(diff)) / len(labels)
print(mean_abs_error)

In [None]:
# Average absolute class distance over the misclassified rows only.
# Avoids the `sum` accumulator that shadowed the builtin and reports NaN,
# instead of dividing by zero, when there is no miss at all.
diff = abs(labels - predictions)
count = int(np.count_nonzero(diff))
total_error = float(np.sum(diff))

print(total_error/count if count else float('nan'))

In [None]:
# For rows whose label is -2: histogram of the predicted class, where
# prediction p lands in slot (-p - 1).
count = [0] * 5
for pos in range(0, len(labels)):
    if labels[pos] == -2:
        slot = -1*int(predictions[pos]) - 1
        count[slot] += 1
print(count)

In [None]:
# Summary statistics of the raw LABEL amounts with percentiles in 2% steps.
table_all2['LABEL'].describe(np.linspace(0,1,51))

In [None]:
# Display the full June frame (NOTE(review): consider .head() to keep the output small).
table_all2

In [None]:
# LABEL column of table_all2, used by the cells below as the reference amount per row.
truth = table_all2['LABEL']

In [None]:
# Spot check: class label at hard-coded position 12634.
labels[12634]

In [None]:
# Spot check: LABEL value stored under index 12634.
truth[12634]

In [None]:
# For every predicted class -2..-5, count the rows whose reference amount is
# below that class's limit (20 / 40 / 60 / 100) and add up the shortfall.
money = money2 = money3 = money4 = 0
count = count2 = count3 = count4 = 0
count_yes = 0
count_no  = 0
predictions = loaded_model.predict(features)

for pos in range(0, len(predictions)):
    predicted = predictions[pos]
    if predicted == -2:
        if truth[pos] < 20:
            count += 1
            money += 20 - truth[pos]
    elif predicted == -3:
        if truth[pos] < 40:
            count2 += 1
            money2 += 40 - truth[pos]
    elif predicted == -4:
        if truth[pos] < 60:
            count3 += 1
            money3 += 60 - truth[pos]
    elif predicted == -5:
        if truth[pos] < 100:
            count4 += 1
            print(truth[pos])
            money4 += 100 - truth[pos]


for tally in (count, count2, count3, count4):
    print(tally)
print(money+money2+money3+money4)

In [None]:
# Recompute the class probabilities for the feature matrix.
preds_proba = loaded_model.predict_proba(features)

In [None]:
# For each predicted class, the share of rows whose reference amount reaches
# that class's limit. One loop replaces four pasted copies; it iterates over
# `predictions` itself (the copies used len(predictions2) while indexing
# predictions) and prints NaN instead of dividing by zero when a class is
# never predicted.
for class_code, limit in ((-2, 20), (-3, 40), (-4, 60), (-5, 100)):
    count = 0
    count2 = 0
    for i in range(0, len(predictions)):
        if predictions[i] == class_code:
            count = count + 1
            if truth[i] >= limit:
                count2 = count2 + 1
    print(count2/count if count else float('nan'))


# Values recorded from an earlier run:
#0.8540344514959202
#0.9095149253731343
#0.9273109243697479
#0.879375

In [None]:
# Start from the model's hard predictions and downgrade low-confidence "-2"
# predictions to "-1" when "-1" is the runner-up class, then recompute the
# total shortfall on the adjusted predictions.
#
# `count` now records how many predictions were overridden; previously it was
# initialised and printed but never incremented, so it always printed 0.
# Column k of preds_proba is treated as category k - 5 (assumes the columns
# are ordered -5..-1 — confirm against the model's class order).
count = 0
predictions2 = loaded_model.predict(features)
for i in range(0, len(preds_proba)):
    row = preds_proba[i]
    chosen_cat = np.argmax(row) - 5
    prob = max(row)
    if chosen_cat == -2 and prob < 0.441:
        # Candidates are every column except column 3 (the chosen "-2").
        # Position 3 of this list is column 4, i.e. category -1.
        runner_up_position = np.argmax([row[0], row[1], row[2], row[4]])
        if runner_up_position == 3:
            predictions2[i] = -1
            count = count + 1


# Total shortfall (amount - truth) over rows whose true LABEL is below the
# amount tied to the adjusted prediction.
money = 0
for i in range(0, len(predictions2)):
    if predictions2[i] == -2 and truth[i] < 20:
        money = money + (20 - truth[i])
    if predictions2[i] == -3 and truth[i] < 40:
        money = money + (40 - truth[i])
    if predictions2[i] == -4 and truth[i] < 60:
        money = money + (60 - truth[i])
    if predictions2[i] == -5 and truth[i] < 100:
        money = money + (100 - truth[i])
print(count)
print(money)
            


In [None]:
# Re-derive the hard predictions, then apply three low-confidence override
# rules. Column k of preds_proba is treated as category k - 5.
predictions2 = loaded_model.predict(features)
for i, row in enumerate(preds_proba):
    chosen_cat = np.argmax(row) - 5
    prob = max(row)

    if chosen_cat == -1:
        # Weak "-1": fall back to the best of columns 0..3 (categories -5..-2),
        # but only accept -2 or -3.
        if prob < 0.41:
            pred = np.argmax(row[0:4]) - 5
            if pred in (-2, -3):
                predictions2[i] = pred
    elif chosen_cat == -5:
        # Weak "-5": fall back to the best of columns 1..4 (categories -4..-1),
        # but only accept -3 or -2.
        if prob < 0.5:
            pred = np.argmax(row[1:5]) - 4
            if pred in (-3, -2):
                predictions2[i] = pred
    elif chosen_cat == -4:
        # Weak "-4": fall back to the best of columns 2..4 (categories -3..-1),
        # but only accept -3 or -2.
        if prob < 0.3:
            pred = np.argmax(row[2:5]) - 3
            if pred in (-3, -2):
                predictions2[i] = pred


# How many adjusted predictions are -5, and how many of those are labelled -5.
count = 0
count2 = 0
for j, adjusted in enumerate(predictions2):
    if adjusted != -5:
        continue
    count += 1
    if labels[j] == -5:
        count2 += 1


# Total shortfall (amount - truth) on the adjusted predictions.
shortfall_rules = ((-2, 20), (-3, 40), (-4, 60), (-5, 100))
money = 0
for i in range(len(predictions2)):
    for category, amount in shortfall_rules:
        if predictions2[i] == category and truth[i] < amount:
            money += amount - truth[i]


print(money)
print(count)
print(count2/count)

In [None]:
# Among rows whose most probable category is -1:
#   count  = rows with truth > 100
#   count3 = rows with AVERAGE_BALANCE > 100 and SWIPE_SUM > 100
#   count4 = the count3 rows that also have truth > 100
# NOTE(review): this cell reads `table_all` while neighbouring cells use
# `table_all2` — confirm both frames share the same row order.
count = 0
count2 = 0

count3 = 0
count4 = 0
table_rel = table_all[['ATM_SUM','SWIPE_SUM','SPEND_SUM','DIRECT_DEPOSIT_SUM','DIRECT_PAY_SUM','AVERAGE_BALANCE', 'DID_PAYROLL']]
predictions2 = loaded_model.predict(features)

average_balance = table_rel['AVERAGE_BALANCE']
swipe_sum = table_rel['SWIPE_SUM']
for i, row in enumerate(preds_proba):
    chosen_cat = np.argmax(row) - 5
    prob = max(row)
    if chosen_cat != -1:
        continue
    if truth[i] > 100:
        count += 1
    if average_balance.iloc[i] > 100 and swipe_sum.iloc[i] > 100:
        count3 += 1
        if truth[i] > 100:
            count4 += 1

# Share of the high-balance / high-swipe "-1" rows whose truth exceeds 100,
# followed by the number of "-1" rows whose truth exceeds 100.
print(count4/count3)
print(count)

In [None]:
# Rows predicted as -1 whose true LABEL exceeds 500: first for the adjusted
# predictions (predictions2), then for the raw model predictions.
count = sum(
    1
    for i in range(len(predictions2))
    if predictions2[i] == -1 and truth[i] > 500
)
print(count)

count = sum(
    1
    for i in range(len(predictions))
    if predictions[i] == -1 and truth[i] > 500
)
print(count)

In [None]:
# Per-category breakdown of under-shooting predictions: for every row whose
# true LABEL is below the amount tied to its predicted category
# (-2 -> 20, -3 -> 40, -4 -> 60, -5 -> 100), print the row position, count the
# row and accumulate the shortfall (amount - truth).
#
# Fixes over the previous version of this cell:
#   * the final summary printed only the category -2 shortfall (`money`) next
#     to the head-count of all four categories; it now prints the combined
#     shortfall, matching the head-count;
#   * the loop bound used len(predictions) while indexing predictions2.
money = 0
money2 = 0
money3 = 0
money4 = 0


count = 0
count2 = 0
count3 = 0
count4 = 0
# Not used in this cell; kept in case other cells read them.
count_yes = 0
count_no  = 0
predictions2 = loaded_model.predict(features)

for i in range(0, len(predictions2)):
    if predictions2[i] == -2 and truth[i] < 20:
        print("index " + str(i) + " predictions " + str(predictions2[i]))
        count = count + 1
        money = money + (20 - truth[i])
    if predictions2[i] == -3 and truth[i] < 40:
        print("index " + str(i))
        count2 = count2 + 1
        money2 = money2 + (40 - truth[i])
    if predictions2[i] == -4 and truth[i] < 60:
        print("index " + str(i))
        count3 = count3 + 1
        money3 = money3 + (60 - truth[i])
    if predictions2[i] == -5 and truth[i] < 100:
        print("index " + str(i))
        count4 = count4 + 1
        money4 = money4 + (100 - truth[i])


# Count and shortfall per category, in the order -2, -3, -4, -5.
print(count)
print(money)
print(count2)
print(money2)
print(count3)
print(money3)
print(count4)
print(money4)

# Combined shortfall and head-count over all four categories.
total_money = money + money2 + money3 + money4
print(str(total_money) + " and number of people is " + str(count+count2+count3+count4))

In [None]:
# First pass: rows with exactly one money-in event, showing prediction,
# label and true value. Second pass: every row, showing prediction, label and
# the money-in count.
money_in_count = table_all2['TOTAL_MONEY_IN_COUNT']

for i, pred in enumerate(predictions):
    if money_in_count.iloc[i] != 1:
        continue
    print("prediction: " + str(pred) + " and real is " + str(labels[i]) + " and truth is " + str(truth[i]))
print("next category")

for i, pred in enumerate(predictions):
    print("prediction: " + str(pred) + ", real is " + str(labels[i]) + " and total count is " + str(money_in_count.iloc[i]))



In [None]:
# Look up the row(s) for one specific account (the id is compared as a string).
is_target_account = table_all2['BANK_ACCOUNT_ID'] == '183467'
table_all2.loc[is_target_account]

In [None]:
# Print the account id of every row predicted as -5 whose label is -2 or higher.
account_ids = table_all2['BANK_ACCOUNT_ID']
for i, pred in enumerate(predictions):
    if pred == -5 and labels[i] >= -2:
        print(account_ids.iloc[i])

In [None]:
# Inspect the row at position 106 of table_all2.
table_all2.iloc[106]

In [None]:
# Display table_all2 again for visual inspection.
table_all2

In [None]:
# Count the predictions in each of the categories -2, -3, -4, -5 and print the
# total obtained by weighting each count with the amount tied to its category
# (20, 40, 60 and 100 respectively).
count = count2 = count3 = count4 = 0

for pred in predictions:
    if pred == -2:
        count += 1
    elif pred == -3:
        count2 += 1
    elif pred == -4:
        count3 += 1
    elif pred == -5:
        count4 += 1


print(count*20 + count2*40 + count3*60 + count4*100)