In [1]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from math import sqrt 
import pickle 


In [2]:
#load in csv file 

df = pd.read_csv('/Users/mprasad4/Portfolio_Health_Score-V1.csv')
df.shape


  df = pd.read_csv('/Users/mprasad4/Portfolio_Health_Score-V1.csv')


(735311, 39)

In [3]:
#declare the list of features 

#features= [ 'product', 'application_channel','country', 'business_form','sic_group','mcc', 'location_state' ,'ban_control', 'credit_score','uw_fraud_score','productgroup','tenure_in_days','active_merchant','average_trns_size','decline_rate','decline_count','approved_count','auth_count','auth_amount','ach_count','ach_amount','return_count','return_amount','return_rate','ach_loss','cc_count','cc_amount', 'cb_count','cb_amount','cb_rate','cc_loss']

features= [  'country', 'business_form','sic_group', 'location_state' ,'ban_control', 'credit_score','uw_fraud_score','productgroup','tenure_in_days','active_merchant','average_trns_size','decline_count','approved_count','auth_count','auth_amount','ach_count','ach_amount','return_count','return_amount','ach_loss','cc_count','cc_amount', 'cb_count','cb_amount','cc_loss']

target= 'close_reason_new'

df[features].dtypes


country               object
business_form         object
sic_group             object
location_state        object
ban_control             bool
credit_score         float64
uw_fraud_score       float64
productgroup          object
tenure_in_days       float64
active_merchant       object
average_trns_size    float64
decline_count        float64
approved_count       float64
auth_count             int64
auth_amount          float64
ach_count            float64
ach_amount           float64
return_count           int64
return_amount        float64
ach_loss             float64
cc_count             float64
cc_amount            float64
cb_count               int64
cb_amount            float64
cc_loss              float64
dtype: object

In [4]:
df['ban_control']= df['ban_control'].replace({True:1, False:0})

df[features].dtypes


country               object
business_form         object
sic_group             object
location_state        object
ban_control            int64
credit_score         float64
uw_fraud_score       float64
productgroup          object
tenure_in_days       float64
active_merchant       object
average_trns_size    float64
decline_count        float64
approved_count       float64
auth_count             int64
auth_amount          float64
ach_count            float64
ach_amount           float64
return_count           int64
return_amount        float64
ach_loss             float64
cc_count             float64
cc_amount            float64
cb_count               int64
cb_amount            float64
cc_loss              float64
dtype: object

In [5]:
df['active_merchant']= df['ban_control'].replace({"Active":1, "Inactive":0})

df[features].dtypes


country               object
business_form         object
sic_group             object
location_state        object
ban_control            int64
credit_score         float64
uw_fraud_score       float64
productgroup          object
tenure_in_days       float64
active_merchant        int64
average_trns_size    float64
decline_count        float64
approved_count       float64
auth_count             int64
auth_amount          float64
ach_count            float64
ach_amount           float64
return_count           int64
return_amount        float64
ach_loss             float64
cc_count             float64
cc_amount            float64
cb_count               int64
cb_amount            float64
cc_loss              float64
dtype: object

In [6]:
#recode missing values 

print(df[features].isnull().sum())


country                   0
business_form           377
sic_group                 0
location_state            0
ban_control               0
credit_score         174843
uw_fraud_score          288
productgroup              0
tenure_in_days       456299
active_merchant           0
average_trns_size    456299
decline_count        658494
approved_count       460210
auth_count                0
auth_amount          456299
ach_count            535695
ach_amount           539417
return_count              0
return_amount        735311
ach_loss             735311
cc_count             557122
cc_amount            561531
cb_count                  0
cb_amount            728351
cc_loss              728351
dtype: int64


In [7]:
#cont. recoding missing values - fill zero for columns

#cont. recoding missing values 
fillzero= ['close_reason_new','tenure_in_days','average_trns_size','decline_count','approved_count','auth_amount','ach_count','ach_amount','return_amount','return_rate','ach_loss','cc_count','cc_amount','cb_amount','cc_loss']
df[fillzero]=df[fillzero].fillna(0)


print(df[features].isnull().sum())




country                   0
business_form           377
sic_group                 0
location_state            0
ban_control               0
credit_score         174843
uw_fraud_score          288
productgroup              0
tenure_in_days            0
active_merchant           0
average_trns_size         0
decline_count             0
approved_count            0
auth_count                0
auth_amount               0
ach_count                 0
ach_amount                0
return_count              0
return_amount             0
ach_loss                  0
cc_count                  0
cc_amount                 0
cb_count                  0
cb_amount                 0
cc_loss                   0
dtype: int64


In [8]:
print(df[target].isnull().sum())

0


In [9]:
#cont. recoding missing values - mean credit score 

for feature in ['credit_score']: 
    df[feature].fillna(value=df[feature].mean(), inplace=True) 
    
print(df[features].isnull().sum())



country                0
business_form        377
sic_group              0
location_state         0
ban_control            0
credit_score           0
uw_fraud_score       288
productgroup           0
tenure_in_days         0
active_merchant        0
average_trns_size      0
decline_count          0
approved_count         0
auth_count             0
auth_amount            0
ach_count              0
ach_amount             0
return_count           0
return_amount          0
ach_loss               0
cc_count               0
cc_amount              0
cb_count               0
cb_amount              0
cc_loss                0
dtype: int64


In [10]:
#cont. recoding missing values - drop rows with null 


df = df.dropna(subset=['business_form', 'uw_fraud_score'])


print(df[features].isnull().sum())

country              0
business_form        0
sic_group            0
location_state       0
ban_control          0
credit_score         0
uw_fraud_score       0
productgroup         0
tenure_in_days       0
active_merchant      0
average_trns_size    0
decline_count        0
approved_count       0
auth_count           0
auth_amount          0
ach_count            0
ach_amount           0
return_count         0
return_amount        0
ach_loss             0
cc_count             0
cc_amount            0
cb_count             0
cb_amount            0
cc_loss              0
dtype: int64


In [11]:
#sample of data

df[features].head()

Unnamed: 0,country,business_form,sic_group,location_state,ban_control,credit_score,uw_fraud_score,productgroup,tenure_in_days,active_merchant,...,ach_count,ach_amount,return_count,return_amount,ach_loss,cc_count,cc_amount,cb_count,cb_amount,cc_loss
0,United States,Sole Proprietorship,Services,IA,0,959.0,41.142,other,94.0,0,...,4.0,75005.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0
1,United States,Limited Liability Corp,Construction,MD,0,853.0,108.24,qb_checking,191.0,0,...,1.0,195.0,0,0.0,0.0,14.0,22277.0,0,0.0,0.0
2,United States,Limited Liability Corp,Services,CA,0,676.0,6.443,qbse,25.0,0,...,2.0,0.0,0,0.0,0.0,2.0,1550.0,0,0.0,0.0
3,United States,Limited Liability Corp,Wholesale Trade,IA,0,806.0,35.645,qb_checking,159.0,0,...,0.0,0.0,0,0.0,0.0,13.0,1233.82,0,0.0,0.0
4,United States,Sole Proprietorship,Services,AZ,0,576.0,3.283,qbo,70.0,0,...,6.0,23689.23,0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [12]:
df[target].head()

0    0
1    0
2    0
3    0
4    0
Name: close_reason_new, dtype: object

In [13]:
#recode target vairable as numeric 
df['close_reason_new'] = np.where(df['close_reason_new']==0,0,1)
df['close_reason_new'].value_counts(dropna=False)


0    692807
1     41839
Name: close_reason_new, dtype: int64

In [14]:
df['ban_control'].value_counts(dropna=False)

0    694450
1     40196
Name: ban_control, dtype: int64

In [17]:
#transform categorical values into numercial with one-hot encoding

df[features].dtypes
#df['decline_rate']= df['decline_rate'].astype(float)
#df[features].dtypes

#one-hot encoding for features
#encoder = ce.OneHotEncoder(cols= ['product', 'application_channel', 'country', 'business_form','sic_group','mcc','location_state','productgroup'],handle_missing='value' , use_cat_names=True)
                                  
encoder = ce.OneHotEncoder(cols= ['country', 'business_form','sic_group','location_state','productgroup'],handle_missing='value' , use_cat_names=True)
                                  
                                
features_encoded= encoder.fit_transform(df[features])


In [18]:
features_encoded.dtypes

country_United States                     int64
country_Canada                            int64
business_form_Sole Proprietorship         int64
business_form_Limited Liability Corp      int64
business_form_Partnership                 int64
                                         ...   
cc_count                                float64
cc_amount                               float64
cb_count                                  int64
cb_amount                               float64
cc_loss                                 float64
Length: 117, dtype: object

In [19]:
#df[features_encoded]= features_encoded.astype(int)

print(features_encoded.shape)

features_encoded.dtypes

(734646, 117)


country_United States                     int64
country_Canada                            int64
business_form_Sole Proprietorship         int64
business_form_Limited Liability Corp      int64
business_form_Partnership                 int64
                                         ...   
cc_count                                float64
cc_amount                               float64
cb_count                                  int64
cb_amount                               float64
cc_loss                                 float64
Length: 117, dtype: object

In [20]:
# Get a Dictionary containing the pairs of column names & data type objects.
dataTypeDict = dict(features_encoded.dtypes)
print('Data type of each column of Dataframe :')
print(dataTypeDict)

Data type of each column of Dataframe :
{'country_United States': dtype('int64'), 'country_Canada': dtype('int64'), 'business_form_Sole Proprietorship': dtype('int64'), 'business_form_Limited Liability Corp': dtype('int64'), 'business_form_Partnership': dtype('int64'), 'business_form_Non-Profit Organization': dtype('int64'), 'business_form_Corporation': dtype('int64'), 'business_form_Private Limited Company': dtype('int64'), 'business_form_Limited Liability Partnership': dtype('int64'), 'business_form_Public Limited Company': dtype('int64'), 'sic_group_Services': dtype('int64'), 'sic_group_Construction': dtype('int64'), 'sic_group_Wholesale Trade': dtype('int64'), 'sic_group_Manufacturing': dtype('int64'), 'sic_group_Retail Trade': dtype('int64'), 'sic_group_Transportation, Communications, Electric, Gas, And Sanitary Services': dtype('int64'), 'sic_group_Agriculture, Forestry, And Fishing': dtype('int64'), 'sic_group_Finance, Insurance, And Real Estate': dtype('int64'), 'sic_group_Publ

In [21]:
df[target].dtypes
df['close_reason_new'].shape

(734646,)

In [22]:
features_encoded.head()

Unnamed: 0,country_United States,country_Canada,business_form_Sole Proprietorship,business_form_Limited Liability Corp,business_form_Partnership,business_form_Non-Profit Organization,business_form_Corporation,business_form_Private Limited Company,business_form_Limited Liability Partnership,business_form_Public Limited Company,...,ach_count,ach_amount,return_count,return_amount,ach_loss,cc_count,cc_amount,cb_count,cb_amount,cc_loss
0,1,0,1,0,0,0,0,0,0,0,...,4.0,75005.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0
1,1,0,0,1,0,0,0,0,0,0,...,1.0,195.0,0,0.0,0.0,14.0,22277.0,0,0.0,0.0
2,1,0,0,1,0,0,0,0,0,0,...,2.0,0.0,0,0.0,0.0,2.0,1550.0,0,0.0,0.0
3,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0,0.0,0.0,13.0,1233.82,0,0.0,0.0
4,1,0,1,0,0,0,0,0,0,0,...,6.0,23689.23,0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [52]:

#df[features_encoded] = pd.DataFrame(features_encoded)

Index(['country_United States', 'country_Canada',
       'business_form_Sole Proprietorship',
       'business_form_Limited Liability Corp', 'business_form_Partnership',
       'business_form_Non-Profit Organization', 'business_form_Corporation',
       'business_form_Private Limited Company',
       'business_form_Limited Liability Partnership',
       'business_form_Public Limited Company',
       ...
       'ach_count', 'ach_amount', 'return_count', 'return_amount', 'ach_loss',
       'cc_count', 'cc_amount', 'cb_count', 'cb_amount', 'cc_loss'],
      dtype='object', length=117)

# Model Building 

In [23]:
#specify x and y 

y= df['close_reason_new']
x= features_encoded

In [24]:
#train-test split 
    # not sure what test_size to use?ra

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=42)

In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)


In [38]:
# Use the scaler to transform the dataset
x_test_scaled = scaler.fit_transform(x_test)
x_test_scaled[0]

array([ 1.75420710e-01, -1.75420710e-01,  1.29302500e+00, -7.21252114e-01,
       -2.60281492e-01, -1.87195196e-01, -4.68226063e-01, -7.66588952e-02,
       -1.88158520e-02, -1.51239298e-02, -9.32003018e-01, -3.81711282e-01,
       -2.31249375e-01, -8.81146308e-02,  2.13828656e+00, -2.74737617e-01,
       -2.26081520e-01, -2.25588837e-01, -5.31309692e-02, -8.74216809e-02,
       -1.30598482e-01, -3.40485026e-01, -1.46597354e-01, -1.55699246e-01,
       -1.07183499e-01, -1.21064982e-01, -3.35203604e-01, -1.59113900e-01,
       -1.57523875e-01, -1.74196758e-01, -1.14360924e-01, -1.12390246e-01,
       -1.19972234e-01, -9.21362842e-02,  7.85523248e+00, -1.21895734e-01,
       -1.19527811e-01, -1.55937540e-01, -2.71468533e-01, -5.42023985e-02,
       -7.90305158e-02, -1.37727399e-01, -3.19906520e-01, -1.96739591e-01,
       -1.54576659e-01, -1.54834954e-01, -4.91161928e-02, -8.68825501e-02,
       -1.43446538e-01, -1.08732007e-01, -6.18541112e-02, -1.81895125e-01,
       -1.14775555e-01, -

In [39]:
# Fit the model to the training dataset

logreg = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)
#logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)


In [40]:
# coefficients and intercept

logreg.intercept_
logreg.coef_

array([[-2.04357495e-02,  2.04357495e-02,  3.90391757e-02,
        -1.26813553e-02, -3.69579281e-02,  4.47613220e-03,
        -1.37492959e-02,  2.61900932e-02,  1.58009539e-02,
        -1.17891748e-01,  1.24231847e-02, -4.26176204e-02,
         3.00235414e-02,  1.12345806e-02,  4.37451691e-02,
        -3.41238619e-02,  4.43691685e-02, -7.87935931e-02,
        -5.87803771e-03, -2.06618964e-02, -7.21176283e-03,
        -4.13166653e-03,  8.59066060e-03, -2.76046262e-02,
         2.30037292e-02,  5.35646880e-03, -1.04848654e-02,
         4.20296608e-02,  3.31175503e-02,  6.59228745e-02,
        -5.25810172e-03,  6.92638512e-02,  6.89219082e-03,
        -3.77161033e-02,  7.70358619e-03,  4.64393381e-02,
         6.79847943e-03,  1.63965829e-02,  2.04262355e-01,
         7.27130253e-03, -1.84831735e-01,  1.24262216e-03,
         4.23805063e-02,  7.48698266e-03,  2.68017569e-02,
        -1.09507859e-02, -1.54604858e-01, -1.72990228e-02,
        -3.82434441e-03,  7.44738897e-03, -2.88334955e-0

In [41]:
#Predit the y-values on the testing dataset

y_preds= logreg.predict(x_test)
y_probs= logreg.predict_proba(x_test)


X has feature names, but LogisticRegression was fitted without feature names


X has feature names, but LogisticRegression was fitted without feature names



# Model Evaluation

In [42]:
#evaluate the model 

print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97    173325
           1       0.84      0.12      0.21     10337

    accuracy                           0.95    183662
   macro avg       0.89      0.56      0.59    183662
weighted avg       0.94      0.95      0.93    183662



In [43]:
# save your eval report as an html file

report = metrics.classification_report(y_test, y_preds, output_dict=True)
evalreport = pd.DataFrame(report).transpose()
evalreport.to_html('/Users/mprasad4/evalreport.html')
evalreport

Unnamed: 0,precision,recall,f1-score,support
0,0.950241,0.998563,0.973803,173325.0
1,0.836507,0.123247,0.21484,10337.0
accuracy,0.949298,0.949298,0.949298,0.949298
macro avg,0.893374,0.560905,0.594322,183662.0
weighted avg,0.94384,0.949298,0.931087,183662.0


In [44]:
# true positives, etc.
y_score = logreg.predict_proba(x_test)[:, 1]
fpr, tpr, thresh=metrics.roc_curve(y_test, y_score)
roc_df=pd.DataFrame(zip(fpr, tpr, thresh), columns=['FPR','TPR','Threshold'])
roc_df.head()


X has feature names, but LogisticRegression was fitted without feature names



Unnamed: 0,FPR,TPR,Threshold
0,0.0,0.0,2.0
1,0.001414,0.122956,1.0
2,0.001425,0.122956,1.0
3,0.001425,0.123053,1.0
4,0.001431,0.123053,1.0


In [45]:
import plotly.express as px

def make_rocauc(i):
    nearest=roc_df.iloc[(roc_df['Threshold']-i).abs().argsort()[:1]]['Threshold'].values[0]
    q=roc_df[roc_df['Threshold']==nearest].index[0]
    print(nearest, q)
    fig = px.area(roc_df, x="FPR", y="TPR",
                  title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
                  hover_data={'Threshold':':.2f',
                              'FPR':':.2f',
                              'TPR':':.2f',
                             },width=800, height=700)

    fig.add_annotation(x=roc_df.iloc[q][0], y=roc_df.iloc[q][1],
            text=f"Threshold nearest {i*100:.0f}% = {roc_df.iloc[q][2]:.2f}",
                showarrow=True,
                arrowhead=1)

    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    return fig
make_rocauc(.60)

0.9999988464496472 7


In [46]:
# display with plotly
import plotly.express as px
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.write_json('/Users/mprasad4/evalreport.html')
fig.show()

# Make predicitions on new data

In [34]:
# check out one row of the test data
x_test.iloc[0]

country_United States                   1.0
country_Canada                          0.0
business_form_Sole Proprietorship       1.0
business_form_Limited Liability Corp    0.0
business_form_Partnership               0.0
                                       ... 
cc_count                                0.0
cc_amount                               0.0
cb_count                                0.0
cb_amount                               0.0
cc_loss                                 0.0
Name: 719667, Length: 117, dtype: float64

In [35]:
# show a prediction & probability for that value
print(logreg.predict([x_test.iloc[0].values])[0])
print(logreg.predict_proba([x_test.iloc[0].values]).max())

0
1.0


In [36]:
# pickle your model

import pickle
filename = open('portfolio_health_score.pkl', 'wb')
pickle.dump(logreg, filename)
filename.close()

# Features Importance 


In [48]:
# 'Attributes' is another name for our list of features (aka predictors, independent variables)
attributes=x_test.columns
print(attributes)
# 'Feature importances' is another name for our coefficients (ie., the impace of each feature on the outcome or DV)
feature_importances=logreg.coef_
print(feature_importances)

Index(['country_United States', 'country_Canada',
       'business_form_Sole Proprietorship',
       'business_form_Limited Liability Corp', 'business_form_Partnership',
       'business_form_Non-Profit Organization', 'business_form_Corporation',
       'business_form_Private Limited Company',
       'business_form_Limited Liability Partnership',
       'business_form_Public Limited Company',
       ...
       'ach_count', 'ach_amount', 'return_count', 'return_amount', 'ach_loss',
       'cc_count', 'cc_amount', 'cb_count', 'cb_amount', 'cc_loss'],
      dtype='object', length=117)
[[-2.04357495e-02  2.04357495e-02  3.90391757e-02 -1.26813553e-02
  -3.69579281e-02  4.47613220e-03 -1.37492959e-02  2.61900932e-02
   1.58009539e-02 -1.17891748e-01  1.24231847e-02 -4.26176204e-02
   3.00235414e-02  1.12345806e-02  4.37451691e-02 -3.41238619e-02
   4.43691685e-02 -7.87935931e-02 -5.87803771e-03 -2.06618964e-02
  -7.21176283e-03 -4.13166653e-03  8.59066060e-03 -2.76046262e-02
   2.30037292e-

In [51]:
# let's take a look at the results
feature_imp = pd.DataFrame(list(zip(attributes, feature_importances)), columns=['features', 'coeffs'])
feature_imp=feature_imp.set_index('features')
feature_imp=feature_imp.sort_values('coeffs')
feature_imp

Unnamed: 0_level_0,coeffs
features,Unnamed: 1_level_1
country_United States,"[-0.020435749516752942, 0.020435749516752942, ..."


In [54]:
# plot that as a bar chart
feature_imp.plot(kind='bar')

TypeError: no numeric data to plot

In [55]:
# with plotly
import plotly.graph_objects as go
data = go.Bar(x=list(feature_imp.index), y=feature_imp['coeffs'])
coefs = go.Figure([data])
coefs

# Things to work on in next iteration:  

In [None]:
#work on balancing the samples (open vs closed accounts) to increase the precision/recall/f1-score predictions for closed accounts 
#remove multicollinearity
#continue to clean data and add in new features 
#try different machine learning models- like random forest or kn nearest neighbor 