In [4]:
pip install imbalanced-learn

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import warnings 
warnings.simplefilter("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn as skl
import imblearn as im
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [6]:
#Reading the training data and the test data from the working directory
train_csv, test_csv = 'credit_train.csv', 'credit_test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [7]:
#Getting information about the training data
df_train.info()
# Only the last expression of a cell is rendered, so the per-column null
# counts are printed explicitly instead of being silently discarded.
print(df_train.isnull().sum())
df_train.shape

In [8]:
# describe() already returns a DataFrame, so no extra wrapping is needed.
Summary = df_train.describe()
Summary

In [9]:
##EDA

##### Correlation Plot

In [10]:
sns.set(rc={'figure.figsize':(13,8)})
# The raw frame still holds string columns (e.g. Term, Purpose, Loan Status);
# DataFrame.corr() raises on those in pandas >= 2.0, so correlate numeric columns only.
numeric_corr = df_train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=1, annot_kws={"size":11})
plt.pyplot.title('Correlation Heatmap')
plt.pyplot.show()

###### Histogram of Years of credit history vs Customer Count

In [11]:
# Histogram (25 bins) of customers by years of credit history.
history_col = 'Years of Credit History'
df_train.hist(column=history_col, bins=25, grid=False, figsize=(10,6), edgecolor='k')
plt.pyplot.title('Customer Count vs Years of Credit History ')
plt.pyplot.xlabel(history_col)
plt.pyplot.ylabel('Customer Count')
plt.pyplot.show()

In [12]:
#The above graph is normally distributed with outliers greater than 50 years of credit history , 
#therefore we removed values >50 in year of credit history

##### Box Plot of Current Loan Amount

In [13]:
# Box plot of the loan amount column to expose extreme values.
loan_amount = df_train['Current Loan Amount']
loan_amount.plot(kind='box', title='Loan Amount')
#plt.pyplot.ylim(0,1)
plt.pyplot.show()

In [14]:
#removed outlier of current loan amount = 99999999.0

###### Stacked bar chart of Loan Status for years in current job

In [15]:
# Count each Loan Status within every 'Years in current job' group, one column per status.
status_by_tenure = (
    df_train.groupby('Years in current job')['Loan Status']
    .value_counts()
    .unstack(level=1)
)
status_by_tenure.plot.bar(stacked=True)
plt.pyplot.title('Number of Customers vs Years in current job')
plt.pyplot.ylabel('Customer Count')
plt.pyplot.show()

In [16]:
#There is similar Loan status proportion amongst all the Years in current job 

##### Bar Chart of Loan Status

In [17]:
sns.set(rc={'figure.figsize':(5,3)})
# One bar per Loan Status value; the title is set on the returned Axes.
status_ax = sns.countplot(data=df_train, x='Loan Status', palette='turbo')
status_ax.set(title='Customer count vs Loan Status')

In [18]:
df_train['Loan Status'].value_counts()

In [19]:
#The decision variable, Loan Status, is heavily skewed towards the "Fully Paid" category, which would bias the model towards predicting the majority class;
# to resolve this issue, we performed resampling

#### Data Cleaning 

In [20]:
# Category -> integer code tables used by DataCleaning. The codes are arbitrary
# labels kept exactly as originally assigned; they are NOT ordered by magnitude
# (e.g. '8 years' -> 1 while '1 year' -> 8).
_TERM_CODES = {'Short Term': 1, 'Long Term': 0}
_JOB_TENURE_CODES = {
    '8 years': 1, '3 years': 2, '< 1 year': 3, '2 years': 4, '10+ years': 5,
    '4 years': 6, '5 years': 7, '1 year': 8, '7 years': 9, '6 years': 10,
    '9 years': 11,
}
_HOME_OWNERSHIP_CODES = {'Home Mortgage': 1, 'Own Home': 2, 'Rent': 3, 'HaveMortgage': 4}
_PURPOSE_CODES = {
    'Home Improvements': 1, 'Debt Consolidation': 2, 'Buy House': 3,
    'other': 4, 'Take a Trip': 5, 'Other': 4,  # both spellings share one code
    'Business Loan': 6, 'Buy a Car': 7, 'small_business': 8,
    'Medical Bills': 9, 'vacation': 10, 'Educational Expenses': 11,
    'wedding': 12, 'major_purchase': 13, 'moving': 14, 'renewable_energy': 15,
}


def DataCleaning(df):
    """Return a cleaned, numerically encoded copy of a raw credit DataFrame.

    Steps, applied to a copy (the caller's frame is never modified):
      1. Keep rows with 'Credit Score' <= 850, 'Current Loan Amount' != 99999999.0
         and 'Years of Credit History' <= 50. Rows where a `<=` comparison is
         False, including rows with a missing value in that column, are dropped.
      2. Drop rows with a missing 'Years in current job' or 'Bankruptcies'.
      3. Drop 'Loan ID', 'Customer ID' and 'Months since last delinquent'.
      4. Encode 'Term', 'Years in current job', 'Home Ownership' and 'Purpose'
         with the integer code tables defined above; values not listed in a
         table are left unchanged.
      5. Replace spaces in column names with underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above (space-separated names).

    Returns
    -------
    pandas.DataFrame
        Cleaned frame; the original row index labels are preserved.

    Raises
    ------
    KeyError
        If a required column is missing, e.g. when the function is called a
        second time on its own output (whose columns are already renamed).
    """
    keep = (
        (df['Credit Score'] <= 850)
        & (df['Current Loan Amount'] != 99999999.0)
        & (df['Years of Credit History'] <= 50)
    )

    # dropna replaces the earlier "fill with a sentinel, then filter the sentinel"
    # approach, which relied on chained in-place assignment (a no-op under pandas
    # copy-on-write) and also removed genuine rows whose Bankruptcies value was 10.
    cleaned = (
        df.loc[keep]
        .dropna(subset=['Years in current job', 'Bankruptcies'])
        .drop(columns=['Loan ID', 'Customer ID', 'Months since last delinquent'])
        .copy()
    )

    encodings = (
        ('Term', _TERM_CODES),
        ('Years in current job', _JOB_TENURE_CODES),
        ('Home Ownership', _HOME_OWNERSHIP_CODES),
        ('Purpose', _PURPOSE_CODES),
    )
    for column, codes in encodings:
        # Assign the result back instead of using inplace=True on a column
        # selection; infer_objects() turns the all-integer result into a numeric
        # dtype without depending on replace()'s deprecated implicit downcasting.
        cleaned[column] = cleaned[column].astype(object).replace(codes).infer_objects()

    ##Replacing column header names
    cleaned.columns = cleaned.columns.str.replace(' ', '_')
    return cleaned


#####TRAINING DATA#####
##Encoding Loan Status Categorical variable with 0 and 1
loan_status_codes = {'Fully Paid': 1, 'Charged Off': 0}
df_train['Loan Status'] = df_train['Loan Status'].replace(loan_status_codes)

# NOTE: this cell is not re-runnable as-is. DataCleaning renames and drops
# columns, so a second pass over its own output raises KeyError; reload the
# CSV files before running it again.
df_train = DataCleaning(df_train)
df_train

####TEST DATA#####
df_test = DataCleaning(df_test)
df_test

##### Correlation Plot post cleaning 

In [21]:
sns.set(rc={'figure.figsize':(15,8)})
# Pairwise correlations of the cleaned training frame, annotated per cell.
cleaned_corr = df_train.corr()
sns.heatmap(cleaned_corr, annot=True, linewidths=1, annot_kws={"size":11})
plt.pyplot.title('Correlation Heatmap')
plt.pyplot.show()

## Logistic Regression

In [22]:
##X holds the independent variables (model features); y is the dependent variable (Loan_Status)
feature_columns = [
    'Current_Loan_Amount', 'Term', 'Credit_Score',
    'Annual_Income', 'Years_in_current_job', 'Home_Ownership', 'Purpose',
    'Monthly_Debt', 'Years_of_Credit_History', 'Number_of_Open_Accounts',
    'Number_of_Credit_Problems', 'Current_Credit_Balance',
    'Maximum_Open_Credit', 'Bankruptcies', 'Tax_Liens',
]
X = df_train[feature_columns]
y = df_train['Loan_Status']

In [23]:
#Hold out 30% of the rows for evaluation; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#Logistic regression estimator with scikit-learn's default settings
logreg = LogisticRegression()

for training_part in (X_train, y_train):
    print(training_part.shape)

In [24]:
##Model Fitting
logreg.fit(X_train,y_train)

In [25]:
y_pred = logreg.predict(X_test)

In [26]:
np.count_nonzero(y_pred)

In [27]:
np.unique(y_pred, return_counts=True)

In [28]:
prob_accuracy_score = skl.metrics.accuracy_score(y_test,y_pred)
prob_accuracy_score

In [29]:
#Decision variable Loan Status count before data resampling
y.value_counts()

In [30]:
## RandomOverSampler addresses the class imbalance by duplicating randomly chosen
## minority-class rows (sampling_strategy=0.7 is the target minority/majority ratio).
## Only the training partition is resampled: resampling the full X, y would copy rows
## that are also in X_test into the training data and inflate the hold-out accuracy.
## random_state fixes which rows get duplicated so the results are reproducible.
over_sampling = im.over_sampling.RandomOverSampler(sampling_strategy=0.7, random_state=42)
X_over_sampling, y_over_sampling = over_sampling.fit_resample(X_train, y_train)

In [31]:
#Decision variable Loan Status count after data resampling
y_over_sampling.value_counts()

In [32]:
print(X_over_sampling.shape)
print(y_over_sampling.shape)

In [33]:
logreg.fit(X_over_sampling,y_over_sampling)
y_pred_over = logreg.predict(X_test)

In [34]:
y_pred_over

In [35]:
np.unique(y_pred_over, return_counts=True)

In [36]:
prob_accuracy_score_over = skl.metrics.accuracy_score(y_test,y_pred_over)
prob_accuracy_score_over

In [37]:
df_test

In [38]:
#Predict the loan status outcome on new dataset for recommendation

In [39]:
# Select the fitted feature columns explicitly so the scoring frame always matches
# the column set and order the model was trained on, even if df_test has extras.
recom = logreg.predict(df_test[X.columns])

In [40]:
recom

In [41]:
#Adding loan status predicted values as a column in the existing data
df_test.reset_index(drop=True, inplace = True)

In [42]:
df_prediction_recom = pd.DataFrame(recom)

In [43]:
df_Logistic_final=pd.concat([df_test, df_prediction_recom], axis=1)
df_Logistic_final

In [44]:
df_Logistic_final.rename(columns={0:'Loan Status Predicted'},inplace=True)
df_Logistic_final

In [45]:
df_Logistic_final['Loan Status Predicted'].value_counts()

In [46]:
#adding loan status name column for making piechart
df_Logistic_final['Loan_Status_Name'] = np.where(df_Logistic_final['Loan Status Predicted'] == 1, "Fully Paid", "Charged off")

In [47]:
#quality check
df_Logistic_final['Loan_Status_Name'].value_counts()

In [48]:
#Pie chart showing the percentage split of the predicted loan status
status_counts = df_Logistic_final.groupby('Loan_Status_Name').count()
status_counts.plot(kind='pie', y='Loan Status Predicted', autopct='%1.0f%%',
                   title='Loan Status Share', radius=0.8)
plt.pyplot.ylabel("")
plt.pyplot.show()

# Decision Tree Algorithm

In [49]:
# Depth-4 entropy tree, fitted on the oversampled data
dt = DecisionTreeClassifier(criterion="entropy", max_depth=4)
dt.fit(X_over_sampling, y_over_sampling)

# Predictions for the same rows the tree was fitted on
prediction = dt.predict(X_over_sampling)
print(f'Prediction {prediction}')

In [50]:
# plot_tree matches class_names to the classes in ascending numerical order.
# Loan Status was encoded as Charged Off -> 0 and Fully Paid -> 1, so
# 'Charged Off' must come first or the plotted class labels are swapped.
y_over_names = ['Charged Off', 'Fully Paid']
X_over_names = list(X_over_sampling.columns)

In [51]:
plt.pyplot.figure(figsize=(30, 20))
plot_tree(dt, filled=True, fontsize=12, feature_names=X_over_names, 
          rounded=True, class_names = y_over_names)
# Title corrected: this tree is fitted on the credit data, not the iris dataset.
plt.pyplot.title("Decision tree for Loan Status prediction")
plt.pyplot.show()

In [52]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first and oversample only the training part.
# Splitting already-oversampled data places copies of the same duplicated
# minority row on both sides of the split, so the test score would partly
# measure memorisation. The hold-out set keeps the real class balance.
X_train_raw_dec, X_test_dec, y_train_raw_dec, y_test_dec = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y)
X_train_dec, y_train_dec = im.over_sampling.RandomOverSampler(
    sampling_strategy=0.7, random_state=21).fit_resample(X_train_raw_dec, y_train_raw_dec)

In [53]:
# Depth-20 entropy tree fitted on the training partition.
dt = DecisionTreeClassifier(criterion="entropy", max_depth=20)
# fit() returns the estimator itself, so `test` is another name for `dt`.
test = dt.fit(X_train_dec, y_train_dec)
y_pred = dt.predict(X_test_dec)
print(f"Test set predictions:\n {y_pred}")
# Mean accuracy on the held-out partition (rendered as the cell output).
dt.score(X_test_dec, y_test_dec)

In [54]:
test.feature_importances_

In [55]:
# Select the fitted feature columns explicitly so the scoring frame always matches
# the column set and order the tree was trained on, even if df_test has extras.
prediction_decision_tree = dt.predict(df_test[X.columns])

In [56]:
prediction_decision_tree

In [57]:
type(df_test)

In [58]:
df_test.reset_index(drop=True, inplace = True)

In [59]:
prediction_decision_tree.shape

In [60]:
df_test.shape

In [61]:
df_prediction_decision_tree = pd.DataFrame(prediction_decision_tree)

In [62]:
df_DecisionTree_Final=pd.concat([df_test, df_prediction_decision_tree], axis=1)
df_DecisionTree_Final

In [63]:
df_DecisionTree_Final.rename(columns={0:'Loan Status Predicted'},inplace=True)
df_DecisionTree_Final

In [64]:
# Comparing cols to check the output % match of logistic regression output and decision tree output
output_comparison = recom==prediction_decision_tree
output_comparison_Series= pd.Series(output_comparison)

In [65]:
output_comparison_Series.value_counts()

In [66]:
#Percentage of matched output
output_comparison_Series.value_counts(normalize=True)

In [67]:
df_Logistic_final

In [68]:
 df_Logistic_final.describe()  

In [69]:
df_charged_off = df_Logistic_final[df_Logistic_final['Loan Status Predicted'] == 0]

In [70]:
# Ten bin edges from 705033 to 2955033 in steps of 250000.
bins = list(range(705033, 2955033 + 1, 250000))
plt.pyplot.hist(df_charged_off["Annual_Income"], bins=bins)

In [71]:
# Histogram of annual income for predicted charged-off customers, using
# fixed-width bins of size w that span the observed minimum to maximum.
w=100000

income = df_charged_off["Annual_Income"]
income_edges = np.arange(min(income), max(income) + w, w)

plt.pyplot.hist(income, bins=income_edges)
plt.pyplot.ticklabel_format(style='plain')
plt.pyplot.title('Charged off distribution')
plt.pyplot.xlabel('Annual Income')
plt.pyplot.ylabel('Number of Charged off Customers')
plt.pyplot.show()


In [72]:

# Histogram of credit score for predicted charged-off customers, using
# fixed-width bins of size w that span the observed minimum to maximum.
w=10

scores = df_charged_off["Credit_Score"]
score_edges = np.arange(min(scores), max(scores) + w, w)

plt.pyplot.hist(scores, bins=score_edges)
plt.pyplot.ticklabel_format(style='plain')
plt.pyplot.title('Charged off distribution by Credit Score')
plt.pyplot.xlabel('Credit Score')
plt.pyplot.ylabel('Number of Charged off Customers')
plt.pyplot.show()

In [73]:
import plotly.express as px
fig = px.histogram(df_DecisionTree_Final, x="Credit_Score", title="Customers distribution by Credit Score", 
                   color="Loan Status Predicted",
                  color_discrete_sequence=['indianred','blue'])

fig.show()

In [74]:
##updating the legend names
newnames = {'1.0':'Fully paid', '0.0': 'Charged off'}
for trace in fig.data:
    try:
        # Normalise the trace name so both '1' (int labels) and '1.0'
        # (float labels) resolve to the same key.
        key = str(float(trace.name))
    except ValueError:
        # Name is not numeric: the trace was already relabelled by an earlier
        # run of this cell, so leave it alone (keeps the cell re-runnable).
        continue
    label = newnames[key]
    # hovertemplate is computed from the old name before update() applies the new one.
    trace.update(name=label,
                 legendgroup=label,
                 hovertemplate=trace.hovertemplate.replace(trace.name, label))
# Last expression of the cell: renders the relabelled figure.
fig


In [75]:
import plotly.express as px
fig1 = px.histogram(df_Logistic_final, x="Credit_Score",
                   title="Charged off Customers distribution by Credit Score",
                   color="Loan Status Predicted",
                  color_discrete_sequence=['indianred','blue'])

fig1.show()

In [76]:
##updating the legend names
newnames = {'1.0':'Fully paid', '0.0': 'Charged off'}
for trace in fig1.data:
    try:
        # Normalise the trace name so both '1' (int labels) and '1.0'
        # (float labels) resolve to the same key.
        key = str(float(trace.name))
    except ValueError:
        # Name is not numeric: the trace was already relabelled by an earlier
        # run of this cell, so leave it alone (keeps the cell re-runnable).
        continue
    label = newnames[key]
    # hovertemplate is computed from the old name before update() applies the new one.
    trace.update(name=label,
                 legendgroup=label,
                 hovertemplate=trace.hovertemplate.replace(trace.name, label))
# Last expression of the cell: renders the relabelled figure.
fig1


## KNN Classifier

In [77]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train_dec, y_train_dec)

In [78]:
prediction_knn = knn.predict(X_test_dec)
 

In [79]:
X_test_dec

In [80]:
y_test_dec.shape

In [81]:
prediction_knn.shape

In [82]:
# Wrap the KNN predictions in a DataFrame.
prediction_knn_df = pd.DataFrame(prediction_knn)
# NOTE(review): despite its name, this frame is built from y_test_dec (the
# hold-out labels), not from training labels.
y_train_dec_df = pd.DataFrame(y_test_dec)
# prediction_knn = prediction_knn.reshape(-1,1)
# y_test_dec = y_test_dec.reshape(-1,1)
# Cell output: shape of the prediction frame.
prediction_knn_df.shape
#y_train_dec_df.shape

In [83]:
score_knn = skl.metrics.accuracy_score(y_test_dec,prediction_knn_df)
score_knn
