In [None]:
import pandas as pd 
import matplotlib.pyplot as py
import seaborn as sns
# Load the feature table and the label table, print each schema, then
# attach the labels to the features by joining on the shared ID column.
train_data = pd.read_csv('../input/loan-dataset/train_x.csv')
train_data.info()

train_label_data = pd.read_csv('../input/loan-dataset/train_y.csv')
train_label_data.info()

train_data = pd.merge(train_data, train_label_data, on='ID')

In [None]:
# Preview the first 20 rows of the merged features + labels table.
train_data.head(20)

In [None]:

# Keep only the rows that carry a label; unlabelled rows cannot be used for
# supervised training.
has_label = train_data.Label.notnull()
new_train_data = train_data[has_label]
# Not the last expression of the cell, so this preview is not rendered.
new_train_data.head(7)

# Report the share of merged rows discarded by the filter above.
length = len(train_data.ID)
length_new = len(new_train_data.ID)
per = 100 * (1 - length_new / length)
print("Percent of data dropped is ", per)


In [None]:

# Bar chart of how many labelled rows fall in each class.
sns.countplot(data=new_train_data, x="Label")

# Label is summed directly, so the sum is the number of rows with Label == 1
# (assumes a 0/1 encoding -- TODO confirm against the data dictionary).
label_column = new_train_data.Label
l = len(label_column)
s = label_column.sum()
print(s)
percent = (s / l) * 100
print("Percentage of labels which are default is", percent)


In [None]:
# For each categorical feature, draw a stacked bar chart of the label mix
# within every category (each bar is normalised to sum to 1).
interested_columns = ['Loan type', 'Occupation type', 'Age']
for col in interested_columns:
    label_counts = pd.crosstab(new_train_data[col], new_train_data['Label'])
    row_totals = label_counts.sum(1).astype(float)
    label_shares = label_counts.div(row_totals, axis=0)
    label_shares.plot(kind="bar", stacked=True)
    py.xlabel(f'{col}')
    # Binding the return value keeps the Text repr out of the cell output.
    P = py.ylabel('Percentage')

In [None]:
# Pearson correlation heatmap of the numeric columns.
#
# The frame still holds string columns ('Loan type', 'Occupation type').
# pandas >= 2.0 no longer drops non-numeric columns silently in .corr() and
# raises instead, so restrict to numeric/bool columns explicitly. This is the
# same column set that older pandas versions used implicitly.
numeric_train_data = new_train_data.select_dtypes(include=['number', 'bool'])
corr = numeric_train_data.corr(method='pearson')

f, ax = py.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(10, 275, as_cmap=True)

sns.heatmap(corr, cmap=cmap, square=True,
            linewidths=0.5, cbar_kws={"shrink": 0.5}, ax=ax)

In [None]:
# One boxplot per numeric feature, split by label, laid out on a 3x3 grid.
# Only seven features are plotted, so the last two panels stay empty.
numerical_columns = ['Expense', 'Income', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5']

fig, axes = py.subplots(3, 3, figsize=(20, 14))
# axes.flat walks the grid row by row, matching (idx // 3, idx % 3).
for panel, cat_col in zip(axes.flat, numerical_columns):
    sns.boxplot(x='Label', y=cat_col, data=train_data, ax=panel)

print(train_data[numerical_columns].describe())
py.subplots_adjust(hspace=0.5)



In [None]:

# Pairwise scatter plots of the numeric features, coloured by label.
# Only the first 5000 labelled rows are plotted.
interested_columns = ['Expense', 'Income', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5', 'Label']
pairplot_rows = new_train_data[interested_columns][:5000]
sns.pairplot(pairplot_rows, hue='Label')

In [None]:

# Column groups for the two imputation strategies. 'ID' appears in both lists
# so each imputed frame keeps the key and the two can be joined on it again.
categorical_columns=['ID','Loan type', 'Occupation type','Age' ]
numerical_columns= ['ID','Expense','Income', 'Score1','Score2','Score3','Score4', 'Score5']

In [None]:
from sklearn.impute import SimpleImputer

# Split the labelled data into the feature matrix and the target vector.
y = new_train_data['Label']
X = new_train_data.drop(columns='Label')

# Categorical group: fill gaps with each column's most frequent value.
# NOTE(review): the imputers are fitted on all labelled rows, before any
# train/test split -- confirm this is acceptable for the evaluation below.
imp = SimpleImputer(strategy='most_frequent')
categorical_filled = imp.fit_transform(X[categorical_columns])
X_categorical = pd.DataFrame(categorical_filled, columns=categorical_columns)

# Numerical group: fill gaps with each column's mean.
imp = SimpleImputer(strategy='mean')
numerical_filled = imp.fit_transform(X[numerical_columns])
X_numerical = pd.DataFrame(numerical_filled, columns=numerical_columns)

In [None]:
# Re-join the two imputed halves on ID, drop the key, and one-hot encode the
# remaining non-numeric columns (the first level of each is dropped).
X = (
    pd.merge(X_numerical, X_categorical, on="ID")
    .drop(columns='ID')
    .pipe(pd.get_dummies, drop_first=True)
)


In [None]:
# Inspect the columns and dtypes of the encoded feature matrix.
X.info()

In [None]:

from imblearn.over_sampling import SMOTE

# Oversample with SMOTE, seeded for reproducibility.
# `fit_sample` was a deprecated alias that has been removed from
# imbalanced-learn; `fit_resample` is the supported method with the same
# arguments and return value.
# NOTE(review): resampling happens before the train/test split, so synthetic
# rows derived from one split can end up in the other -- confirm intended.
smk = SMOTE(random_state=0)
X_new, y_new = smk.fit_resample(X, y)

# Share of positive labels after resampling.
l = len(y_new)
s = y_new.sum()
print(s)
percent = (s/l)*100
print("Percentage of labels which are default after balancing the data set is",percent)

In [None]:
# Inspect the columns and dtypes of the SMOTE-resampled feature matrix.
X_new.info()

In [None]:

# Pairwise scatter plots of the numeric features after SMOTE, coloured by label.
interested_columns = ['Expense','Income','Score1','Score2','Score3','Score4','Score5','Label']
smote_df = pd.concat([X_new, y_new], axis=1)
# Shuffle before taking the first 5000 rows so the plotted subset is a random
# sample. The shuffle is seeded so the figure is the same on every run.
smote_df = smote_df.sample(frac=1, random_state=42).reset_index(drop=True)
sns.pairplot(smote_df[interested_columns][:5000], hue='Label')

In [None]:
from sklearn.preprocessing import StandardScaler

# Column groups of the encoded matrix (ID is no longer present here).
numerical_columns = ['Expense', 'Income', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5']
categorical_columns = ['Loan type_B', 'Occupation type_Y', 'Occupation type_Z', 'Age_1.0']

# Standardise only the numeric features; the dummy columns are copied as-is.
# NOTE(review): the scaler is fitted on the full resampled set and then
# discarded, so it cannot be re-applied to new data -- confirm intended.
scaled_numeric = StandardScaler().fit_transform(X_new[numerical_columns])
X_standard = pd.DataFrame(scaled_numeric, columns=numerical_columns)
X_standard[categorical_columns] = X_new[categorical_columns]

X_standard.info()
X_standard.head()

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the standardised, resampled data with a fixed seed.
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X_standard,
    y_new,
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import cross_val_predict

# Sweep the decision threshold of a logistic regression and record train/test
# accuracy and F1 at each value.
train_accuracies = []
train_f1_scores = []
test_accuracies = []
test_f1_scores = []
thresholds = []

# The model and its predicted probabilities do not depend on the threshold,
# so fit and score once instead of refitting the same model per iteration.
logreg_clf = LogisticRegression(solver='liblinear')
logreg_clf.fit(X_train_encoded, y_train)

# Probability of the positive class (column 1) for every row.
y_pred_train_thresh = logreg_clf.predict_proba(X_train_encoded)[:, 1]
y_pred_test_thresh = logreg_clf.predict_proba(X_test_encoded)[:, 1]

for thresh in np.arange(0.1, 0.9, 0.1):
    # A row is predicted positive when its probability exceeds the threshold.
    y_pred_train = (y_pred_train_thresh > thresh).astype(int)
    y_pred_test = (y_pred_test_thresh > thresh).astype(int)

    train_acc = accuracy_score(y_train, y_pred_train)
    train_f1 = f1_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)

    train_accuracies.append(train_acc)
    train_f1_scores.append(train_f1)
    test_accuracies.append(test_acc)
    test_f1_scores.append(test_f1)
    thresholds.append(thresh)

In [None]:
# Collect the sweep results into one frame and reshape to long format so
# every metric becomes its own line in the plot.
Threshold_logreg = {
    "Training Accuracy": train_accuracies,
    "Test Accuracy": test_accuracies,
    "Training F1": train_f1_scores,
    "Test F1": test_f1_scores,
    "Decision Threshold": thresholds,
}
Threshold_logreg_df = pd.DataFrame(Threshold_logreg)

plot_df = pd.melt(
    Threshold_logreg_df,
    id_vars='Decision Threshold',
    var_name='Metrics',
    value_name="Values",
)
fig, ax = py.subplots(figsize=(15, 5))
sns.pointplot(data=plot_df, x="Decision Threshold", y="Values", hue="Metrics", ax=ax)

In [None]:

# Fit the final logistic regression and score it at a fixed cut-off.
logreg_clf = LogisticRegression(solver='liblinear').fit(X_train_encoded, y_train)

# Cut-off applied to the positive-class probability for both splits.
decision_threshold = 0.45

y_pred_train_thresh = logreg_clf.predict_proba(X_train_encoded)[:, 1]
y_pred_test_thresh = logreg_clf.predict_proba(X_test_encoded)[:, 1]

y_pred_train = (y_pred_train_thresh > decision_threshold).astype(int)
y_pred_test = (y_pred_test_thresh > decision_threshold).astype(int)

train_acc = accuracy_score(y_train, y_pred_train)
train_f1 = f1_score(y_train, y_pred_train)

test_acc = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)


In [None]:

# Training-set metrics, followed by the confusion matrix with row/column totals.
print("Training acc. is :", train_acc)
print("Training f1 :",train_f1)
pd.crosstab(y_train, y_pred_train, rownames=['Actual'], colnames=['Predicted'], margins=True)


In [None]:

# Held-out metrics, followed by the confusion matrix with row/column totals.
print("Test acc. is :", test_acc)
print("Test f1 :",test_f1)
pd.crosstab(y_test, y_pred_test, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
import sklearn.metrics as sk_metrics

# ROC curve of the logistic regression on the held-out split.
# `plot_roc_curve` was removed in scikit-learn 1.2; its replacement is
# `RocCurveDisplay.from_estimator`. Prefer the new API when available and fall
# back to the old helper on older installations.
ax = py.gca()
if hasattr(sk_metrics.RocCurveDisplay, "from_estimator"):
    rfc = sk_metrics.RocCurveDisplay.from_estimator(
        logreg_clf, X_test_encoded, y_test, ax=ax, alpha=0.8
    )
else:
    rfc = sk_metrics.plot_roc_curve(logreg_clf, X_test_encoded, y_test, ax=ax, alpha=0.8)
py.show()

In [None]:
# Print the fitted logistic-regression coefficients (the estimator's coef_).
coeff_matrix = logreg_clf.coef_
print(coeff_matrix)

In [None]:

from sklearn.model_selection import train_test_split

# Re-split for the tree-based models, this time from the unscaled resampled
# matrix. This rebinds the same split variables used by the logistic section.
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default settings (no depth limit).
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train_encoded, y_train)

# Scores on the data the tree was fitted on.
y_pred = tree_clf.predict(X_train_encoded)
print("Training Data Set Accuracy: ", accuracy_score(y_train, y_pred))
print("Training Data F1 Score ", f1_score(y_train, y_pred))

# 5-fold cross-validated scores on the training split.
cv_f1_scores = cross_val_score(tree_clf, X_train_encoded, y_train, cv=5, scoring='f1_macro')
print("Validation Mean F1 Score: ", cv_f1_scores.mean())
cv_accuracy_scores = cross_val_score(tree_clf, X_train_encoded, y_train, cv=5, scoring='accuracy')
print("Validation Mean Accuracy: ", cv_accuracy_scores.mean())

In [None]:
# Score the baseline tree on the held-out split.
y_pred = tree_clf.predict(X_test_encoded)
print("Test Data Set Accuracy: ", accuracy_score(y_test,y_pred))
print("Test Data F1 Score ", f1_score(y_test,y_pred))


In [None]:

from sklearn.model_selection import cross_validate

# Sweep max_depth from 1 to 19 and record, per depth:
#   - accuracy / F1 on the training split,
#   - mean 5-fold CV accuracy / macro-F1 on the training split,
#   - accuracy / F1 on the held-out split,
#   - mean 5-fold CV accuracy / macro-F1 computed within the held-out split.
training_accuracy = []
val_accuracy = []
training_f1 = []
val_f1 = []
tree_depths = []
test_accuracy = []
test_val_accuracy = []
test_val_f1 = []
test_f1 = []

# Both metrics are scored from one cross-validation run, rather than running
# cross-validation a second time just to get the other metric.
cv_scoring = ('f1_macro', 'accuracy')

for depth in range(1, 20):
    tree_clf = DecisionTreeClassifier(max_depth=depth)
    tree_clf.fit(X_train_encoded, y_train)

    # Scores on the data the tree was fitted on.
    y_training_pred = tree_clf.predict(X_train_encoded)
    training_accuracy.append(accuracy_score(y_train, y_training_pred))
    training_f1.append(f1_score(y_train, y_training_pred))

    # Cross-validation clones the estimator, so tree_clf's own fit is untouched.
    train_cv = cross_validate(tree_clf, X_train_encoded, y_train, cv=5, scoring=cv_scoring)
    val_f1.append(train_cv['test_f1_macro'].mean())
    val_accuracy.append(train_cv['test_accuracy'].mean())

    # Scores of the fitted tree on the held-out split.
    y_test_pred_1 = tree_clf.predict(X_test_encoded)
    test_accuracy.append(accuracy_score(y_test, y_test_pred_1))
    test_f1.append(f1_score(y_test, y_test_pred_1))

    # NOTE(review): this cross-validates (refits) within the held-out split
    # itself; kept because later cells plot these series -- confirm intended.
    test_cv = cross_validate(tree_clf, X_test_encoded, y_test, cv=5, scoring=cv_scoring)
    test_val_f1.append(test_cv['test_f1_macro'].mean())
    test_val_accuracy.append(test_cv['test_accuracy'].mean())

    tree_depths.append(depth)


Tuning_Max_depth = {"Training Accuracy": training_accuracy, "Validation Accuracy": val_accuracy, "Training F1": training_f1, "Validation F1":val_f1, "Max_Depth": tree_depths ,"Test_val_f1":test_val_f1 , "Test_val_acc":test_val_accuracy , "Test_acc":test_accuracy , "Test_f1":test_f1 }
Tuning_Max_depth_df = pd.DataFrame.from_dict(Tuning_Max_depth)

# Long format: one row per (depth, metric) so each metric is drawn as a line.
plot_df = Tuning_Max_depth_df.melt('Max_Depth',var_name='Metrics',value_name="Values")
fig,ax = py.subplots(figsize=(15,5))
sns.pointplot(x="Max_Depth", y="Values",hue="Metrics", data=plot_df,ax=ax)


In [None]:
# Same sweep, restricted to the training and cross-validation series.
Tuning_Max_depth = {
    "Training Accuracy": training_accuracy,
    "Validation Accuracy": val_accuracy,
    "Training F1": training_f1,
    "Validation F1": val_f1,
    "Max_Depth": tree_depths,
}
Tuning_Max_depth_df = pd.DataFrame(Tuning_Max_depth)

plot_df = pd.melt(
    Tuning_Max_depth_df,
    id_vars='Max_Depth',
    var_name='Metrics',
    value_name="Values",
)
fig, ax = py.subplots(figsize=(15, 5))
sns.pointplot(data=plot_df, x="Max_Depth", y="Values", hue="Metrics", ax=ax)

In [None]:
# Same sweep, restricted to the series computed on the held-out split.
Tuning_Max_depth = {
    "Max_Depth": tree_depths,
    "Test_acc": test_accuracy,
    "Test_val_acc": test_val_accuracy,
    "Test_f1": test_f1,
    "Test_val_f1": test_val_f1,
}
Tuning_Max_depth_df = pd.DataFrame(Tuning_Max_depth)

plot_df = pd.melt(
    Tuning_Max_depth_df,
    id_vars='Max_Depth',
    var_name='Metrics',
    value_name="Values",
)
fig, ax = py.subplots(figsize=(15, 5))
sns.pointplot(data=plot_df, x="Max_Depth", y="Values", hue="Metrics", ax=ax)

In [None]:


# Refit the decision tree at the depth chosen for the final model.
tree_clf = DecisionTreeClassifier(max_depth=8)
tree_clf.fit(X_train_encoded, y_train)

# Scores on the data the tree was fitted on.
y_pred = tree_clf.predict(X_train_encoded)
print("Training Data Set Accuracy: ", accuracy_score(y_train, y_pred))
print("Training Data F1 Score ", f1_score(y_train, y_pred))

# 5-fold cross-validated scores on the training split.
cv_f1_scores = cross_val_score(tree_clf, X_train_encoded, y_train, cv=5, scoring='f1_macro')
print("Validation Mean F1 Score: ", cv_f1_scores.mean())
cv_accuracy_scores = cross_val_score(tree_clf, X_train_encoded, y_train, cv=5, scoring='accuracy')
print("Validation Mean Accuracy: ", cv_accuracy_scores.mean())

In [None]:
# Confusion matrix on the training split, with row/column totals.
pd.crosstab(y_train, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:

# Score the depth-limited tree on the held-out split.
y_pred = tree_clf.predict(X_test_encoded)
print("Test Data Set Accuracy: ", accuracy_score(y_test, y_pred))
print("Test Data F1 Score ", f1_score(y_test, y_pred))

# These cross-validate (refit) within the held-out split itself.
test_cv_f1_scores = cross_val_score(tree_clf, X_test_encoded, y_test, cv=5, scoring='f1_macro')
print("Validation Test Mean F1 Score: ", test_cv_f1_scores.mean())
test_cv_accuracy_scores = cross_val_score(tree_clf, X_test_encoded, y_test, cv=5, scoring='accuracy')
print("Validation Test Mean Accuracy: ", test_cv_accuracy_scores.mean())

# Confusion matrix on the held-out split, with row/column totals.
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
import sklearn.metrics as sk_metrics

# ROC curve of the decision tree on the held-out split.
# `plot_roc_curve` was removed in scikit-learn 1.2; its replacement is
# `RocCurveDisplay.from_estimator`. Prefer the new API when available and fall
# back to the old helper on older installations.
ax = py.gca()
if hasattr(sk_metrics.RocCurveDisplay, "from_estimator"):
    rfc = sk_metrics.RocCurveDisplay.from_estimator(
        tree_clf, X_test_encoded, y_test, ax=ax, alpha=0.8
    )
else:
    rfc = sk_metrics.plot_roc_curve(tree_clf, X_test_encoded, y_test, ax=ax, alpha=0.8)
py.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, a depth limit and a minimum leaf size.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=14,
    min_samples_leaf=10,
    random_state=42,
)
rf_clf.fit(X_train_encoded, y_train)

# Scores on the data the forest was fitted on.
y_pred = rf_clf.predict(X_train_encoded)
print("Train F1 Score ", f1_score(y_train, y_pred))
print("Train Accuracy ", accuracy_score(y_train, y_pred))



In [None]:

# Confusion matrix on the training split, with row/column totals.
pd.crosstab(y_train, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:

# Score the random forest on the held-out split.
y_pred = rf_clf.predict(X_test_encoded)

rf_test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: ", rf_test_accuracy)
rf_test_f1 = f1_score(y_test, y_pred)
print("Test F1 Score: ", rf_test_f1)

print("Confusion Matrix on Test Data")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
import sklearn.metrics as sk_metrics

# ROC curve of the random forest on the held-out split.
# `plot_roc_curve` was removed in scikit-learn 1.2; its replacement is
# `RocCurveDisplay.from_estimator`. Prefer the new API when available and fall
# back to the old helper on older installations.
ax = py.gca()
if hasattr(sk_metrics.RocCurveDisplay, "from_estimator"):
    rfc = sk_metrics.RocCurveDisplay.from_estimator(
        rf_clf, X_test_encoded, y_test, ax=ax, alpha=0.8
    )
else:
    rfc = sk_metrics.plot_roc_curve(rf_clf, X_test_encoded, y_test, ax=ax, alpha=0.8)
py.show()

In [None]:

# Load the separate evaluation feature file that predictions are produced for.
X_test_evaluation = pd.read_csv('../input/loan-dataset/test_x.csv')

In [None]:
# Inspect the columns, dtypes and non-null counts of the evaluation file.
X_test_evaluation.info()


In [None]:

# Drop the identifier and one-hot encode the non-numeric columns, dropping the
# first level of each (same get_dummies settings as the training matrix).
evaluation_features = X_test_evaluation.drop(columns="ID_Test")
X_test_evaluation_new = pd.get_dummies(evaluation_features, drop_first=True)
X_test_evaluation_new.info()

In [None]:
# Preview the first 10 rows of the encoded evaluation features.
X_test_evaluation_new.head(10)

In [None]:


# Put the evaluation features in the same order, and under the same names, as
# the matrix the models were fitted on. Training ends with the dummy column
# 'Age_1.0', whereas this frame carries the raw 'Age' column; newer
# scikit-learn versions reject a frame whose feature names differ from those
# seen in fit. The rename runs before the selection so re-running this cell
# is harmless.
# NOTE(review): assumes 'Age' here is already a 0/1 indicator, so its values
# equal the 'Age_1.0' dummy -- confirm against the raw evaluation file.
# NOTE(review): no imputation is applied to the evaluation data -- confirm it
# has no missing values.
X_test_evaluation_new = X_test_evaluation_new.rename(columns={'Age': 'Age_1.0'})[
    ['Expense', 'Income', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5',
     'Loan type_B', 'Occupation type_Y', 'Occupation type_Z', 'Age_1.0']
]

X_test_evaluation_new.info()

In [None]:
# Preview the evaluation features after column selection.
X_test_evaluation_new.head()

In [None]:

# Predict labels for the evaluation features with the fitted random forest.
pred_y_new =rf_clf.predict(X_test_evaluation_new)




In [None]:

# Pair every evaluation ID with its predicted label.
ID_column = X_test_evaluation[["ID_Test"]]
pred_y = ID_column.assign(Label_Test=pred_y_new)



In [None]:

# Write the predictions to disk. The frame already carries the ID_Test column,
# so the positional index is left out; otherwise it is written as an extra,
# unnamed first column.
pred_y.to_csv('pred_y.csv', index=False)

In [None]:

from sklearn.decomposition import PCA

# Keep as many principal components as needed to explain 99% of the variance.
pca = PCA(0.99)
pr_comp = pca.fit_transform(X_standard)

# The number of retained components depends on the data, so derive the column
# names from the result instead of hard-coding nine names, which raises as
# soon as the count differs.
pr_comp_names = [f'Principal_Comp_{i}' for i in range(1, pr_comp.shape[1] + 1)]
pr_df = pd.DataFrame(data=pr_comp, columns=pr_comp_names)
pr_df.info()

In [None]:


# Table pairing each component name with its explained-variance ratio.
principal_components = [
    'Principal_Comp_1', 'Principal_Comp_2', 'Principal_Comp_3',
    'Principal_Comp_4', 'Principal_Comp_5', 'Principal_Comp_6',
    'Principal_comp_7', 'Principal_comp_8', 'Principal_comp_9',
]
principal_information_percent = pd.DataFrame(principal_components)
# Rows are matched on the positional index of the two frames.
principal_information_percent['percent variation captured'] = pd.DataFrame(data = pca.explained_variance_ratio_)

principal_information_percent

In [None]:

# Print the variance explained by each retained component (not the ratio).
print(pca.explained_variance_)

In [None]:

# Print the fitted PCA's components_ matrix.
print(pca.components_)

In [None]:

# Keep as many principal components as needed to explain 95% of the variance.
pca2 = PCA(0.95)
pr_comp2 = pca2.fit_transform(X_standard)

# The number of retained components depends on the data, so derive the column
# names from the result instead of hard-coding six names. This also fixes the
# misspelled third name ('New Princi_Comp_3').
pr_comp2_names = [f'New Principal_Comp_{i}' for i in range(1, pr_comp2.shape[1] + 1)]
pr_df2 = pd.DataFrame(data=pr_comp2, columns=pr_comp2_names)
pr_df2.info()

In [None]:


# Table pairing each component name with its explained-variance ratio.
principal_components_2 = [
    'New Principal_Comp_1', 'New Principal_Comp_2', 'New Principal_Comp_3',
    'New Principal_Comp_4', 'New Principal_Comp_5', 'New Principal_Comp_6',
]
principal_information_percent_2 = pd.DataFrame(principal_components_2)
# Rows are matched on the positional index of the two frames.
principal_information_percent_2['percent variation captured'] = pd.DataFrame(data = pca2.explained_variance_ratio_)

principal_information_percent_2

In [None]:

from sklearn.feature_selection import mutual_info_classif

# Mutual information between each standardised feature and the label,
# with a fixed seed, followed by its mean and spread across features.
mi = mutual_info_classif(X_standard, y_new, random_state=42)
print(mi)

mi_mean = np.mean(mi)
mi_std = np.std(mi)
print("Mean value of MI = ", mi_mean)
print("Standard deviation of MI is =", mi_std)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Feature importances from an extra-trees ensemble fitted on all of the
# standardised data, followed by their mean and spread across features.
tree_model = ExtraTreesClassifier(random_state=42)
tree_model.fit(X_standard, y_new)

importance_list = tree_model.feature_importances_
print(importance_list)

importance_mean = np.mean(importance_list)
importance_std = np.std(importance_list)
print("Mean value of importance = ", importance_mean)
print("Standard deviation of importance is =", importance_std)
