# Unsupervised Clustering Model
A notebook that predicts loan grade for Problem 2 using unsupervised learning and K-means square

In [9]:
#Imports

#Data
import pandas as pd

#ML

#General
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#knn
from sklearn.cluster import KMeans

In [10]:
#Getting the csv file's name with the wrangled data
data_file='wrang_xyz_data.csv'

#Splitting the data into different categories that make sense
loan_data=['purpose','initial_list_status','term','loan_amnt']
emp_data=['emp_length','collections_12_mths_ex_med','acc_now_delinq','home_ownership','annual_inc','verification_status','delinq_2yrs','inq_last_6mths','open_acc','pub_rec','total_acc','earliest_cr_line','dti','tot_cur_bal','tot_coll_amt'] #address

#The following inputs are left out as they are only useful for problem 1.
#out=['last_pymnt_d','last_credit_pull_d','recoveries','collection_recovery_fee','last_pymnt_amnt','total_pymnt','total_rec_int','int_rate','out_prncp',''total_rec_late_fee','default_ind']

#Getting the features that will be included in the model
features=loan_data+emp_data

In [13]:
#Clustering unsupervised model predicting loan grade for problem 2
#The model's inputs are the data_file which should be set equal to the wrangled data file and the value to be predicted 
#(pred_value) which in this case is the 'grade'. It could also be the 'sub_grade'

def get_model_unsup(data_file='wrang_xyz_data.csv',pred_value=['grade']):
    #Imporitng the wrangled csv file and including the useful columns for it
  
    df = pd.read_csv('data/'+data_file,usecols=features+pred_value) #int_rate

    X=df.drop('grade',axis=1)

    X_scale=X.drop(['purpose','verification_status','home_ownership','initial_list_status','term'],axis=1)
    X_non_scale=X[['purpose','verification_status','home_ownership','initial_list_status','term']]

    #Scaling the data
    scaler = StandardScaler()
    scaler.fit(X_scale)
    X_scale = pd.DataFrame(scaler.transform(X_scale),index=X.index,columns=X_scale.columns)
    X=X_scale.join(X_non_scale)

    #One-hot Encoding
    ohe_cols=['purpose','verification_status','home_ownership','initial_list_status','term'] #address
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(df[ohe_cols])
    X_enc = pd.DataFrame(ohe.transform(X[ohe_cols]).toarray(),index=X.index)
    X=X.join(X_enc).drop(ohe_cols,axis=1)
    X.columns = X.columns.map(str)

    #Using the K-Means-Square algorith for clustering
    model_unsup = KMeans(n_clusters=7)
    model_unsup.fit(X)

    #Creating a function assigning a grade to a label
    def converter(cluster):
        clust={'A':0,'B':1,'C':2,'D':3,'E':4,'F':5,'G':6}
        return clust[cluster]
        
    df['Cluster'] = df['grade'].apply(converter)

    #Printing useful metrics
    print(confusion_matrix(df['Cluster'],model_unsup.labels_))
    print(classification_report(df['Cluster'],model_unsup.labels_))

    return model_unsup
#Running the function 
model_unsup=get_model_unsup(pred_value=['grade'])

[[43038 24954 41068   136   999 26849  8616]
 [68938 40821 71586   924  3139 33573 29015]
 [61216 41689 66646  1324  3445 32244 30286]
 [33273 25755 36678   855  2000 17737 16499]
 [15803 14228 16349   477  1011 11091  7487]
 [ 4906  4729  5156   160   297  3790  2290]
 [ 1017  1164  1121    52    74   959   486]]
              precision    recall  f1-score   support

           0       0.19      0.30      0.23    145660
           1       0.27      0.16      0.20    247996
           2       0.28      0.28      0.28    236850
           3       0.22      0.01      0.01    132797
           4       0.09      0.02      0.03     66446
           5       0.03      0.18      0.05     21328
           6       0.01      0.10      0.01      4873

    accuracy                           0.18    855950
   macro avg       0.15      0.15      0.12    855950
weighted avg       0.23      0.18      0.18    855950

