# Clarification

1) Before running any model, we pre-process the data set. The classes are severely imbalanced (the numbers of default and non-default records are very different), so we use the SMOTE method to balance them. In addition, we convert all categorical variables into dummy variables and standardize each variable (zero mean, unit variance). After pre-processing, we have 20000 samples (half default and half non-default) and 57 attributes for training.

2) For question 6, we are supposed to select at most 10 variables using the ridge method. We add a ridge penalty to the cost function and select the coefficients with the highest absolute values. However, the selected attribute set may include several dummy variables derived from the same original variable; for example, it can include both 'grade.B' and 'grade.C'. We regard such dummy variables as a single variable, and we enlarge the selected set until it covers ten full variables (such as 'grade' and 'purpose').

#  Question 1: Full Model

In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import StratifiedKFold
from imblearn.over_sampling import SMOTE

In [2]:
# Load the raw loans table and recode the target: 'Fail' -> 1, anything else
# (including missing values) -> 0.
loans = pd.read_csv('loans.csv')
# Vectorised comparison replaces the element-by-element append loop; int64
# matches the dtype pandas gives a column assigned from a list of Python ints.
loans.loan_status = (loans.loan_status == 'Fail').astype('int64')
# One-hot encode every categorical column, dropping the first level of each
# so the dummies are not perfectly collinear; names look like 'grade.B'.
loans_dummy = pd.get_dummies(loans, drop_first=True, prefix_sep=".")

In [3]:
# Full design matrix: every dummy-encoded column except the target,
# standardised, then oversampled with SMOTE.
X = loans_dummy.drop('loan_status', axis=1)
y = loans.loan_status
clf = StandardScaler()
X = clf.fit(X).transform(X)
# No random_state is given to SMOTE, so the synthetic rows differ per run.
X, y = SMOTE().fit_sample(X, y)

## Full Model: QDA

In [5]:
# Permute the rows of X and y in unison before cross-validating QDA.
# No random_state is passed, so the row order changes from run to run.
X, y = shuffle(X, y)

In [6]:
# Mean 10-fold cross-validation score of QDA on the full feature set.
model_qda = QuadraticDiscriminantAnalysis()
qda_fold_scores = cross_val_score(model_qda, X, y, cv=10)
qda_accuracy = np.mean(qda_fold_scores)
print("The accuracy of QDA model is " + str(qda_accuracy))

The accuracy of QDA model is 0.71405


## Full Model: Tree Classifier

In [7]:
# Re-permute the rows of X and y in unison before cross-validating the tree.
# No random_state is passed, so the row order changes from run to run.
X, y = shuffle(X, y)

In [8]:
# Mean 10-fold cross-validation score of a default decision tree on the
# full feature set.
model_tree = DecisionTreeClassifier()
tree_fold_scores = cross_val_score(model_tree, X, y, cv=10)
tree_accuracy = tree_fold_scores.mean()
print("The accuracy of Tree model is " + str(tree_accuracy))

The accuracy of Tree model is 0.9025500000000001


## Full Model: k-NN

In [9]:
# Re-permute the rows of X and y in unison before cross-validating k-NN.
# No random_state is passed, so the row order changes from run to run.
X, y = shuffle(X, y)

In [10]:
# Mean 10-fold cross-validation score of a 5-nearest-neighbour classifier
# on the full feature set.
model_knn = KNeighborsClassifier(n_neighbors=5)
knn_fold_scores = cross_val_score(model_knn, X, y, cv=10)
knn_accuracy = knn_fold_scores.mean()
print("The accuracy of 5-NN model is " + str(knn_accuracy))

The accuracy of 5-NN model is 0.7845500000000001


# Question 2: Reduced Model

## Attributes Selection

In [12]:
# Column subset used by the reduced models; 'loan_status' is carried along
# as the target and split off again after dummy encoding.
reduced_columns = [
    'annual_inc', 'delinq_2yrs', 'tot_cur_bal', 'total_rec_prncp',
    'verification_status', 'grade', 'max_bal_bc', 'installment',
    'total_rec_int', 'term', 'loan_status',
]
loans_reduced_balanced = loans.loc[:, reduced_columns]
balanced_reduced_dummy = pd.get_dummies(loans_reduced_balanced, drop_first=True, prefix_sep=".")
y = balanced_reduced_dummy.loan_status
X_reduced = balanced_reduced_dummy.drop('loan_status', axis=1)
# Standardise, then oversample with SMOTE (no random_state: the synthetic
# rows differ between runs).
clf = StandardScaler()
X_reduced = clf.fit(X_reduced).transform(X_reduced)
X_reduced, y = SMOTE().fit_sample(X_reduced, y)

## Reduced Model: Logistic Regression

In [13]:
# Permute the rows of X_reduced and y in unison before cross-validating the
# logistic regression. No random_state is passed, so the order changes from
# run to run.
X_reduced, y = shuffle(X_reduced, y)

In [14]:
# Mean 10-fold cross-validation score of a default logistic regression on
# the reduced feature set.
model_log_reduced = LogisticRegression()
log_reduced_accuracy = np.mean(cross_val_score(model_log_reduced, X_reduced, y, cv=10))
# The label used to read "Tree reduced model" (copy-paste slip); this cell
# evaluates the logistic regression, so the message now says so.
print("The accuracy of Logistic Regression reduced model is " + str(log_reduced_accuracy))

The accuracy of Tree reduced model is 0.651


## Reduced Model: Tree Model

In [15]:
# Re-permute the rows of X_reduced and y in unison before cross-validating
# the tree. No random_state is passed, so the order changes from run to run.
X_reduced, y = shuffle(X_reduced, y)

In [16]:
# Mean 10-fold cross-validation score of a default decision tree on the
# reduced feature set.
model_tree_reduced = DecisionTreeClassifier()
tree_reduced_fold_scores = cross_val_score(model_tree_reduced, X_reduced, y, cv=10)
tree_reduced_accuracy = tree_reduced_fold_scores.mean()
print("The accuracy of Tree reduced model is " + str(tree_reduced_accuracy))

The accuracy of Tree reduced model is 0.79925


## Reduced Model: K-NN

In [17]:
# Re-permute the rows of X_reduced and y in unison before cross-validating
# k-NN. No random_state is passed, so the order changes from run to run.
X_reduced, y = shuffle(X_reduced, y)

In [18]:
# Mean 10-fold cross-validation score of a 5-nearest-neighbour classifier
# on the reduced feature set.
model_knn_reduced = KNeighborsClassifier(n_neighbors=5)
knn_reduced_fold_scores = cross_val_score(model_knn_reduced, X_reduced, y, cv=10)
knn_reduced_accuracy = knn_reduced_fold_scores.mean()
print("The accuracy of 5-NN reduced model is " + str(knn_reduced_accuracy))

The accuracy of 5-NN reduced model is 0.8217000000000001


# Question 5: Ridge-reduced model

In [19]:
# Rebuild the full design matrix for the ridge experiments: all dummy-encoded
# columns except the target, standardised, then oversampled with SMOTE.
X_ridge = loans_dummy.drop('loan_status', axis=1)
y = loans.loan_status
clf = StandardScaler()
X_ridge = clf.fit(X_ridge).transform(X_ridge)
# No random_state is given to SMOTE, so the synthetic rows differ per run.
X_ridge, y = SMOTE().fit_sample(X_ridge, y)

In [None]:
# Grid-search the C parameter of an L2-penalised logistic regression with
# 5-fold cross-validation. Candidates are 10**e for e = -4, -3.9, ... (step
# 0.1, stopping before 1).
params = {
    'C': 10 ** np.arange(-4, 1, 0.1),
    'penalty': ['l2'],
}
log = LogisticRegression()
reg_cv = GridSearchCV(estimator=log, param_grid=params, cv=5)
reg_cv.fit(X_ridge, y)

In [22]:
# Stratified k-fold estimate of the "ridge-reduced" model: within each fold,
# fit an L2-penalised logistic regression on the training rows, keep the ten
# columns with the largest absolute coefficients, refit a default logistic
# regression on just those columns, and score it on the held-out rows.
folds = 10
# NOTE(review): this is the legacy ``sklearn.cross_validation`` call style
# (labels passed to the constructor, object iterated directly). It is kept
# because the file-level import comes from that module; the
# ``sklearn.model_selection`` class takes ``n_splits`` and a ``.split(X, y)``
# call instead.
split = StratifiedKFold(y, n_folds=folds, shuffle=True, random_state=0)
scores = []
for train, test in split:
    X_train, X_test = X_ridge[train], X_ridge[test]
    y_train, y_test = y[train], y[test]
    # C=4 is hard-coded; presumably the grid-search optimum - TODO confirm.
    ridge_clf = LogisticRegression(penalty='l2', C=4)
    ridge_clf.fit(X_train, y_train)
    # One row per feature holding |coefficient|; the row index is the column
    # position in X_ridge.
    betas = pd.DataFrame({'beta': np.abs(ridge_clf.coef_[0])})
    ridge_best_10 = betas.nlargest(10, 'beta').index
    logistic_clf = LogisticRegression()
    logistic_clf.fit(X_train[:, ridge_best_10], y_train)
    scores.append(logistic_clf.score(X_test[:, ridge_best_10], y_test))
# Average over the actual number of folds rather than a hard-coded 10, so the
# result stays correct if ``folds`` is changed.
print("The accuracy of ridge reduced model is " + str(sum(scores) / folds))

The accuracy of ridge reduced model is 0.7465499999999999


# Question 6

In [23]:
# Question 6 starts from a fresh copy of the full design matrix: all
# dummy-encoded columns except the target, standardised, then oversampled.
X_ridge = loans_dummy.drop('loan_status', axis=1)
y = loans.loan_status
clf = StandardScaler()
X_ridge = clf.fit(X_ridge).transform(X_ridge)
# A new SMOTE draw (no random_state), so the synthetic rows are not the same
# ones used in earlier cells.
X_ridge, y = SMOTE().fit_sample(X_ridge, y)

In [None]:
# Fit one L2-penalised logistic regression on all rows; its coefficients are
# what the next cell ranks to pick variables.
ridge_settings = {'penalty': 'l2', 'C': 4}
log_ridge_reduced2 = LogisticRegression(**ridge_settings)
log_ridge_reduced2.fit(X_ridge, y)

In [25]:
# Rank features by absolute ridge coefficient and keep the 18 largest.
# ``sig_variables`` lists their column positions from largest to smallest
# |coef|; ``index_list`` holds the same positions in ascending column order.
variable_coef = np.abs(log_ridge_reduced2.coef_[0])
sig_variables = variable_coef.argsort()[-18:][::-1].tolist()
# Zero every coefficient outside the top 18 with a boolean mask instead of a
# Python loop that tested list membership for each index.
keep_mask = np.zeros(variable_coef.shape, dtype=bool)
keep_mask[sig_variables] = True
variable_coef[~keep_mask] = 0
# Positions whose coefficient is still non-zero. As before, a top-18 entry
# that is exactly 0 would be left out.
index_list = np.flatnonzero(variable_coef).tolist()

In [26]:
# Keep only the selected columns, shuffle rows and labels together, and
# report the mean 10-fold cross-validation score of a default logistic
# regression on them.
X_ridge_10, y1 = shuffle(X_ridge[:, index_list], y)
model_reduced_ridge = LogisticRegression()
reduced_ridge_fold_scores = cross_val_score(model_reduced_ridge, X_ridge_10, y1, cv=10)
reduced_ridge_accuracy = reduced_ridge_fold_scores.mean()
print("The accuracy of ridge reduced model is " + str(reduced_ridge_accuracy))

The accuracy of ridge reduced model is 0.75595


# Question 10

In [80]:
# Question 10 rebuilds the full design matrix: all dummy-encoded columns
# except the target, standardised, then oversampled with SMOTE.
X = loans_dummy.drop('loan_status', axis=1)
y = loans.loan_status
clf = StandardScaler()
X = clf.fit(X).transform(X)
# No random_state is given to SMOTE, so the synthetic rows differ per run.
X, y = SMOTE().fit_sample(X, y)

In [None]:
# Be careful, very slow: 5 x 4 parameter combinations, each evaluated with
# 5-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier
params = dict(max_features=range(50, 55), max_depth=range(18, 22))
rf_1 = RandomForestClassifier()
rf_cv = GridSearchCV(estimator=rf_1, param_grid=params, cv=5)
rf_cv.fit(X, y)

In [89]:
# Mean 10-fold cross-validation score of a random forest built with the
# best max_features / max_depth found by the grid search (``rf_cv``).
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV exposes the winning settings as ``best_params_``; the former
# ``best_param_`` does not exist and raised AttributeError.
best_rf_params = rf_cv.best_params_
rf = RandomForestClassifier(max_features=best_rf_params['max_features'],
                            max_depth=best_rf_params['max_depth'])
# NOTE(review): this reuses the name ``tree_accuracy`` and the "Tree model"
# label from the single-tree cell, although the estimator is a random forest.
tree_accuracy = np.mean(cross_val_score(rf, X, y, cv=10))
print("The accuracy of Tree model is " + str(tree_accuracy))

The accuracy of Tree model is 0.9122999999999999
