In [1447]:
import numpy as np # linear algebra -for numeric computations
import pandas as pd # data processing -to store data as dataframes 
import matplotlib.pyplot as plt # data visualization 
%matplotlib inline
import seaborn as sns # data visualization 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [1448]:
# Read the Prosper loan CSV into a DataFrame and preview its first seven rows.
data = pd.read_csv('../input/prosper-loan/prosperLoanData.csv')
data.head(7)

In [1449]:
# Convert each date-like column from its raw CSV representation to datetime.
date_columns = [
    'ListingCreationDate',
    'ClosedDate',
    'DateCreditPulled',
    'FirstRecordedCreditLine',
    'LoanOriginationDate',
]
for date_column in date_columns:
    data[date_column] = pd.to_datetime(data[date_column])


In [1450]:
# Keep only the features that are populated in at least 25% of the rows
# (i.e. drop columns where more than 75% of the values are missing).
min_non_null_per_column = data.shape[0] * 0.25
data_with_less_missing_values = data.dropna(axis=1, thresh=min_non_null_per_column)
data_with_less_missing_values.shape

In [1451]:
# removing loan samples that have more than 20% of missing values
# The threshold is computed from the columns still present in
# data_with_less_missing_values (not the raw `data` column count), so a row
# is kept when at least 80% of the *remaining* features are populated.
min_non_null_per_row = data_with_less_missing_values.shape[1] * 0.80
data_with_less_missing_values = (
    data_with_less_missing_values
    .dropna(thresh=min_non_null_per_row, axis=0)
    .reset_index(drop=True)  # renumber rows 0..n-1 after the drop
)
data_with_less_missing_values.shape

In [1452]:
# Partition the columns by dtype: object/bool columns are treated as
# categorical, int64/float64/datetime64[ns] columns as numerical.
column_dtypes = data_with_less_missing_values.dtypes
cat_cols = [name for name, kind in column_dtypes.items() if kind in ["object", "bool"]]
numerical_cols = [
    name for name, kind in column_dtypes.items()
    if kind in ['int64', 'float64', 'datetime64[ns]']
]
# Each frame is obtained by dropping the other group's columns.
cat_data = data_with_less_missing_values.drop(columns=numerical_cols)
num_data = data_with_less_missing_values.drop(columns=cat_cols)


In [1453]:
# Replace missing occupations with the most frequent occupation value.
most_common_occupation = cat_data['Occupation'].mode().iloc[0]
cat_data['Occupation'] = cat_data['Occupation'].fillna(most_common_occupation)

In [1454]:
# Remove the alphabetic Prosper rating column from the categorical frame.
cat_data = cat_data.drop(columns='ProsperRating (Alpha)')

In [1455]:

from sklearn.preprocessing import OrdinalEncoder

# A missing CreditGrade is treated as the lowest rating: it is filled with the
# placeholder letter "Z", which sits first in the ordered list and therefore
# encodes to 0.
ratings = ['Z', 'HR', 'E', 'D', 'C', 'B', 'A', 'AA']
encoder = OrdinalEncoder(categories=[ratings])

cat_data['CreditGrade'] = cat_data['CreditGrade'].fillna("Z")
encoded_grades = encoder.fit_transform(cat_data[['CreditGrade']])
# fit_transform returns a 2-D float array; flatten it and store integer codes.
cat_data['CreditGrade'] = encoded_grades.ravel().astype(int)

In [1456]:
# Print how many distinct values each categorical column holds (NaN counts as
# one value). The counts are taken from cat_data itself, so they reflect the
# rows and encodings actually kept rather than the raw `data` frame.
for column in cat_data.columns:
    print(f'{column} : {cat_data[column].nunique(dropna=False)}')

In [1457]:
# Drop the three *Key columns (presumably per-record identifiers - confirm
# against the cardinality printout) from the categorical frame.
key_cols = ['LoanKey', 'MemberKey', 'ListingKey']
cat_data = cat_data.drop(columns=key_cols)

In [1458]:
# Re-print the distinct-value count of every remaining categorical column
# (NaN counts as one value), measured on cat_data itself rather than on the
# raw `data` frame.
for column in cat_data.columns:
    print(f'{column} : {cat_data[column].nunique(dropna=False)}')

In [1459]:
# Count the missing values left in each categorical column.
cat_data.isna().sum()

In [1460]:
# Count the missing values in each numerical column.
num_data.isna().sum()

In [1461]:
# Fill missing DebtToIncomeRatio values with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment and is not guaranteed to update
# num_data (it is a no-op under pandas Copy-on-Write).
num_data['DebtToIncomeRatio'] = num_data['DebtToIncomeRatio'].fillna(
    num_data['DebtToIncomeRatio'].mean()
)

In [1462]:
# Fill missing EmploymentStatusDuration values with the most frequent duration.
# (The empty plt.figure(...) that used to open this cell was dropped: nothing
# was ever drawn on it.)
most_common_duration = num_data['EmploymentStatusDuration'].mode().iloc[0]
num_data['EmploymentStatusDuration'] = num_data['EmploymentStatusDuration'].fillna(most_common_duration)

In [1463]:
# Mark loans without a ClosedDate with the sentinel string 'Not Closed'.
# Assigned back explicitly instead of chained fillna(inplace=True), which is
# not guaranteed to modify num_data (no-op under pandas Copy-on-Write).
# NOTE: mixing a string into the parsed date column turns it into an
# object-dtype column.
num_data['ClosedDate'] = num_data['ClosedDate'].fillna('Not Closed')

In [1464]:
# Re-check the per-column missing-value counts after the fills above.
num_data.isna().sum()

# added new

In [1465]:
# Fill the remaining gaps in these Prosper estimate/score columns with each
# column's median. Results are assigned back to num_data: chained
# fillna(inplace=True) on a selected column is not guaranteed to update the
# frame (it is a no-op under pandas Copy-on-Write).
median_filled_cols = [
    'EstimatedEffectiveYield',
    'EstimatedLoss',
    'EstimatedReturn',
    'ProsperRating (numeric)',
    'ProsperScore',
]
for median_col in median_filled_cols:
    num_data[median_col] = num_data[median_col].fillna(num_data[median_col].median())

In [1466]:
# Per-column missing-value counts after the median fills.
num_data.isna().sum()

In [1467]:
# Show the columns of num_data whose dtype is object.
num_data.select_dtypes(include=('object'))

In [1468]:
# Turn ClosedDate into a 0/1 flag: 1 when the loan has an actual close date,
# 0 when it is missing or carries the 'Not Closed' sentinel.
# Missing values are tested explicitly; filling them with 0 first would make
# them compare unequal to 'Not Closed' and be mislabelled as closed (1).
closed_date = num_data['ClosedDate']
has_close_date = closed_date.notna() & (closed_date != 'Not Closed')
num_data['ClosedDate'] = has_close_date.astype(int)

In [1469]:
# Display the recoded ClosedDate column.
num_data.ClosedDate

In [1470]:
# Recombine the numerical and categorical frames column-wise (joined on the row index).
modified_data = num_data.join(cat_data)

In [1471]:
# List the names of the boolean-dtype columns in the combined frame.
modified_data.select_dtypes(include=('bool')).columns

In [1472]:
# Recode the flag columns as integers: truthy values become 1, falsy values 0.
bools = ['IsBorrowerHomeowner', 'CurrentlyInGroup', 'IncomeVerifiable']
for flag_col in bools:
    modified_data[flag_col] = modified_data[flag_col].apply(lambda flag: int(bool(flag)))

In [1473]:
# Summarise the combined frame: column names, non-null counts and dtypes.
modified_data.info()

In [1474]:
# Separate the classification target (LoanStatus) from the predictors,
# then display the target column.
X = modified_data.drop(columns=['LoanStatus'])
y = modified_data.loc[:, 'LoanStatus']
y

In [1475]:
# Distinct values present in the target column.
y.unique()

In [1476]:
# Pull the object-dtype predictor columns into their own frame and display it.
cat_X = X.select_dtypes(include=('object'))
cat_X

In [1477]:
# Remove the object-dtype columns from X; they remain available in cat_X.
# NOTE(review): the label-encoding cell further down indexes X by column names
# such as "EmploymentStatus" - confirm those columns are still reachable there.
X = X.drop(columns=cat_X.columns)

# handling categorical columns in X

In [1478]:
# Report the number of distinct values (NaN included) in each column of cat_X.
for column in cat_X.columns:
    print(f'{column} : {cat_X[column].nunique(dropna=False)}')

In [1479]:
# Display the predictor frame as it stands after the object columns were removed.
X

In [1480]:
# Drop the datetime columns from the predictors.
date_cols_to_drop = [
    'ListingCreationDate',
    'DateCreditPulled',
    'FirstRecordedCreditLine',
    'LoanOriginationDate',
]
X = X.drop(columns=date_cols_to_drop)

In [1481]:
label_encoding_cols=["EmploymentStatus","Occupation", "BorrowerState", "IncomeRange", "LoanOriginationQuarter"]
for i in label_encoding_cols:
    X[i]=X[i].astype("category")
    X[i]=X[i].cat.codes

In [1482]:
X.join(cat_X)
X.info()

In [1483]:
# Feature Scaling of the column X
# Standardise every predictor with StandardScaler; the result is a NumPy
# array, not a DataFrame.
# NOTE(review): the scaler is fitted on all of X before the train/test split
# below, so test rows contribute to the scaling statistics.
sc = StandardScaler()
scaled_X = sc.fit_transform(X)

# DecisionTreeClassifier, Output variable is LoanStatus

In [1484]:
# splitting the dataset into Training Set and Test Set (80% / 20%)
# random_state is fixed so the split - and every score computed from it - is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=0)

In [1485]:
# Bring in the tree classifier used for the LoanStatus model.
from sklearn.tree import DecisionTreeClassifier

# Shallow Gini-impurity tree (depth capped at 3) with a fixed seed.
tree_settings = {'criterion': 'gini', 'max_depth': 3, 'random_state': 0}
model = DecisionTreeClassifier(**tree_settings)

# Train on the training split; the fitted estimator is the cell's displayed value.
model.fit(X_train, y_train)

In [1486]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

In [1487]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted class matches the true class.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Model accuracy score: {test_accuracy:0.4f}')

In [1488]:
# Score the model on the rows it was trained on, for comparison with the test score.
y_pred_train = model.predict(X_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Training-set accuracy score: {train_accuracy:0.4f}')

In [1489]:
# Regression setup: BorrowerRate is the target, every other column a predictor.
X = modified_data.drop(columns=["BorrowerRate"])
y = modified_data.loc[:, "BorrowerRate"]

In [1490]:
# Drop the datetime columns from the regression predictors.
date_cols_to_drop = [
    'ListingCreationDate',
    'DateCreditPulled',
    'FirstRecordedCreditLine',
    'LoanOriginationDate',
]
X = X.drop(columns=date_cols_to_drop)

In [1491]:
# Replace each listed text column with its integer category codes (missing -> -1).
label_encoding_cols=["EmploymentStatus","Occupation", "BorrowerState", "IncomeRange", "LoanOriginationQuarter", "LoanStatus"]
for encode_col in label_encoding_cols:
    as_category = X[encode_col].astype("category")
    X[encode_col] = as_category.cat.codes

In [1492]:
# Summarise the regression predictors: column names, non-null counts and dtypes.
X.info()

In [1493]:
# Feature Scaling of the column X
# Standardise the regression predictors with a fresh StandardScaler.
# NOTE(review): the train/test split for the regressor below is built from X,
# not scaled_X, so this scaled array is not used by that split.
sc = StandardScaler()
scaled_X = sc.fit_transform(X)

In [1494]:
# import DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

In [1495]:
# Hold out 20% of the rows (seeded), fit a small regression tree on the rest
# and predict BorrowerRate for the held-out rows.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# random_state pins the tree's tie-breaking between equally good splits, so the
# fitted tree (and the metrics below) are reproducible, matching the seeded
# classifier above.
model2 = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=5,
    max_leaf_nodes=10,
    random_state=0,
)
model2.fit(x_train, y_train)
y2_pred = model2.predict(x_test)

In [1496]:
# Report the regressor's error on the held-out rows.
mse = metrics.mean_squared_error(y_test, y2_pred)
mae = metrics.mean_absolute_error(y_test, y2_pred)
print(f"MSE: {mse:.9f}")
print(f"MAE: {mae:.9f}")