In [0]:
!pip install rfpimp

In [0]:
!wget https://www.mlgraz.at/course_files/week4/loan.csv

In [0]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from  sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from  sklearn.metrics import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn import model_selection
import plotly.express as px

In [0]:
import rfpimp

In [0]:
# Load the downloaded loan.csv from the working directory into a DataFrame.
df = pd.read_csv("loan.csv")

# EDA

In [0]:
# Preview the first rows of the raw data.
df.head()

**How many rows are there?**

In [0]:
# (number of rows, number of columns)
df.shape

**How many Loans are there in total?**

In [0]:
# Number of distinct loan identifiers.
df["Loan_ID"].nunique()

**How many loans are accepted vs not?**

In [0]:
# Bar chart with Loan_Status on the x axis (no explicit y column is passed).
px.bar(data_frame=df, x="Loan_Status")

## Relationship with Loan Status

Relationship between Loan Status and Applicant Income / Gender

In [0]:
# Distribution of ApplicantIncome, coloured by Loan_Status.
px.histogram(df, x="ApplicantIncome", color="Loan_Status")

In [0]:
# Non-null Loan_ID count for every (Gender, Loan_Status) combination.
df.groupby(["Gender", "Loan_Status"])["Loan_ID"].count()

# Dealing with Missing Values

**Impute Missing Values (except Column LoanAmount) with the most frequent Values**

In [0]:
# Columns whose missing entries are replaced by the column's most frequent value.
categorical_columns_missing = [
    "Gender",
    "Married",
    "Dependents",
    "Credit_History",
    "Self_Employed",
    "Loan_Amount_Term",
]
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
impute_missing = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split further down; move it into a pipeline if leakage matters.
# NOTE(review): the list mixes text and numeric columns, so the transformed
# array is presumably object-typed and the numeric columns may lose their
# numeric dtype after this assignment; confirm with df.dtypes.
df[categorical_columns_missing] = impute_missing.fit_transform(df[categorical_columns_missing])

**Impute LoanAmount with the mean**

In [0]:
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
impute_missing = SimpleImputer(missing_values=np.nan, strategy="mean")
# fit_transform returns a 2-D (n_rows, 1) array; flatten it so a plain 1-D
# vector is assigned to the single LoanAmount column.
df["LoanAmount"] = impute_missing.fit_transform(df[["LoanAmount"]]).ravel()

# Encoding Categorical Data

In [0]:
# Columns that are cast to the pandas `category` dtype in the next cell.
categorical_columns = [
    "Gender",
    "Married",
    "Dependents",
    "Education",
    "Self_Employed",
    "Loan_Status",
    "Credit_History",
    "Property_Area",
]

**Set Datatype of Ordinal/Nominal Variables to Category**

In [0]:
# Cast every listed column to the `category` dtype in a single astype call.
df = df.astype(dict.fromkeys(categorical_columns, "category"))

**Drop Loan ID**

In [0]:
# Remove the Loan_ID column from the frame.
df = df.drop(columns=["Loan_ID"])

**Create One Hot Encoding Features for the Categorical Variables**

In [0]:
# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column, so a k-level column yields k-1 indicator columns.
# No `columns=` is given, so pandas chooses which columns to encode by dtype.
df = pd.get_dummies(df, drop_first=True)

In [0]:
# Inspect the frame after one-hot encoding.
df.head()

**Rename Last Column to Loan**

In [0]:
# Keep every column label except the last one, which becomes "Loan".
df.columns = [*df.columns[:-1], "Loan"]

# Split Input and Output

In [0]:
# Features: every column but the last. Target: the last column, kept as a
# one-column DataFrame (slice -1:) rather than a Series.
X = df.iloc[:, :-1]
y = df.iloc[:, -1:]

# Decision Tree

In [0]:
# Fixed seed (same value as the train/test split) so the fitted tree is
# identical on every Restart & Run All.
model = DecisionTreeClassifier(random_state=42)

In [0]:
# Fit on the complete X / y; no rows are held out at this point.
model.fit(X,y)

In [0]:
# Draw the fitted tree on a single 4x4 inch, 300 dpi axes and save it to disk.
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
tree.plot_tree(model, ax=ax)
fig.savefig("tree.png")

In [0]:
# Score on the very same X / y the model was fitted on (training score).
model.score(X,y)

# Train Test Split


In [0]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [0]:
# Fresh tree fitted on the training split only. The seed makes the fitted
# tree, and therefore the metrics below, reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluation

In [0]:
# Predictions for the held-out test rows.
y_pred = model.predict(X_test)


In [0]:
# Accuracy of the test-set predictions.
accuracy_score(y_test, y_pred)

In [0]:
# Precision of the test-set predictions.
precision_score(y_test, y_pred)

In [0]:
# Recall of the test-set predictions.
recall_score(y_test, y_pred)

In [0]:
# F1 score of the test-set predictions.
f1_score(y_test, y_pred)

# K-Folds Cross Validation

In [0]:
# 3-fold cross-validation of a decision tree on the full X / y.
# The seed keeps the per-fold trees, and so the scores, reproducible.
model = DecisionTreeClassifier(random_state=42)
# NOTE(review): 'average_precision' is a different metric from the
# precision_score used in the Evaluation section; if plain precision was
# intended, use 'precision' here AND in the random-forest cell, because the
# report cell looks up the same score keys in both result dicts.
scores = cross_validate(
    model,
    X,
    y,
    cv=3,
    scoring=('accuracy', 'average_precision', 'recall', 'f1'),
)

In [0]:
# Cross-validation results for the decision tree.
scores

# Random Forest

In [0]:
# 3-fold cross-validation of a random forest with the same scorers as the
# decision tree. The forest is randomised, so a fixed seed is needed for the
# comparison to be reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc_scores = cross_validate(
    rfc,
    X,
    y.values.ravel(),  # flatten the one-column target frame to a 1-D array
    cv=3,
    scoring=('accuracy', 'average_precision', 'recall', 'f1'),
)

In [0]:
# Cross-validation results for the random forest.
rfc_scores

In [0]:
# Skip the first two entries of the result dict and keep the remaining keys.
metric_keys = list(rfc_scores)[2:]

# Mean score per key for each model, one row per key.
report = pd.DataFrame(
    {
        'Decision Tree': [np.mean(scores[name]) for name in metric_keys],
        'Random Forest': [np.mean(rfc_scores[name]) for name in metric_keys],
    },
    index=metric_keys,
)

# Difference between the two models, then everything scaled by 100 and
# rounded to one decimal.
report['Improvment'] = report['Random Forest'] - report['Decision Tree']
report = (report * 100).astype(float).round(1)

In [0]:
# Side-by-side comparison table built in the previous cell.
report

# Feature Importance

In [0]:
# Fit a seeded forest on the training split so the fitted model is the same
# on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train.values.ravel())  # 1-D target array
# Importances are computed by rfpimp on the held-out test split, then plotted.
importance = rfpimp.importances(rfc, X_test, y_test)
rfpimp.plot_importances(importance)

# Feature Engineering