In [66]:
import os #paths to file
import numpy as np # linear algebra
import pandas as pd # data processing
import warnings# warning filter


#ploting libraries
import matplotlib.pyplot as plt 
import seaborn as sns

#relevant ML libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#ML models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#warning hadle
warnings.filterwarnings("ignore")

In [67]:
#path for the training set
tr_path = "train.csv"
#path for the testing set
te_path = "test.csv"

In [68]:
# read in csv file as a DataFrame
tr_data = pd.read_csv(tr_path)
# explore the first 5 rows
#tr_data.head()

# read in csv file as a DataFrame
te_data = pd.read_csv(te_path)
# explore the first 5 rows
te_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [69]:
#column information
#tr_data.info(verbose=True, null_counts=True)
tr_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [70]:
#the Id column is not needed, let's drop it for both test and train datasets
tr_data.drop('Loan_ID',  axis=1,inplace=True)
te_data.drop('Loan_ID', axis=1,inplace=True)

tr_data.drop('Loan_Status',  axis=1,inplace=True)


In [71]:
#missing values in decsending order
#tr_data.isnull().sum().sort_values(ascending=False)


In [72]:
#filling the missing data
null_cols = ['Credit_History', 'Self_Employed', 'LoanAmount','Dependents', 'Loan_Amount_Term', 'Gender', 'Married']


for col in null_cols:
    # fill in NaN with mode in train data
    tr_data[col] = tr_data[col].fillna(
    tr_data[col].dropna().mode().values[0] )
    # fill in NaN with mode in test data
    te_data[col] = te_data[col].fillna(
    te_data[col].dropna().mode().values[0])

    
#tr_data.isnull().sum().sort_values(ascending=False)
#te_data.isnull().sum().sort_values()


print("After filling missing values\n\n","#"*50,"\n")
#for col in null_cols:
    #print(f"\n{col}:\n{tr_data[col].value_counts()}\n","-"*50)


In [73]:
features = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

X_train = tr_data[features]
y_train = tr_data['LoanAmount']
X_test = te_data[features]
y_test = te_data['LoanAmount']

In [74]:
from sklearn.preprocessing import OrdinalEncoder

col_with_categorical = [col for col in X_train.columns
                       if X_train[col].dtype == 'object']
label_tr_data = tr_data.copy()
label_te_data = te_data.copy()

my_ordinal = OrdinalEncoder()

label_tr_data[col_with_categorical] = pd.DataFrame(my_ordinal.fit_transform(X_train[col_with_categorical]))
label_te_data[col_with_categorical] = pd.DataFrame(my_ordinal.transform(X_test[col_with_categorical]))


In [75]:
#Decison tree
Model = DecisionTreeClassifier()
Model.fit(label_tr_data, y_train)

y_predict = Model.predict(label_te_data)

#  prediction Summary by species
print(classification_report(y_test, y_predict))

# Accuracy score
DT_SC = accuracy_score(y_predict,y_test)
print(f"{round(DT_SC*100,2)}% Accurate")


#save predictions
Decision_Tree=pd.DataFrame({'y_test':y_test,'prediction':y_predict})
Decision_Tree.to_csv("Decision Tree.csv") 

              precision    recall  f1-score   support

        26.0       0.00      0.00      0.00         0
        28.0       0.00      0.00      0.00         2
        30.0       0.00      0.00      0.00         1
        35.0       0.00      0.00      0.00         1
        40.0       1.00      0.67      0.80         3
        42.0       0.00      0.00      0.00         0
        45.0       0.00      0.00      0.00         0
        46.0       0.00      0.00      0.00         1
        47.0       0.00      0.00      0.00         0
        48.0       0.00      0.00      0.00         1
        49.0       0.00      0.00      0.00         1
        50.0       1.00      1.00      1.00         2
        55.0       1.00      1.00      1.00         1
        56.0       0.00      0.00      0.00         0
        57.0       0.00      0.00      0.00         1
        59.0       1.00      1.00      1.00         1
        60.0       0.33      1.00      0.50         1
        61.0       0.00    

In [76]:
# Random Forest
RF = RandomForestClassifier()
RF.fit(label_tr_data, y_train)

y_predict = RF.predict(label_te_data)

#  prediction Summary by species
print(classification_report(y_test, y_predict))

# Accuracy score
RF_SC = accuracy_score(y_predict,y_test)
print(f"{round(RF_SC*100,2)}% Accurate")

              precision    recall  f1-score   support

        25.0       0.00      0.00      0.00         0
        28.0       0.00      0.00      0.00         2
        30.0       0.00      0.00      0.00         1
        35.0       0.00      0.00      0.00         1
        40.0       1.00      0.33      0.50         3
        42.0       0.00      0.00      0.00         0
        46.0       0.00      0.00      0.00         1
        48.0       0.00      0.00      0.00         1
        49.0       0.00      0.00      0.00         1
        50.0       0.00      0.00      0.00         2
        53.0       0.00      0.00      0.00         0
        55.0       0.00      0.00      0.00         1
        57.0       0.00      0.00      0.00         1
        59.0       0.00      0.00      0.00         1
        60.0       0.00      0.00      0.00         1
        61.0       0.00      0.00      0.00         1
        64.0       0.00      0.00      0.00         2
        65.0       0.00    

In [77]:
#XGBoost

XGB = XGBClassifier()
XGB.fit(label_tr_data, y_train)

y_predict = XGB.predict(label_te_data)

#  prediction Summary by species
print(classification_report(y_test, y_predict))

# Accuracy score
XGB_SC = accuracy_score(y_predict,y_test)
print(f"{round(XGB_SC*100,2)}% Accurate")

              precision    recall  f1-score   support

        28.0       0.00      0.00      0.00         2
        30.0       0.33      1.00      0.50         1
        35.0       0.00      0.00      0.00         1
        40.0       1.00      1.00      1.00         3
        42.0       0.00      0.00      0.00         0
        45.0       0.00      0.00      0.00         0
        46.0       0.00      0.00      0.00         1
        48.0       0.00      0.00      0.00         1
        49.0       0.00      0.00      0.00         1
        50.0       0.50      1.00      0.67         2
        55.0       1.00      1.00      1.00         1
        57.0       0.00      0.00      0.00         1
        59.0       0.00      0.00      0.00         1
        60.0       0.50      1.00      0.67         1
        61.0       0.00      0.00      0.00         1
        64.0       0.00      0.00      0.00         2
        65.0       0.00      0.00      0.00         4
        66.0       0.20    

In [80]:
#score = [DT_SC,RF_SC,XGB_SC]
#Models = pd.DataFrame({'n_neighbors': ["Decision Tree","Random Forest","XGBoost"],'Score': score})
#Models.sort_values(by='Score', ascending=False)