<a href="https://colab.research.google.com/github/moridin04/CCMACLRL_EXERCISES_COM221ML/blob/main/Exercise9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 9: Choosing the best performing model on a dataset

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
- Use all classification models

Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e10/overview



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

## Dataset File

In [None]:
# Raw-CSV URL of the loan-approval training data; the parsed frame is kept in `df`.
dataset_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/train.csv?raw=true'
df = pd.read_csv(dataset_url)

## Test File

In [None]:
# Raw-CSV URL of the test data; the parsed frame is kept in `dt`.
test_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/test.csv?raw=true'
dt=pd.read_csv(test_url)

In [None]:
# Column names, dtypes and non-null counts of the test frame.
dt.info()

## Sample Submission File

In [None]:
# Raw-CSV URL of the sample submission; `sf` is read later for the `id` column of the submission file.
sample_submission_url ='https://github.com/robitussin/CCMACLRL_EXERCISES/blob/main/datasets/loan_approval/sample_submission.csv?raw=true'

sf=pd.read_csv(sample_submission_url)

In [None]:
# Column names, dtypes and non-null counts of the sample-submission frame.
sf.info()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Row count per value of the target column (class balance).
df['loan_status'].value_counts()

In [None]:
# Integer codes for each categorical training column.
# The codes already in use (RENT/OWN, the five intents, grades A-E, Y/N) are
# unchanged. The added keys cover the remaining categories so those rows keep
# their own code instead of becoming NaN and later being overwritten by the
# column mode; they use the same codes as the test-set encoding further down.
# NOTE(review): MORTGAGE/OTHER ownership and grade G are taken from the
# competition's data description — confirm with df[col].unique().
category_codes = {
    'person_home_ownership': {"RENT" : 1 , "OWN" : 0, "MORTGAGE" : 2, "OTHER" : 3},
    'loan_intent': {"PERSONAL" : 1 , "MORTGAGE" : 2 , "MEDICAL" : 3, "VENTURE" : 4, "EDUCATION" : 5,
                    "HOMEIMPROVEMENT" : 6, "DEBTCONSOLIDATION" : 7},
    'loan_grade': {"A" : 1 , "B" : 2 , "C" : 3, "D" : 4, "E" : 5, "F" : 6, "G" : 7},
    'cb_person_default_on_file': {"Y" : 1 , "N" : 0},
}

for column, codes in category_codes.items():
    # Encode only columns that still hold raw labels. Re-running this cell on
    # already-encoded numbers would otherwise map every value to NaN.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

In [None]:
# Missing values per column after encoding; labels absent from a mapping dict appear here as NaN.
df.isnull().sum()

In [None]:
# Replace missing category codes with the most frequent code of that column.
# The loop variable now selects the column, so each column is filled exactly
# once (the previous body hard-coded all three columns on every iteration).
columns_with_null = ['person_home_ownership', 'loan_intent', 'loan_grade']
for column in columns_with_null:
    df[column] = df[column].fillna(df[column].mode()[0])

In [None]:
# Re-check missing-value counts after the mode imputation.
df.isnull().sum()

In [None]:
# Feature matrix = every column except the row identifier and the label;
# target vector = the label column. Both are plain NumPy arrays.
target_column = 'loan_status'
x = df.drop(columns=['id', target_column]).values
y = df[target_column].values

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

## 1. Train a KNN Classifier

In [None]:
# Maps model name -> accuracy figure; filled in by each model section and printed in section 7.
score_list = {}

In [None]:
knn = KNeighborsClassifier(n_neighbors = 13)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)

- Perform cross validation

In [None]:
knn_scores = cross_val_score(knn, x, y, cv = 5)
knn_scores

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (knn_scores.mean(), knn_scores.std()))
score_list["KNN Classifier"] = knn_scores.mean()

## 2. Train a Logistic Regression Classifier

In [None]:
LR = LogisticRegression()
LR.fit(x_train, y_train)
LR_score = LR.score(x_test, y_test)

- Perform cross validation

In [None]:
LR_scores = cross_val_score(LR, x, y, cv = 5)
LR_scores

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (LR_scores.mean(), LR_scores.std()))
score_list["Logistic Regression Classifier"] = LR_scores.mean()

## 3. Train a Naive Bayes Classifier

In [None]:
NBC = GaussianNB()
NBC.fit(x_train, y_train)
NBC.score(x_test, y_test)

- Perform cross validation

In [None]:
NBC_scores = cross_val_score(NBC, x, y, cv = 5)
NBC_scores

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (NBC_scores.mean(), NBC_scores.std()))
score_list["Naive Bayes Classifier"] = NBC_scores.mean()

## 4. Train an SVM Classifier

In [None]:
SVC = SVC()
SVC.fit(x_train, y_train)
SVC.score(x_test, y_test)

- Perform cross validation

In [None]:
SVC_scores = cross_val_score(SVC, x, y, cv = 5)
SVC_scores

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (SVC_scores.mean(), SVC_scores.std()))
score_list["SVM Classifier"] = SVC_scores.mean()

## 5. Train a Decision Tree Classifier

In [None]:
DTC = DecisionTreeClassifier(random_state = 1)
DTC.fit(x_train, y_train)
DTC_score = DTC.score(x_test, y_test)

- Perform cross validation

In [None]:
DTC_scores = cross_val_score(DTC, x, y, cv = 5)
DTC_scores

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (DTC_scores.mean(), DTC_scores.std()))
score_list["Decision Tree Classifier"] = DTC_scores.mean()

## 6. Train a Random Forest Classifier

In [None]:
# Fit the scaler on the training split only, then apply the same transform to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Random forest trained on the standardised training features.
# random_state pins the bootstrap/feature sampling so the reported score is
# reproducible, matching the seeded split and decision tree.
RFC = RandomForestClassifier(criterion = 'gini', random_state = 0)
RFC.fit(x_train_scaled, y_train)
# Score on the *scaled* hold-out features: the trees' split thresholds were
# learned in scaled space, so scoring on raw x_test compares against the
# wrong value ranges.
RFC_score = RFC.score(x_test_scaled, y_test)
# NOTE: this is a single hold-out accuracy, whereas the other entries in
# score_list are 5-fold cross-validation means.
score_list["Random Forest Classifier"] = RFC_score
print(f"Score is {RFC_score}")

## 7. Compare the performance of all classification models

In [None]:
# Print one line per model. Iterate over the dict directly instead of
# rebinding `score_list` to a list: after a rebind, re-running this cell
# (list has no .items()) or any `score_list[name] = ...` cell would fail.
for alg, score in score_list.items():
    # str(score)[:4] truncates (does not round) the figure to four characters.
    print(f"{alg} Score is {str(score)[:4]} ")

The best algorithm is **Random Forest Classifier**.

In [None]:
# Predict on the scaled hold-out features; RFC was fitted on x_train_scaled,
# so it must be given inputs in the same scaled space.
y_pred = RFC.predict(x_test_scaled)

In [None]:
# Column names, dtypes and non-null counts of the test frame, before encoding it.
dt.info()

In [None]:
# Preview the first rows of the test frame.
dt.head()

In [None]:
# Missing values per column of the test frame.
dt.isnull().sum()

In [None]:
# Integer codes for each categorical test column.
# Existing codes are unchanged; MORTGAGE/OTHER ownership and grade G are added
# so those rows get a code instead of NaN, which would otherwise reach the
# model at prediction time.
# NOTE(review): the added categories are taken from the competition's data
# description — confirm with dt[col].unique().
test_category_codes = {
    'person_home_ownership': {"RENT" : 1 , "OWN" : 0, "MORTGAGE" : 2, "OTHER" : 3},
    'loan_intent': {"PERSONAL" : 1 , "MORTGAGE" : 2 , "MEDICAL" : 3, "VENTURE" : 4, "EDUCATION" : 5,
                    "HOMEIMPROVEMENT" : 6, "DEBTCONSOLIDATION" : 7},
    'loan_grade': {"A" : 1 , "B" : 2 , "C" : 3, "D" : 4, "E" : 5, "F" : 6, "G" : 7},
    'cb_person_default_on_file': {"Y" : 1 , "N" : 0},
}

for column, codes in test_category_codes.items():
    # Encode only columns that still hold raw labels. Re-running this cell on
    # already-encoded numbers would otherwise map every value to NaN.
    if not pd.api.types.is_numeric_dtype(dt[column]):
        dt[column] = dt[column].map(codes)

## 8. Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [None]:
# Build the test feature matrix the same way the training matrix was built:
# the same columns in the same order (which also leaves out `id`, a column the
# model was never trained on).
feature_columns = df.drop(columns=['id', 'loan_status']).columns
test_features = dt[feature_columns]
# Fill any value that is still missing with the training-set mode of its
# column, the same strategy used to impute `df`.
test_features = test_features.fillna(df[feature_columns].mode().iloc[0])
# RFC was fitted on scaler-transformed data, so apply the fitted scaler first.
y_pred = RFC.predict(scaler.transform(test_features.values))

# Read the ids without pop(): pop() removes the column from `sf`, so running
# this cell a second time would raise KeyError. The new name also avoids
# shadowing the builtin `id`.
submission_ids = sf['id']

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'id': submission_ids,
    'loan_status': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")