In [73]:
import os #paths to file
import numpy as np # linear algebra
import pandas as pd # data processing
import warnings# warning filter


#ploting libraries
import matplotlib.pyplot as plt 
import seaborn as sns

#relevant ML libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#ML models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#warning hadle
warnings.filterwarnings("ignore")

In [74]:
#path for the training set
tr_path = "train.csv"
#path for the testing set
te_path = "test.csv"

In [75]:
# read in csv file as a DataFrame
tr_data = pd.read_csv(tr_path)
# explore the first 5 rows
#tr_data.head()

# read in csv file as a DataFrame
te_data = pd.read_csv(te_path)
# explore the first 5 rows
te_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [76]:
#column information
#tr_data.info(verbose=True, null_counts=True)

In [77]:
#the Id column is not needed, let's drop it for both test and train datasets
tr_data.drop('Loan_ID',axis=1,inplace=True)
te_data.drop('Loan_ID',axis=1,inplace=True)

In [78]:
#missing values in decsending order
#tr_data.isnull().sum().sort_values(ascending=False)

In [79]:
#filling the missing data
null_cols = ['Credit_History', 'Self_Employed', 'LoanAmount','Dependents', 'Loan_Amount_Term', 'Gender', 'Married']


for col in null_cols:
    tr_data[col] = tr_df[col].fillna(
    tr_data[col].dropna().mode().values[0] )   

    
tr_data.isnull().sum().sort_values(ascending=False)
print("After filling missing values\n\n","#"*50,"\n")
#for col in null_cols:
    #print(f"\n{col}:\n{tr_data[col].value_counts()}\n","-"*50)

After filling missing values

 ################################################## 



In [80]:
#converting categorical values to numbers

to_numeric = {'Male': 1, 'Female': 2,
'Yes': 1, 'No': 2,
'Graduate': 1, 'Not Graduate': 2,
'Urban': 3, 'Semiurban': 2,'Rural': 1,
'Y': 1, 'N': 0,
'3+': 3}

# adding the new numeric values from the to_numeric variable to both datasets
tr_data = tr_data.applymap(lambda lable: to_numeric.get(lable) if lable in to_numeric else lable)
te_data = te_data.applymap(lambda lable: to_numeric.get(lable) if lable in to_numeric else lable)

# convertind the Dependents column
Dependents_ = pd.to_numeric(tr_data.Dependents)
Dependents__ = pd.to_numeric(te_data.Dependents)

# dropping the previous Dependents column
tr_data.drop(['Dependents'], axis = 1, inplace = True)
te_data.drop(['Dependents'], axis = 1, inplace = True)

# concatination of the new Dependents column with both datasets
tr_data = pd.concat([tr_data, Dependents_], axis = 1)
te_data = pd.concat([te_data, Dependents__], axis = 1)

# checking the our manipulated dataset for validation
print(f"training set (row, col): {tr_data.shape}\n\ntesting set (row, col): {te_data.shape}\n")
print(tr_data.info(), "\n\n", te_data.info())

training set (row, col): (614, 12)

testing set (row, col): (367, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             367 non-null    float64
 1   Married            367 non-null    float64
 2   Education          614 non-null    int64  
 3   Self_Employed      367 non-null    float64
 4   ApplicantIncome    614 non-null    int64  
 5   CoapplicantIncome  614 non-null    float64
 6   LoanAmount         367 non-null    float64
 7   Loan_Amount_Term   367 non-null    float64
 8   Credit_History     367 non-null    float64
 9   Property_Area      614 non-null    int64  
 10  Loan_Status        614 non-null    int64  
 11  Dependents         367 non-null    float64
dtypes: float64(8), int64(4)
memory usage: 57.7 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 column

In [103]:
features = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

X_train = tr_data[features]
y_train = tr_data['LoanAmount']

X_test = te_data[features]
X_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area
0,1.0,1.0,0.0,1,2.0,5849,0.0,360.0,1.0,3
1,1.0,1.0,1.0,1,2.0,4583,1508.0,360.0,1.0,1
2,1.0,1.0,2.0,1,2.0,3000,0.0,360.0,1.0,3
3,1.0,1.0,2.0,2,2.0,2583,2358.0,360.0,1.0,3
4,1.0,2.0,0.0,1,2.0,6000,0.0,360.0,1.0,3


In [92]:
#Decison tree
Model = DecisionTreeClassifier()
#Model.fit(X_train, y_train)

#y_predict = Model.predict(X_test)

#  prediction Summary by species
#print(classification_report(y_test, y_predict))

# Accuracy score
#DT_SC = accuracy_score(y_predict,y_test)
#print(f"{round(DT_SC*100,2)}% Accurate")


#save predictions
#Decision_Tree=pd.DataFrame({'y_test':y_test,'prediction':y_predict})
#Decision_Tree.to_csv("Decision Tree.csv") 

In [93]:
# Random Forest
RF = RandomForestClassifier()
#RF.fit(X_train, y_train)

#y_predict = RF.predict(X_test)

#  prediction Summary by species
#print(classification_report(y_test, y_predict))

# Accuracy score
#RF_SC = accuracy_score(y_predict,y_test)
#print(f"{round(RF_SC*100,2)}% Accurate")

In [98]:
#Random_Forest=pd.DataFrame({'y_test':y_test,'prediction':y_predict})
#Random_Forest.to_csv("Random Forest.csv")     

In [99]:
#XGBoost

XGB = XGBClassifier()
#XGB.fit(X_train, y_train)

#y_predict = XGB.predict(X_test)

#  prediction Summary by species
#print(classification_report(y_test, y_predict))

# Accuracy score
#XGB_SC = accuracy_score(y_predict,y_test)
#print(f"{round(XGB_SC*100,2)}% Accurate")

In [101]:
#score = [DT_SC,RF_SC,XGB_SC]
#Models = pd.DataFrame({
    'n_neighbors': ["Decision Tree","Random Forest","XGBoost"],
    'Score': score})
#Models.sort_values(by='Score', ascending=False)

IndentationError: unexpected indent (Temp/ipykernel_5832/1658062802.py, line 3)