In [1]:
# Paths of the raw input CSV files
DATAPATHTRAIN = "train_data.csv"
DATAPATHTEST = "test_data.csv"

# Paths the cleaned CSV files are written to
CLEANEDTRAINPATH = "cleaned_train.csv"
CLEANEDTESTPATH = "cleaned_test.csv"

In [2]:
#load packages 
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OrdinalEncoder
import pickle

In [3]:
# Load both raw CSV files (train first, then test) and keep them untouched;
# all cleaning below is done on independent copies.
row_data_train, row_data_test = (
    pd.read_csv(csv_path) for csv_path in (DATAPATHTRAIN, DATAPATHTEST)
)
df_train, df_test = row_data_train.copy(), row_data_test.copy()

## Data cleaning

### Replace Values and parse


In [4]:
# Map the "3+" label of Dependents to the number 3 so the column can later be
# cast to a numeric dtype.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df[col]: that chained in-place form works on an
# intermediate Series, emits a FutureWarning in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
df_train['Dependents'] = df_train['Dependents'].replace({"3+": 3})
df_test['Dependents'] = df_test['Dependents'].replace({"3+": 3})
df_train['Dependents'].unique()

array(['0', '1', '2', 3, nan], dtype=object)

### Correct Data Types

In [5]:
# Cast Dependents to float in both frames (float, not int, because the column
# still contains NaN at this point).
for frame in (df_train, df_test):
    frame['Dependents'] = frame['Dependents'].astype(float)

### Dealing With Missing Values

In [6]:
# Find the training columns that contain at least one missing value, then split
# them by dtype: text (object) columns versus float columns.
colmns_names = df_train.columns
cols_with_nan = [col for col in colmns_names if df_train[col].isna().any()]
cat_col = [col for col in cols_with_nan if df_train[col].dtype == "object"]
num_col = [col for col in cols_with_nan if df_train[col].dtype == "float"]

In [7]:
# Replace the missing values of categorical columns with "u" ("unknown").
# Results are assigned back to the column rather than using
# df[col].fillna(..., inplace=True): the chained in-place form emits a
# FutureWarning in pandas 2.x and does not modify the frame under Copy-on-Write.
for col in cat_col:
    df_train[col] = df_train[col].fillna("u")
    df_test[col] = df_test[col].fillna("u")

# Fill the missing values of numerical columns with the TRAINING mean.
# The mean is computed once, before train is filled, and the same value is
# reused for test so no statistic is derived from the test data.
for col in num_col:
    train_mean = df_train[col].mean()
    df_train[col] = df_train[col].fillna(train_mean)
    df_test[col] = df_test[col].fillna(train_mean)

### Data Transformation Part  

In [8]:
# Every text (object) column of the training frame is converted to ordinal codes.
full_cat_col = [col for col in colmns_names if df_train[col].dtype == "object"]

# Loan_ID is treated as an identifier: its values are presumably different in
# train and test (TODO confirm), so it cannot go through an encoder fitted on
# train. It keeps the previous behaviour of being encoded per frame.
ID_COL = "Loan_ID"

# Split the remaining columns by whether the test frame has them, instead of
# relying on the target being the last entry of full_cat_col.
shared_cat_col = [col for col in full_cat_col if col != ID_COL and col in df_test.columns]
train_only_cat_col = [col for col in full_cat_col if col != ID_COL and col not in df_test.columns]

# Fit the encoder on train ONLY and reuse the fitted mapping for test, so a
# given category always gets the same code in both frames. Re-fitting on test
# would derive the codes from whatever categories happen to occur there.
# With the default settings, a test category never seen in train raises an
# error instead of being silently mis-coded.
od_model = OrdinalEncoder()
df_train[shared_cat_col] = od_model.fit_transform(df_train[shared_cat_col])
df_test[shared_cat_col] = od_model.transform(df_test[shared_cat_col])

# Columns that exist only in train (e.g. the Loan_Status target) get their own
# encoder; OrdinalEncoder works column by column, so the codes are unchanged.
if train_only_cat_col:
    df_train[train_only_cat_col] = OrdinalEncoder().fit_transform(df_train[train_only_cat_col])

# Identifier column: encoded independently in each frame.
if ID_COL in full_cat_col:
    df_train[[ID_COL]] = OrdinalEncoder().fit_transform(df_train[[ID_COL]])
    if ID_COL in df_test.columns:
        df_test[[ID_COL]] = OrdinalEncoder().fit_transform(df_test[[ID_COL]])

## Sanity checks

In [9]:
# Preview the first five rows of the cleaned, encoded training frame.
df_train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,1.0,0.0,0.0,0.0,0.0,5849,0.0,146.412162,360.0,1.0,2.0,1.0
1,1.0,1.0,1.0,1.0,0.0,0.0,4583,1508.0,128.0,360.0,1.0,0.0,0.0
2,2.0,1.0,1.0,0.0,0.0,1.0,3000,0.0,66.0,360.0,1.0,2.0,1.0
3,3.0,1.0,1.0,0.0,1.0,0.0,2583,2358.0,120.0,360.0,1.0,2.0,1.0
4,4.0,1.0,0.0,0.0,0.0,0.0,6000,0.0,141.0,360.0,1.0,2.0,1.0


In [10]:
# Preview the first five rows of the cleaned, encoded test frame.
df_test.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.0,1.0,1.0,0.0,0.0,0.0,5720,0,110.0,360.0,1.0,2.0
1,1.0,1.0,1.0,1.0,0.0,0.0,3076,1500,126.0,360.0,1.0,2.0
2,2.0,1.0,1.0,2.0,0.0,0.0,5000,1800,208.0,360.0,1.0,2.0
3,3.0,1.0,1.0,2.0,0.0,0.0,2340,2546,100.0,360.0,0.842199,2.0
4,4.0,1.0,0.0,0.0,1.0,0.0,3276,0,78.0,360.0,1.0,2.0


In [11]:
# Print the per-column non-null counts and dtypes of the training frame
# to check the result of the cleaning steps.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    float64
 1   Gender             614 non-null    float64
 2   Married            614 non-null    float64
 3   Dependents         614 non-null    float64
 4   Education          614 non-null    float64
 5   Self_Employed      614 non-null    float64
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    float64
 12  Loan_Status        614 non-null    float64
dtypes: float64(12), int64(1)
memory usage: 62.5 KB


In [12]:
# Print the per-column non-null counts and dtypes of the test frame
# to check the result of the cleaning steps.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    float64
 1   Gender             367 non-null    float64
 2   Married            367 non-null    float64
 3   Dependents         367 non-null    float64
 4   Education          367 non-null    float64
 5   Self_Employed      367 non-null    float64
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         367 non-null    float64
 9   Loan_Amount_Term   367 non-null    float64
 10  Credit_History     367 non-null    float64
 11  Property_Area      367 non-null    float64
dtypes: float64(10), int64(2)
memory usage: 34.5 KB


## Saving the cleaned data

In [15]:
# Write each cleaned frame to its CSV path (train first, then test);
# the row index is not written.
for frame, out_path in ((df_train, CLEANEDTRAINPATH), (df_test, CLEANEDTESTPATH)):
    frame.to_csv(out_path, index=False)