# Modeling Notebook

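# Table of Contents:
- [Notebook Imports](#notebook-imports)
- [Loading Data](#loading-data)
- [Initial Splitting and Setup](#initial-splitting-and-setup)
- [Preprocessing](#preprocessing)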


# Notebook Imports:

In [1]:
import os
import pandas as pd
import numpy as np
from skimpy import skim
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


# Loading Data

In [2]:
# Folder that holds the Home Credit CSV files.
# Set the HOME_CREDIT_DATA_DIR environment variable to point at another location;
# when it is unset, the original local download folder is used.
folder_path = os.environ.get(
    "HOME_CREDIT_DATA_DIR",
    r"C:/Users/johne/Downloads/home-credit-default-risk",
)


def read_csv_with_fallback(csv_path):
    """Read a CSV file as UTF-8, retrying with latin1 if decoding fails.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    try:
        return pd.read_csv(csv_path, encoding='utf-8')
    except UnicodeDecodeError:
        # latin1 assigns a character to every byte value, so this retry
        # cannot itself fail with a decode error.
        return pd.read_csv(csv_path, encoding='latin1')


# Loading application_test.csv
application_test_path = os.path.join(folder_path, "application_test.csv")
application_test_df = read_csv_with_fallback(application_test_path)
print("Displaying the first 5 rows of application_test.csv:")
display(application_test_df.head())  # Rich table rendering instead of plain text

# Loading application_train.csv
application_train_path = os.path.join(folder_path, "application_train.csv")
application_train_df = read_csv_with_fallback(application_train_path)
print("\nDisplaying the first 5 rows of application_train.csv:")
display(application_train_df.head())

Displaying the first 5 rows of application_test.csv:


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,



Displaying the first 5 rows of application_train.csv:


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Initial Splitting and Setup

In [3]:
# Label column name; every remaining column of the training frame is a feature
target = 'TARGET'
features = application_train_df.columns.drop(target)

# Training labels first, then the training feature matrix
y_train = application_train_df[target]
X_train = application_train_df.loc[:, features]

# The test frame is used as-is (same object, no copy is made)
X_test = application_test_df

# Preprocessing


In [4]:
# Identify categorical and numerical columns.
# select_dtypes matches every numeric width (int32, float32, ...). Comparing against
# the literal names 'int64' / 'float64' would leave any other numeric column out of
# both lists, and ColumnTransformer drops columns that no transformer lists by default.
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['number']).columns.tolist()

# Preprocessing for numerical data: fill gaps with the column median, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the most frequent value, then
# one-hot encode; categories unseen during fit are encoded as all zeros, not an error
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical data.
# Output columns follow the transformer order: numerical block first, then one-hot block.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [5]:
# --- Training data: fit the preprocessor and transform in a single pass ---
print("Head of original training data:")
print(X_train.head())
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Wrap once in a DataFrame (default integer column labels) and reuse it below
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed)
print("Head of training data after preprocessing:")
print(X_train_preprocessed_df.head())

# --- Test data: transform only, reusing what was fitted on the training data ---
print("Head of original test data:")
print(X_test.head())
X_test_preprocessed = preprocessor.transform(X_test)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed)
print("Head of test data after preprocessing:")
print(X_test_preprocessed_df.head())

# Show the first preprocessed training rows once more
print(X_train_preprocessed_df.head())

# Write features and labels to the working directory for the modeling step
X_train_preprocessed_df.to_csv('X_train_preprocessed.csv', index=False)
X_test_preprocessed_df.to_csv('X_test_preprocessed.csv', index=False)
y_train.to_csv('y_train.csv', index=False)

Head of original training data:
   SK_ID_CURR NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
0      100002         Cash loans           M            N               Y   
1      100003         Cash loans           F            N               N   
2      100004    Revolving loans           M            Y               Y   
3      100006         Cash loans           F            N               Y   
4      100007         Cash loans           M            N               Y   

   CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  AMT_GOODS_PRICE  \
0             0          202500.0    406597.5      24700.5         351000.0   
1             0          270000.0   1293502.5      35698.5        1129500.0   
2             0           67500.0    135000.0       6750.0         135000.0   
3             0          135000.0    312682.5      29686.5         297000.0   
4             0          121500.0    513000.0      21865.5         513000.0   

   ... FLAG_DOCUMENT_18 FLAG_D