# Modeling Notebook

# Table of Contents:
- [Notebook Imports](#notebook-imports)


# Notebook Imports:

In [21]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


# Loading Data

In [2]:
# Specify your folder path:
folder_path = r"C:/Users/johne/Downloads/home-credit-default-risk"

# Loading application_test.csv
application_test_path = os.path.join(folder_path, "application_test.csv")
try:
    application_test_df = pd.read_csv(application_test_path, encoding='utf-8') # Added throughout import code to account for CSV file loading errors
except UnicodeDecodeError:
    application_test_df = pd.read_csv(application_test_path, encoding='latin1')
print("Displaying the first 5 rows of application_test.csv:")
display(application_test_df.head())  # Neat display

# Loading application_train.csv
application_train_path = os.path.join(folder_path, "application_train.csv")
try:
    application_train_df = pd.read_csv(application_train_path, encoding='utf-8')
except UnicodeDecodeError:
    application_train_df = pd.read_csv(application_train_path, encoding='latin1')
print("\nDisplaying the first 5 rows of application_train.csv:")
display(application_train_df.head())

Displaying the first 5 rows of application_test.csv:


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,



Displaying the first 5 rows of application_train.csv:


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Initial Splitting and Setup

In [3]:
# Identify target and feature columns
target = 'TARGET'
features = application_train_df.columns.drop(target)

# Split the dataset into features and target
X_train = application_train_df[features]
y_train = application_train_df[target]

# For testing data
X_test = application_test_df

# Preprocessing


In [7]:
# Identify categorical and numerical columns
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]

# Identify columns with missing values
missing_numerical_cols = [col for col in numerical_cols if X_train[col].isnull().any()]
missing_categorical_cols = [col for col in categorical_cols if X_train[col].isnull().any()]

# Print columns with missing values
print("Numerical columns with missing values:", missing_numerical_cols)
print("Categorical columns with missing values:", missing_categorical_cols)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

Numerical columns with missing values: ['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI'

In [5]:
# Preprocessing the training data
print("Head of original training data:")
print(X_train.head())
X_train_preprocessed = preprocessor.fit_transform(X_train)
print("Head of training data after preprocessing:")
print(pd.DataFrame(X_train_preprocessed).head())

# Preprocessing the test data
print("Head of original test data:")
print(X_test.head())
X_test_preprocessed = preprocessor.transform(X_test)
print("Head of test data after preprocessing:")
print(pd.DataFrame(X_test_preprocessed).head())

# Convert preprocessed data back to DataFrame for better readability
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed)

# Display the first few rows of the preprocessed training data
print(X_train_preprocessed_df.head())

# Save preprocessed data for modeling
X_train_preprocessed_df.to_csv('X_train_preprocessed.csv', index=False)
X_test_preprocessed_df.to_csv('X_test_preprocessed.csv', index=False)
y_train.to_csv('y_train.csv', index=False)

Head of original training data:
   SK_ID_CURR NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
0      100002         Cash loans           M            N               Y   
1      100003         Cash loans           F            N               N   
2      100004    Revolving loans           M            Y               Y   
3      100006         Cash loans           F            N               Y   
4      100007         Cash loans           M            N               Y   

   CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  AMT_GOODS_PRICE  \
0             0          202500.0    406597.5      24700.5         351000.0   
1             0          270000.0   1293502.5      35698.5        1129500.0   
2             0           67500.0    135000.0       6750.0         135000.0   
3             0          135000.0    312682.5      29686.5         297000.0   
4             0          121500.0    513000.0      21865.5         513000.0   

   ... FLAG_DOCUMENT_18 FLAG_D

# Simple Linear Model on Non-missing data:

In [10]:
# Select features with missing values <1% for linear regression model
missing_threshold = 0.01
low_missing_numerical_cols = [col for col in numerical_cols if X_train[col].isnull().mean() < missing_threshold]
low_missing_categorical_cols = [col for col in categorical_cols if X_train[col].isnull().mean() < missing_threshold]

# Preprocessing only for selected features
selected_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, low_missing_numerical_cols),
        ('cat', categorical_transformer, low_missing_categorical_cols)
    ]
)

# Preprocessing the training data with selected features
X_train_selected_preprocessed = selected_preprocessor.fit_transform(X_train)

# Convert preprocessed data back to DataFrame for better readability
X_train_selected_preprocessed_df = pd.DataFrame(X_train_selected_preprocessed)

# Simple Linear Regression Model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train_selected_preprocessed_df, y_train)

# Predict on the training data
y_train_pred = linear_regressor.predict(X_train_selected_preprocessed_df)

# Calculate and print Mean Squared Error
mse = mean_squared_error(y_train, y_train_pred)
print(f"Mean Squared Error of Linear Regression Model: {mse}")


# FINAL SCORE: Score: 0.50000 Private score: 0.49995

Mean Squared Error of Linear Regression Model: 0.07080357961985398


In [18]:
# Examine model weights
coefficients = linear_regressor.coef_
intercept = linear_regressor.intercept_

# Get the names of all selected features after one-hot encoding
feature_names = selected_preprocessor.named_transformers_['num'].feature_names_in_.tolist() + \
                selected_preprocessor.named_transformers_['cat'].get_feature_names_out(low_missing_categorical_cols).tolist()

# Create a DataFrame for coefficients and feature names
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sort features by the absolute value of coefficients to see which are most impactful
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df_sorted = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

# Expandable display for better examination
pd.options.display.max_rows = None
print("\nLinear Regression Model Coefficients and their Importance:")
print(coef_df_sorted)


Linear Regression Model Coefficients and their Importance:
                                               Feature   Coefficient  \
99                  WEEKDAY_APPR_PROCESS_START_TUESDAY  4.266204e+11   
94                   WEEKDAY_APPR_PROCESS_START_FRIDAY  4.266204e+11   
100               WEEKDAY_APPR_PROCESS_START_WEDNESDAY  4.266204e+11   
98                 WEEKDAY_APPR_PROCESS_START_THURSDAY  4.266204e+11   
96                 WEEKDAY_APPR_PROCESS_START_SATURDAY  4.266204e+11   
95                   WEEKDAY_APPR_PROCESS_START_MONDAY  4.266204e+11   
97                   WEEKDAY_APPR_PROCESS_START_SUNDAY  4.266204e+11   
56                                       CODE_GENDER_M  9.323239e+10   
55                                       CODE_GENDER_F  9.323239e+10   
57                                     CODE_GENDER_XNA  9.323239e+10   
82                   NAME_FAMILY_STATUS_Civil marriage  9.318658e+10   
83                          NAME_FAMILY_STATUS_Married  9.318658e+10   
86  

Comment: 
- Lots of highly predicitve features
- What features are related? 
- Is ovefitting an issue? Too many variables issue.

In [22]:
# Looking at VIF values to see which features are highly correlated: 
# Calculate Variance Inflation Factor (VIF) to check for multicollinearity
print("\nCalculating Variance Inflation Factor (VIF) for each feature:")
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_selected_preprocessed_df.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_selected_preprocessed_df.values, i) for i in range(X_train_selected_preprocessed_df.shape[1])]

# Display VIF values
print(vif_data.sort_values(by='VIF', ascending=False))


Calculating Variance Inflation Factor (VIF) for each feature:
     Feature           VIF
54        54           inf
17        17           inf
53        53           inf
69        69  2.328886e+11
86        86  2.245456e+11
126      126  1.130351e+11
133      133  1.004550e+11
71        71  8.575835e+10
149      149  8.434181e+10
111      111  7.815018e+10
116      116  6.338545e+10
107      107  6.197756e+10
75        75  5.993970e+10
146      146  5.595090e+10
153      153  4.970559e+10
122      122  4.857204e+10
150      150  4.437022e+10
103      103  4.243615e+10
113      113  3.146400e+10
148      148  3.021790e+10
57        57  2.978886e+10
74        74  2.869924e+10
127      127  2.454077e+10
119      119  2.040085e+10
144      144  1.884324e+10
88        88  1.719279e+10
130      130  1.638146e+10
135      135  1.555505e+10
123      123  1.110307e+10
128      128  1.064043e+10
87        87  1.051743e+10
97        97  1.050727e+10
118      118  1.006456e+10
139      139  9.294

# Kaggle Submission Code:

In [15]:
# Generate predictions for the test set using the trained linear regression model
X_test_selected_preprocessed = selected_preprocessor.transform(application_test_df)
y_test_pred = linear_regressor.predict(X_test_selected_preprocessed)

# Convert predictions to binary values (0 or 1) using a threshold of 0.5
y_test_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_test_pred]

# Load the sample submission format
submission_path = "C:/Users/johne/Downloads/sample_submission.csv"
try:
    sample_submission = pd.read_csv(submission_path, encoding='utf-8')
except UnicodeDecodeError:
    sample_submission = pd.read_csv(submission_path, encoding='latin1')

# Prepare the submission DataFrame
submission = sample_submission.copy()
submission['TARGET'] = y_test_pred_binary

# Save the submission file
submission_path_output = "C:/Users/johne/Downloads/sample_submission.csv"
submission.to_csv(submission_path_output, index=False)

# Display the first few rows of the submission file for verification
print("\nDisplaying the first few rows of the submission file:")
display(submission.head())

# Note: The preprocessed data can now be used for fitting various models, which will be explored next.


Displaying the first few rows of the submission file:


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0
1,100005,0
2,100013,0
3,100028,0
4,100038,0
