In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 9 16:48:53 2024

@author: andreapaloschavez
"""
import pandas as pd
from src.static import DATA_DIR

In [2]:
df = pd.read_csv(f'{DATA_DIR}/raw_data.csv', na_values=['.', 'NaN'])

# Check the first few rows of the CSV data
df.head()

Unnamed: 0,enterprise_flag,record_number,census_tract_2020,tract_income_ratio,affordability_cat,date_of_mortgage_note,purpose_of_loan,type_of_seller,federal_guarantee,tot_num_units,year,underserved_areas_ind,num_bedrooms,num_units,affordability_level,tenant_income_ind
0,2,1,3,1,8,2,2,1,2,5.0,2021,,1,27.0,2,0
1,2,1,3,1,8,2,2,1,2,5.0,2021,,1,40.0,4,0
2,2,1,3,1,8,2,2,1,2,5.0,2021,,1,83.0,1,0
3,2,1,3,1,8,2,2,1,2,5.0,2021,,1,6.0,1,0
4,2,1,3,1,8,2,2,1,2,5.0,2021,,1,48.0,3,0


In [3]:
# Check the data types and missing values in the data
df_csv_info = df.info()

# Check the percentage of missing values per column
missing_values = df.isnull().mean() * 100
df_csv_info, missing_values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895729 entries, 0 to 895728
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   enterprise_flag        895729 non-null  int64  
 1   record_number          895729 non-null  int64  
 2   census_tract_2020      895729 non-null  int64  
 3   tract_income_ratio     895729 non-null  int64  
 4   affordability_cat      895729 non-null  int64  
 5   date_of_mortgage_note  895729 non-null  int64  
 6   purpose_of_loan        895729 non-null  int64  
 7   type_of_seller         895729 non-null  int64  
 8   federal_guarantee      895729 non-null  int64  
 9   tot_num_units          895728 non-null  float64
 10  year                   895729 non-null  int64  
 11  underserved_areas_ind  61291 non-null   float64
 12  num_bedrooms           895729 non-null  int64  
 13  num_units              895728 non-null  float64
 14  affordability_level    895729 non-nu

(None,
 enterprise_flag           0.000000
 record_number             0.000000
 census_tract_2020         0.000000
 tract_income_ratio        0.000000
 affordability_cat         0.000000
 date_of_mortgage_note     0.000000
 purpose_of_loan           0.000000
 type_of_seller            0.000000
 federal_guarantee         0.000000
 tot_num_units             0.000112
 year                      0.000000
 underserved_areas_ind    93.157417
 num_bedrooms              0.000000
 num_units                 0.000112
 affordability_level       0.000000
 tenant_income_ind         0.000000
 dtype: float64)

In [None]:
# We find that the 'underserved_areas_ind' column is entirely null
# so we drop it from the dataset
df.drop('underserved_areas_ind', axis=1, inplace=True)
df.dropna()

# Check the last few rows of the data
df.tail()

Unnamed: 0,enterprise_flag,record_number,census_tract_2020,tract_income_ratio,affordability_cat,date_of_mortgage_note,purpose_of_loan,type_of_seller,federal_guarantee,tot_num_units,year,num_bedrooms,num_units,affordability_level,tenant_income_ind
895724,2,5052,3,1,1,1,2,1,2,4.0,2020,2,1.0,2,0
895725,2,5052,3,1,1,1,2,1,2,4.0,2020,2,1.0,1,0
895726,2,5052,3,1,1,1,2,1,2,4.0,2020,1,1.0,4,0
895727,2,5052,3,1,1,1,2,1,2,4.0,2020,1,1.0,3,0
895728,2,5052,3,1,1,1,2,1,2,4.0,2020,1,1.0,1,0


In [5]:
# The columns with numerical data (like num_units, num_bedrooms, 
# and tract_income_ratio) seem like potential candidates for 
# independent variables in a linear regression model. 
# However, we need to ensure that these variables are 
# continuous and not categorical.

# Update target variable to 'tract_income_ratio'
target_variable = 'tract_income_ratio'
independent_variables = ['num_bedrooms', 'num_units', 'tot_num_units']

In [6]:
# Check the correlation between the target variable and independent variables
correlation_matrix = df[independent_variables + [target_variable]].corr()

correlation_matrix

Unnamed: 0,num_bedrooms,num_units,tot_num_units,tract_income_ratio
num_bedrooms,1.0,-0.021897,-0.010175,-0.008437
num_units,-0.021897,1.0,0.16347,0.009433
tot_num_units,-0.010175,0.16347,1.0,0.013652
tract_income_ratio,-0.008437,0.009433,0.013652,1.0


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Prepare the data for regression
X = df[independent_variables]  # Independent variables
y = df[target_variable]  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2
# MSE is an extremely small value, essentially close to zero. 
# It indicates that the model's predictions are almost identical to the 
# actual values in the test set. 
# overfitting or perfect collinearity between features and the target?

# R2 -- model explains 100% of the variance in the target variab,
# a high correlation between the independent variables and the target


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values