In [83]:
import pandas as pd 
import numpy as np 
from statsmodels.formula.api import ols 

In [84]:
# Location of the personal-loans CSV that the whole notebook works from.
PERSONAL_LOANS_URL = 'https://raw.githubusercontent.com/Explore-AI/Public-Data/master/bootcamps/Personal_Loans.csv'

# Read the CSV straight from the URL and preview the first rows.
dataset = pd.read_csv(PERSONAL_LOANS_URL)
dataset.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,Gender,Area,Personal Loan,Loan Size
0,26,2,60,2,3.0,Undergrad,132,0,0,0,Male,Alameda,1,15
1,26,0,83,3,3.9,Postgrad,0,0,0,1,Female,Ventura,1,30
2,26,0,129,3,0.7,Postgrad,0,0,0,0,Male,Los Angeles,1,33
3,26,0,132,3,6.5,Professional,0,0,0,0,Female,Orange,1,40
4,26,2,132,2,2.4,Professional,0,0,0,0,Male,Alameda,1,35


In [85]:
dataset.shape 

(4846, 14)

In [86]:
# Replace every space in the column labels with an underscore
# (e.g. "Personal Loan" -> "Personal_Loan") so the labels can be used
# directly inside a regression formula later on.
dataset.columns = dataset.columns.str.replace(" ", "_", regex=False)

dataset.columns

Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Securities_Account', 'CD_Account', 'Online', 'Gender',
       'Area', 'Personal_Loan', 'Loan_Size'],
      dtype='object')

In [87]:
dataset.shape 

(4846, 14)

In [88]:
# One-hot encode the categorical columns. No level is dropped here, so each
# categorical contributes one indicator column per distinct value.
dataset_dummies = pd.get_dummies(data=dataset)
dataset_dummies.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities_Account,CD_Account,Online,Personal_Loan,...,Area_Santa Cruz,Area_Shasta,Area_Siskiyou,Area_Solano,Area_Sonoma,Area_Stanislaus,Area_Trinity,Area_Tuolumne,Area_Ventura,Area_Yolo
0,26,2,60,2,3.0,132,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
1,26,0,83,3,3.9,0,0,0,1,1,...,False,False,False,False,False,False,False,False,True,False
2,26,0,129,3,0.7,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
3,26,0,132,3,6.5,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
4,26,2,132,2,2.4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False


In [89]:
dataset_dummies.shape 

(4846, 54)

In [90]:
# Put the target ("Loan_Size") in the last position, keeping the predictors
# in their existing order.
is_predictor = dataset_dummies.columns != "Loan_Size"
col_titles = list(dataset_dummies.columns[is_predictor]) + ["Loan_Size"]

dataset_dummies = dataset_dummies.reindex(col_titles, axis="columns")

# The indicator columns inherit spaces from the category values
# (e.g. "Area_Santa Cruz"), so underscore those labels as well.
dataset_dummies.columns = dataset_dummies.columns.str.replace(" ", "_", regex=False)

In [91]:
dataset_dummies.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities_Account,CD_Account,Online,Personal_Loan,...,Area_Shasta,Area_Siskiyou,Area_Solano,Area_Sonoma,Area_Stanislaus,Area_Trinity,Area_Tuolumne,Area_Ventura,Area_Yolo,Loan_Size
0,26,2,60,2,3.0,132,0,0,0,1,...,False,False,False,False,False,False,False,False,False,15
1,26,0,83,3,3.9,0,0,0,1,1,...,False,False,False,False,False,False,False,True,False,30
2,26,0,129,3,0.7,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,33
3,26,0,132,3,6.5,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,40
4,26,2,132,2,2.4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,35


In [92]:
# Fit an OLS regression of the target on every other column of the fully
# one-hot-encoded frame (no reference level dropped).
target_variable = "Loan_Size"
predictor_variables = dataset_dummies.drop(columns=["Loan_Size"]).columns.tolist()

# Formula string of the form "Loan_Size ~ x1+x2+...".
formular_str = f"{target_variable} ~ {'+'.join(predictor_variables)}"

model = ols(formula=formular_str, data=dataset_dummies)
fitted_model = model.fit()

print(fitted_model.summary())

                            OLS Regression Results                            
Dep. Variable:              Loan_Size   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.922
Method:                 Least Squares   F-statistic:                     1154.
Date:                Wed, 12 Nov 2025   Prob (F-statistic):               0.00
Time:                        17:12:34   Log-Likelihood:                -11958.
No. Observations:                4846   AIC:                         2.402e+04
Df Residuals:                    4795   BIC:                         2.435e+04
Df Model:                          50                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [93]:
# Re-encode the categorical columns, this time with drop_first=True so the
# first level of each categorical is dropped (avoids the dummy variable trap).
# The labels are underscored in the same step because the category values
# can contain spaces (e.g. "Securities Account" -> "Securities_Account").
new_dataset_dummies = (
    pd.get_dummies(dataset, drop_first=True)
    .rename(columns=lambda label: label.replace(" ", "_"))
)

new_dataset_dummies.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities_Account,CD_Account,Online,Personal_Loan,...,Area_Santa_Cruz,Area_Shasta,Area_Siskiyou,Area_Solano,Area_Sonoma,Area_Stanislaus,Area_Trinity,Area_Tuolumne,Area_Ventura,Area_Yolo
0,26,2,60,2,3.0,132,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
1,26,0,83,3,3.9,0,0,0,1,1,...,False,False,False,False,False,False,False,False,True,False
2,26,0,129,3,0.7,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
3,26,0,132,3,6.5,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
4,26,2,132,2,2.4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False


In [94]:
new_dataset_dummies.shape 

(4846, 51)

In [95]:
# Refit the regression using the drop_first encoding.

# Every column except the target is used as a predictor.
new_predictor_variables = new_dataset_dummies.drop(columns=["Loan_Size"]).columns.tolist()

# Formula string; `target_variable` is the name set when the first model was built.
new_formular_str = f"{target_variable} ~ {'+'.join(new_predictor_variables)}"

new_model = ols(formula=new_formular_str, data=new_dataset_dummies)
fitted_new_model = new_model.fit()

print(fitted_new_model.summary())


                            OLS Regression Results                            
Dep. Variable:              Loan_Size   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.922
Method:                 Least Squares   F-statistic:                     1154.
Date:                Wed, 12 Nov 2025   Prob (F-statistic):               0.00
Time:                        17:12:34   Log-Likelihood:                -11958.
No. Observations:                4846   AIC:                         2.402e+04
Df Residuals:                    4795   BIC:                         2.435e+04
Df Model:                          50                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [96]:
# Variable selection, step one: how strongly does each column correlate with
# the target?
# NOTE: despite its name, `correlation_matrix` holds a single Series (the
# "Loan_Size" column of the full correlation matrix), sorted from the most
# positive to the most negative correlation.
loan_size_correlations = new_dataset_dummies.corr()["Loan_Size"]
correlation_matrix = loan_size_correlations.sort_values(ascending=False)
correlation_matrix

Loan_Size                 1.000000
Personal_Loan             0.955495
Income                    0.515645
CCAvg                     0.398250
CD_Account                0.296944
Education_Professional    0.091847
Family                    0.067339
Mortgage                  0.065089
Area_Santa_Clara          0.038019
Area_Sonoma               0.029814
Securities_Account        0.024090
Area_Contra_Costa         0.021899
Area_Shasta               0.019341
Area_San_Diego            0.017113
Area_San_Luis_Obispo      0.016299
Area_Kern                 0.014590
Area_Monterey             0.009898
Area_Ventura              0.007358
Area_Marin                0.004675
Online                    0.004077
Area_San_Joaquin          0.003724
Area_Los_Angeles          0.002802
Area_Santa_Cruz           0.002504
Area_Stanislaus           0.000004
Area_Mendocino           -0.000414
Area_Solano              -0.002067
Area_Fresno              -0.002736
Area_Riverside           -0.002958
Area_Humboldt       

In [97]:
# For every column (the target included), compute the Pearson correlation
# with the target and the p-value of that correlation.
from scipy.stats import pearsonr

target_values = new_dataset_dummies[target_variable]

my_dict = {}
for name, values in new_dataset_dummies.items():
    r_value, p = pearsonr(values, target_values)
    # p-values are stored rounded to 6 decimals, so very small ones show as 0.0.
    my_dict[name] = {"Coefficient": r_value, "P Value": np.round(p, 6)}

# One row per column, with "Coefficient" and "P Value" as the two columns.
df = pd.DataFrame(my_dict).transpose()
df.head()

Unnamed: 0,Coefficient,P Value
Age,-0.017247,0.229978
Experience,-0.017268,0.229413
Income,0.515645,0.0
Family,0.067339,3e-06
CCAvg,0.39825,0.0


In [98]:
# Show the correlation table ordered by ascending p-value (most significant
# first). The target's own row is still part of the table at this point.
df.sort_values(by="P Value")

Unnamed: 0,Coefficient,P Value
Income,0.515645,0.0
CCAvg,0.39825,0.0
CD_Account,0.296944,0.0
Personal_Loan,0.955495,0.0
Education_Undergrad,-0.154798,0.0
Education_Professional,0.091847,0.0
Loan_Size,1.0,0.0
Family,0.067339,3e-06
Mortgage,0.065089,6e-06
Area_Santa_Clara,0.038019,0.008123


In [99]:
# Keep the columns whose correlation with the target is significant at the
# 5% level (p-value below 0.05).
is_significant = df["P Value"] < 0.05

# The target itself appears in `df`, so it is filtered out here. Filtering
# (rather than pop/index) cannot raise if the target is absent from the list.
significant_predictors = [
    name for name in df.index[is_significant] if name != target_variable
]

# These are the predictors that are statistically significant for the model.
significant_predictors

['Income',
 'Family',
 'CCAvg',
 'Mortgage',
 'CD_Account',
 'Personal_Loan',
 'Education_Professional',
 'Education_Undergrad',
 'Area_San_Bernardino',
 'Area_Santa_Clara',
 'Area_Sonoma']

In [100]:
# Restrict the encoded data to the significant predictors, and keep the
# target values alongside as a separate Series.
significant_df = new_dataset_dummies.loc[:, significant_predictors]
target_variable_values = new_dataset_dummies.loc[:, target_variable]



In [101]:
# Pairwise correlations between the significant predictors, used to check for
# multicollinearity. A correlation matrix is symmetric, so no transpose is needed.
corr = significant_df.corr()
corr

Unnamed: 0,Income,Family,CCAvg,Mortgage,CD_Account,Personal_Loan,Education_Professional,Education_Undergrad,Area_San_Bernardino,Area_Santa_Clara,Area_Sonoma
Income,1.0,-0.156079,0.644302,0.210611,0.169919,0.504979,-0.1027,0.209267,0.021637,0.033587,-0.000775
Family,-0.156079,1.0,-0.10628,-0.020653,0.014386,0.061375,-0.010968,-0.117909,-0.035889,0.006504,0.015329
CCAvg,0.644302,-0.10628,1.0,0.114429,0.135701,0.36813,-0.07711,0.150126,0.000288,0.004966,0.023888
Mortgage,0.210611,-0.020653,0.114429,1.0,0.091747,0.143598,-0.01303,0.043906,0.021045,-0.012633,0.008713
CD_Account,0.169919,0.014386,0.135701,0.091747,1.0,0.312914,0.010973,-0.016942,-0.030362,0.022754,0.039426
Personal_Loan,0.504979,0.061375,0.36813,0.143598,0.312914,1.0,0.095241,-0.153779,-0.03261,0.036866,0.031593
Education_Professional,-0.1027,-0.010968,-0.07711,-0.01303,0.010973,0.095241,1.0,-0.557776,-0.02663,-0.022901,-0.018218
Education_Undergrad,0.209267,-0.117909,0.150126,0.043906,-0.016942,-0.153779,-0.557776,1.0,0.032272,-0.010727,-0.008255
Area_San_Bernardino,0.021637,-0.035889,0.000288,0.021045,-0.030362,-0.03261,-0.02663,0.032272,1.0,-0.052988,-0.01081
Area_Santa_Clara,0.033587,0.006504,0.004966,-0.012633,0.022754,0.036866,-0.022901,-0.010727,-0.052988,1.0,-0.027465


In [102]:
# Keep only the entries with |correlation| above 0.9; everything else shows as NaN.
corr.where(corr.abs() > 0.9)

Unnamed: 0,Income,Family,CCAvg,Mortgage,CD_Account,Personal_Loan,Education_Professional,Education_Undergrad,Area_San_Bernardino,Area_Santa_Clara,Area_Sonoma
Income,1.0,,,,,,,,,,
Family,,1.0,,,,,,,,,
CCAvg,,,1.0,,,,,,,,
Mortgage,,,,1.0,,,,,,,
CD_Account,,,,,1.0,,,,,,
Personal_Loan,,,,,,1.0,,,,,
Education_Professional,,,,,,,1.0,,,,
Education_Undergrad,,,,,,,,1.0,,,
Area_San_Bernardino,,,,,,,,,1.0,,
Area_Santa_Clara,,,,,,,,,,1.0,


In [103]:
# Positions (row index, column index) where the absolute correlation is 1.
# Floats are compared with a tight tolerance rather than `== 1`, so a value
# that is 1 up to rounding error is not silently missed.
row, col = np.where(np.isclose(corr.abs().to_numpy(), 1.0, rtol=0.0, atol=1e-12))

row


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [104]:
corr 

Unnamed: 0,Income,Family,CCAvg,Mortgage,CD_Account,Personal_Loan,Education_Professional,Education_Undergrad,Area_San_Bernardino,Area_Santa_Clara,Area_Sonoma
Income,1.0,-0.156079,0.644302,0.210611,0.169919,0.504979,-0.1027,0.209267,0.021637,0.033587,-0.000775
Family,-0.156079,1.0,-0.10628,-0.020653,0.014386,0.061375,-0.010968,-0.117909,-0.035889,0.006504,0.015329
CCAvg,0.644302,-0.10628,1.0,0.114429,0.135701,0.36813,-0.07711,0.150126,0.000288,0.004966,0.023888
Mortgage,0.210611,-0.020653,0.114429,1.0,0.091747,0.143598,-0.01303,0.043906,0.021045,-0.012633,0.008713
CD_Account,0.169919,0.014386,0.135701,0.091747,1.0,0.312914,0.010973,-0.016942,-0.030362,0.022754,0.039426
Personal_Loan,0.504979,0.061375,0.36813,0.143598,0.312914,1.0,0.095241,-0.153779,-0.03261,0.036866,0.031593
Education_Professional,-0.1027,-0.010968,-0.07711,-0.01303,0.010973,0.095241,1.0,-0.557776,-0.02663,-0.022901,-0.018218
Education_Undergrad,0.209267,-0.117909,0.150126,0.043906,-0.016942,-0.153779,-0.557776,1.0,0.032272,-0.010727,-0.008255
Area_San_Bernardino,0.021637,-0.035889,0.000288,0.021045,-0.030362,-0.03261,-0.02663,0.032272,1.0,-0.052988,-0.01081
Area_Santa_Clara,0.033587,0.006504,0.004966,-0.012633,0.022754,0.036866,-0.022901,-0.010727,-0.052988,1.0,-0.027465


In [105]:
# Positions of all entries whose absolute correlation exceeds 0.9.
# `rows[i]` and `cols[i]` together identify one such entry.
strong_mask = (corr.abs() > 0.9).to_numpy()
rows, cols = np.nonzero(strong_mask)

rows

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [106]:
cols 

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [107]:
corr

Unnamed: 0,Income,Family,CCAvg,Mortgage,CD_Account,Personal_Loan,Education_Professional,Education_Undergrad,Area_San_Bernardino,Area_Santa_Clara,Area_Sonoma
Income,1.0,-0.156079,0.644302,0.210611,0.169919,0.504979,-0.1027,0.209267,0.021637,0.033587,-0.000775
Family,-0.156079,1.0,-0.10628,-0.020653,0.014386,0.061375,-0.010968,-0.117909,-0.035889,0.006504,0.015329
CCAvg,0.644302,-0.10628,1.0,0.114429,0.135701,0.36813,-0.07711,0.150126,0.000288,0.004966,0.023888
Mortgage,0.210611,-0.020653,0.114429,1.0,0.091747,0.143598,-0.01303,0.043906,0.021045,-0.012633,0.008713
CD_Account,0.169919,0.014386,0.135701,0.091747,1.0,0.312914,0.010973,-0.016942,-0.030362,0.022754,0.039426
Personal_Loan,0.504979,0.061375,0.36813,0.143598,0.312914,1.0,0.095241,-0.153779,-0.03261,0.036866,0.031593
Education_Professional,-0.1027,-0.010968,-0.07711,-0.01303,0.010973,0.095241,1.0,-0.557776,-0.02663,-0.022901,-0.018218
Education_Undergrad,0.209267,-0.117909,0.150126,0.043906,-0.016942,-0.153779,-0.557776,1.0,0.032272,-0.010727,-0.008255
Area_San_Bernardino,0.021637,-0.035889,0.000288,0.021045,-0.030362,-0.03261,-0.02663,0.032272,1.0,-0.052988,-0.01081
Area_Santa_Clara,0.033587,0.006504,0.004966,-0.012633,0.022754,0.036866,-0.022901,-0.010727,-0.052988,1.0,-0.027465


In [108]:
# Indices into `rows`/`cols` of the strong correlations that are NOT on the
# diagonal, i.e. between two different predictors. The result is a 1-tuple
# holding the index array.
off_diagonal = np.nonzero(rows != cols)

In [109]:
off_diagonal

(array([], dtype=int64),)

In [None]:
dataset.head()

In [None]:
# Using a variance threshold to select variables.
# This looks only at the variance of each predictor and does not consider
# the relationship between the predictors and the target variable.
# Variances are only comparable when the predictors share a scale, so the
# data has to be normalized first.
from sklearn.preprocessing import MinMaxScaler

# Step 1: variable extraction (X and y).
y_col_name = "Loan_Size"
X_col_names = new_dataset_dummies.drop(columns=["Loan_Size"]).columns
X_values_df = new_dataset_dummies[X_col_names]

# Step 2: do the scaling (MinMaxScaler's default maps each column onto [0, 1]).
scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X_values_df)

# fit_transform returns a plain array here, so rebuild the frame with both the
# original column labels and the original row index; keeping the index means
# the scaled rows stay aligned with the source frame.
X_normalized_df = pd.DataFrame(
    data=X_scaled,
    columns=X_col_names,
    index=X_values_df.index,
)

X_normalized_df.describe()

In [112]:
# Variance thresholding with scikit-learn.
from sklearn.feature_selection import VarianceThreshold

# Build the selector with a threshold of 0.03 and fit it on the normalized
# predictors in one step; `fit` returns the selector itself.
selector = VarianceThreshold(threshold=0.03).fit(X_normalized_df)

# Display the fitted selector.
selector

0,1,2
,threshold,0.03


In [115]:
# Variance of each predictor, as computed by the fitted selector.
column_variances = selector.variances_

# One record per predictor, pairing its name with its variance.
# NOTE: despite the name, `my_dict_2` is a list of dicts. The key spellings
# are kept exactly as they were because they become column labels.
my_dict_2 = [
    {"Variance Name": column_name, "Varianace": column_variance}
    for column_name, column_variance in zip(X_normalized_df.columns, column_variances)
]

In [118]:
# Tabulate the per-predictor variance records: one row per predictor, with
# one column for each key of the records in `my_dict_2`.
my_df = pd.DataFrame(my_dict_2)

In [134]:
# Names of the predictors that pass the variance threshold.
# get_support(indices=True) gives the integer positions of the kept columns,
# which are then mapped back to their labels.
X_normalized_df.columns[selector.get_support(indices=True)]


Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Securities_Account',
       'CD_Account', 'Online', 'Personal_Loan', 'Education_Professional',
       'Education_Undergrad', 'Gender_Male', 'Area_Los_Angeles', 'Area_Orange',
       'Area_Sacramento', 'Area_San_Diego', 'Area_San_Francisco',
       'Area_San_Mateo', 'Area_Santa_Clara'],
      dtype='object')

In [130]:
X_normalized_df.columns[[0, 2, 4]]

Index(['Age', 'Income', 'CCAvg'], dtype='object')

In [121]:
# Restrict the normalized predictors to the columns kept by the selector.
kept_positions = selector.get_support(indices=True)
kept_labels = X_normalized_df.columns[kept_positions]
X_new_df = X_normalized_df[kept_labels]

# Labels of the retained predictors.
X_variable_names = X_new_df.columns

X_new_df.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Securities_Account,CD_Account,Online,Personal_Loan,Education_Professional,Education_Undergrad,Gender_Male,Area_Los_Angeles,Area_Orange,Area_Sacramento,Area_San_Diego,Area_San_Francisco,Area_San_Mateo,Area_Santa_Clara
0,0.068182,0.046512,0.240741,0.333333,0.3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.068182,0.0,0.347222,0.666667,0.39,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.068182,0.0,0.560185,0.666667,0.07,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.068182,0.0,0.574074,0.666667,0.65,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.068182,0.046512,0.574074,0.333333,0.24,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
X_new_df.shape 

(4846, 19)

In [135]:
# Work on a copy of the drop_first-encoded data, leaving `new_dataset_dummies`
# itself untouched.
my_dataset = new_dataset_dummies.copy()

In [145]:
# Variance-threshold selection, end to end, on `my_dataset`.

# Step 1: name the predictors (X) and the target (y).
Y_var_name = "Loan_Size"
X_var_names = my_dataset.drop(columns=["Loan_Size"]).columns

# Step 2: extract the corresponding data.
X_data_df = my_dataset[X_var_names]
y_data = my_dataset[Y_var_name]

# Step 3: normalize the predictors so their variances are comparable.
scaler_object = MinMaxScaler()
X_data_df_scaled = scaler_object.fit_transform(X_data_df)
# Rebuild the frame with the original labels AND the original row index, so
# the scaled rows stay aligned with `y_data`.
X_data_df_normalized = pd.DataFrame(
    data=X_data_df_scaled,
    columns=X_data_df.columns,
    index=X_data_df.index,
)

# Step 4: variance thresholding at 0.03.
my_selector = VarianceThreshold(threshold=0.03)
my_selector.fit(X_data_df_normalized)

# Per-predictor variances computed by the selector.
my_variances = my_selector.variances_

# Keep only the columns that pass the threshold (boolean mask over columns).
my_significant_columns_df = X_data_df_normalized.loc[:, my_selector.get_support()]

# Preview only; the full frame stays available under its name.
my_significant_columns_df.head()


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Securities_Account,CD_Account,Online,Personal_Loan,Education_Professional,Education_Undergrad,Gender_Male,Area_Los_Angeles,Area_Orange,Area_Sacramento,Area_San_Diego,Area_San_Francisco,Area_San_Mateo,Area_Santa_Clara
0,0.068182,0.046512,0.240741,0.333333,0.30,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.068182,0.000000,0.347222,0.666667,0.39,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.068182,0.000000,0.560185,0.666667,0.07,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.068182,0.000000,0.574074,0.666667,0.65,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.068182,0.046512,0.574074,0.333333,0.24,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4841,1.000000,0.953488,0.324074,1.000000,0.24,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4842,1.000000,1.000000,0.328704,1.000000,0.17,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4843,1.000000,1.000000,0.449074,1.000000,0.17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4844,1.000000,0.953488,0.481481,0.000000,0.20,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
def _select_by_variance(frame, threshold):
    """Fit a VarianceThreshold on `frame` and apply it.

    Parameters
    ----------
    frame : DataFrame of predictors (expected to be on a common scale).
    threshold : variance cut-off handed to VarianceThreshold.

    Returns
    -------
    (fitted selector, `frame` restricted to the columns the selector keeps)
    """
    fitted = VarianceThreshold(threshold=threshold).fit(frame)
    kept_labels = frame.columns[fitted.get_support(indices=True)]
    return fitted, frame[kept_labels]


# Repeat the selection with two stricter thresholds to see how many
# predictors survive each one.
my_selector_2, my_significant_columns_df_2 = _select_by_variance(X_data_df_normalized, 0.1)
my_selector_3, my_signifcant_columns_df_3 = _select_by_variance(X_data_df_normalized, 0.15)

# `my_signifcant_columns_df_3` (sic) is kept because other cells refer to that
# spelling; this alias offers the correctly spelled name for new code.
my_significant_columns_df_3 = my_signifcant_columns_df_3

# Variances as reported by each fitted selector.
my_variances_2 = my_selector_2.variances_
my_variances_3 = my_selector_3.variances_

In [150]:
my_significant_columns_df_2.shape

(4846, 8)

In [151]:
my_signifcant_columns_df_3.shape 

(4846, 5)

In [155]:
# Fit an OLS model using only the predictors that passed the 0.03 variance
# threshold. The column NAMES come from the normalized, filtered frame, but
# the model itself is fitted on `my_dataset`.
from statsmodels.formula.api import ols

my_response_variable = "Loan_Size"
my_predictors = my_significant_columns_df.columns.tolist()

# NOTE: this reassigns `new_formular_str`, a name an earlier cell also sets.
new_formular_str = f"{my_response_variable} ~ {'+'.join(my_predictors)}"

my_new_model = ols(formula=new_formular_str, data=my_dataset)
my_new_model_fitted = my_new_model.fit()

print(my_new_model_fitted.summary())

                            OLS Regression Results                            
Dep. Variable:              Loan_Size   R-squared:                       0.917
Model:                            OLS   Adj. R-squared:                  0.916
Method:                 Least Squares   F-statistic:                     2797.
Date:                Wed, 12 Nov 2025   Prob (F-statistic):               0.00
Time:                        22:20:34   Log-Likelihood:                -12156.
No. Observations:                4846   AIC:                         2.435e+04
Df Residuals:                    4826   BIC:                         2.448e+04
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   