In [1]:
pip install pandas numpy scikit-learn statsmodels

[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
df = pd.read_csv('credit_score.csv')
print(df.head())

   income  credit_score  target  job_title_Doctor  job_title_Engineer  \
0    0.36         -1.55       0             False               False   
1   -0.12         -0.49       0             False               False   
2    0.48         -0.40       1             False                True   
3    1.15         -0.90       0             False               False   
4   -0.20         -0.21       0             False                True   

   job_title_Teacher  income_log  
0               True    0.310466  
1               True   -0.130651  
2              False    0.391867  
3              False    0.765678  
4              False   -0.218092  


In [14]:
df = df.dropna()

In [15]:
# Define features and target variable
X = df[['credit_score', 'income_log']]
y = df['target']

In [16]:
# Fit the model using Ordinary Least Squares (OLS)
model = sm.OLS(y, X).fit()

# Check p-values and remove the feature with the highest p-value if greater than 0.05
while True:
    max_p_value = model.pvalues.max()  # Get the highest p-value
    if max_p_value > 0.05:
        feature_to_remove = model.pvalues.idxmax()  # Identify the feature with the highest p-value
        print(f"Removing feature: {feature_to_remove} with p-value: {max_p_value}")
        X = X.drop(columns=[feature_to_remove])  # Drop the feature
        model = sm.OLS(y, X).fit()  # Refit the model
    else:
        break  # Exit the loop if all p-values are below 0.05

# Display the final model summary
print(model.summary())

Removing feature: income_log with p-value: 0.8644595978701624
                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.186
Model:                            OLS   Adj. R-squared (uncentered):              0.177
Method:                 Least Squares   F-statistic:                              19.70
Date:                Sun, 11 Jan 2026   Prob (F-statistic):                    2.68e-05
Time:                        16:33:32   Log-Likelihood:                         -88.607
No. Observations:                  87   AIC:                                      179.2
Df Residuals:                      86   BIC:                                      181.7
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|  

In [17]:
def forward_selection(X, y):
    remaining_features = set(X.columns)
    selected_features = []
    current_score = 0.0
    
    while remaining_features:
        scores_with_candidates = []
        
        for feature in remaining_features:
            features_to_test = selected_features + [feature]
            X_train, X_test, y_train, y_test = train_test_split(X[features_to_test], y, test_size=0.2, random_state=42)
            
            # Train the model
            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = r2_score(y_test, y_pred)
            
            scores_with_candidates.append((score, feature))
        
        # Select the feature with the highest score
        scores_with_candidates.sort(reverse=True)
        best_score, best_feature = scores_with_candidates[0]
        
        if current_score < best_score:
            remaining_features.remove(best_feature)
            selected_features.append(best_feature)
            current_score = best_score
        else:
            break
    
    return selected_features

best_features = forward_selection(X, y)
print(f"Selected features using Forward Selection: {best_features}")

Selected features using Forward Selection: ['credit_score']


In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LASSO model with alpha (regularization parameter)
lasso_model = Lasso(alpha=0.1)

# Train the LASSO model
lasso_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = lasso_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R-squared score: {r2}')

# Display the coefficients of the features
print(f'LASSO Coefficients: {lasso_model.coef_}')

R-squared score: 0.5695785562028565
LASSO Coefficients: [0.25676197]
