In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

# The seven predictor columns used for outlier filtering and scaling.
feature_cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']

df = pd.read_csv('combined-training.csv')

# Z-score outlier filter: take |z| of every predictor column and keep only the
# rows in which all seven values are strictly below 3.
# NOTE(review): this cell used to be described as filtering only X2, X3, X4,
# X5 and X7, but the code filters on x1..x7 -- confirm which was intended.
z_scores = np.abs(stats.zscore(df[feature_cols]))
keep_mask = (z_scores < 3).all(axis=1)
df_no_outliers = df[keep_mask]

# Standardize the predictors of the surviving rows; the scaler is fitted on
# exactly those rows.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_no_outliers[feature_cols])

# Wrap the scaled values in a DataFrame with the original column names.
# Only the predictor columns are carried over, and no index is passed, so the
# result does not reuse df_no_outliers' row labels.
df_processed = pd.DataFrame(scaled_features, columns=feature_cols)


In [2]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for each standardized predictor: one table row per
# column of df_processed, in column order.
design = df_processed.values
vif = pd.DataFrame({
    "Feature": df_processed.columns,
    "VIF value": [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ],
})

print(vif)


  Feature  VIF value
0      x1   5.033925
1      x2   1.204054
2      x3   1.000111
3      x4   1.400975
4      x5   1.547530
5      x6   5.087386
6      x7   1.530043


In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Test set: response (BIS) and the seven predictors read from the test CSV.
df_test = pd.read_csv('combined-test.csv')
Y_test = df_test['BIS']
X_test = df_test[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']]

# Training set: taken from the outlier-filtered frame. All seven predictors,
# x6 included, are offered to the selectors below.
Y_train = df_no_outliers['BIS']
X_train = df_no_outliers[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']]

# Linear regression is the base estimator passed to both selectors.
model = LinearRegression()

# Options shared by the forward and backward runs (r2 scoring).
# NOTE(review): per the scikit-learn docs, n_features_to_select='auto' with no
# `tol` selects a fixed share of the features rather than stopping on score
# change -- verify against the installed version.
sfs_options = dict(n_features_to_select='auto', scoring='r2')

# Forward selection.
sfs = SequentialFeatureSelector(model, direction='forward', **sfs_options).fit(X_train, Y_train)
selected_features_fw = X_train.columns[sfs.get_support()]
print(f"Selected Features From Forward Selection: {selected_features_fw}")

# Backward elimination.
sbs = SequentialFeatureSelector(model, direction='backward', **sfs_options).fit(X_train, Y_train)
selected_features_bw = X_train.columns[sbs.get_support()]
print(f"Selected Features From Backward Elimination: {selected_features_bw}")


Selected Features From Forward Selection: Index(['x4', 'x6', 'x7'], dtype='object')
Selected Features From Backward Elimination: Index(['x1', 'x4', 'x5', 'x7'], dtype='object')


In [10]:
# Forward-selection model: restrict both splits to the forward-selected
# columns, then fit OLS on the training rows with an added constant column.
X_train_fw = X_train[selected_features_fw]
X_test_fw = X_test[selected_features_fw]
fw_design = sm.add_constant(X_train_fw)
fw_model = sm.OLS(Y_train, fw_design).fit()

# Backward-elimination model: same procedure with the backward-selected columns.
X_train_bw = X_train[selected_features_bw]
X_test_bw = X_test[selected_features_bw]
bw_design = sm.add_constant(X_train_bw)
bw_model = sm.OLS(Y_train, bw_design).fit()

# Print each summary under its banner, with a blank separator between reports.
reports = [
    ("-------------------Forward Selection Model-------------------", fw_model),
    ("-------------------Backward Elimination Model-------------------", bw_model),
]
for position, (banner, fitted) in enumerate(reports):
    if position:
        print("\n")
    print(banner)
    print(fitted.summary())




-------------------Forward Selection Model-------------------
                            OLS Regression Results                            
Dep. Variable:                    BIS   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.611
Method:                 Least Squares   F-statistic:                 1.645e+04
Date:                Fri, 13 Dec 2024   Prob (F-statistic):               0.00
Time:                        13:04:55   Log-Likelihood:            -1.1377e+05
No. Observations:               31442   AIC:                         2.275e+05
Df Residuals:                   31438   BIC:                         2.276e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------