In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('combined-training.csv')
scaler = StandardScaler()

#Removing outliers from X2, X3, X4, X5, and X7 using Z-score method
z_scores = np.abs(stats.zscore(df[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']]))
df_no_outliers = df[(z_scores < 3).all(axis=1)]

#Standardizing the data
scaled_features = scaler.fit_transform(df_no_outliers[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']])

#Creating a new DataFrame with the standardized data
df_processed = pd.DataFrame(scaled_features, columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'])

In [2]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Calculating VIF for each feature
vif = pd.DataFrame()
vif["Feature"] = df_processed.columns
vif["VIF value"] = [variance_inflation_factor(df_processed.values, i) for i in range(df_processed.shape[1])]

print(vif)

  Feature  VIF value
0      x1   5.033925
1      x2   1.204054
2      x3   1.000111
3      x4   1.400975
4      x5   1.547530
5      x6   5.087386
6      x7   1.530043


In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

#Creating a linear regression model
model = LinearRegression()

#Selecting the 4 best features
rfe = RFE(estimator=model, n_features_to_select=4)
rfe.fit(df_processed, df_no_outliers['BIS'])

#Printing the selected features
selected_features = df_processed.columns[rfe.support_]
print(selected_features)

Index(['x1', 'x4', 'x5', 'x7'], dtype='object')


In [6]:
from sklearn.metrics import r2_score, mean_squared_error

df_test = pd.read_csv('combined-test.csv')

X_train = df_processed[['x1', 'x4', 'x5', 'x7']]  # Removed x6 based on multicollinearity
Y_train = df_no_outliers['BIS']

#Standardizing the test data
X_test = scaler.fit_transform(df_test[['x1', 'x4', 'x5', 'x7']])
Y_test = df_test['BIS']

#Creating a linear regression model
model = LinearRegression()
model.fit(X_train, Y_train)

#Predicting the test data
Y_pred = model.predict(X_test)

#Calculating R-squared and Mean Squared Error (MSE)
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)

print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')

#Calculating the coefficients and intercept of the model
coefficients = model.coef_
intercept = model.intercept_

#Printing the model equation
print("\nModel Equation:")
print(f"DoA Index = {intercept} + ({coefficients[0]} * x1) + ({coefficients[1]} * x4) + ({coefficients[2]} * x5) + ({coefficients[3]} * x7)")

R-squared: 0.6703
Mean Squared Error: 135.7432

Model Equation:
DoA Index = 40.88536034603397 + (2.670220905421646 * x1) + (7.274514906369749 * x4) + (1.841585521578302 * x5) + (-4.188347205091275 * x7)




In [9]:
from scipy.stats import pearsonr

correlation = pearsonr(Y_test, Y_pred)
print(f"Pearson correlation for Regression Model : {correlation[0]}")

Pearson correlation for Regression Model : 0.8737576565768335
