In [79]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model._ridge import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

In [2]:
df = pd.read_csv("NPPE1_ModelBuilding3.csv")

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.087472,0.002829,1.768235,0.188396,0.117071,0.148148,0.478165,0.720443,0.967195,0.281563,0.428571,0.959596,1.0,0.0,23.358097
1,0.378379,0.022079,1.115629,0.091974,0.066089,0.851852,0.911759,0.785321,0.885001,0.424648,0.285714,0.868687,0.0,1.0,17.268768
2,0.066901,0.003828,-0.536262,0.221188,0.255671,0.296296,0.228024,0.406472,0.980184,0.274376,0.428571,0.767677,0.0,1.0,27.776974
3,0.140645,0.011132,1.323366,0.422514,0.153103,0.148148,0.410679,0.200319,0.861371,0.305006,0.142857,0.848485,0.0,1.0,16.12196
4,0.144225,0.204918,-0.93079,0.148694,0.17749,0.259259,0.146832,0.111429,0.983448,0.286322,0.285714,0.616162,1.0,0.0,23.129426


In [6]:
x = df.drop(columns=['14'])
y = df['14']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [13]:
ridge_model = Ridge(alpha=10, solver='saga', tol=1e-4, random_state=42)
ridge_model.fit(x_train, y_train)

In [14]:
r2_score = ridge_model.score(x_test, y_test)
print(f"R² Score on Test Dataset: {r2_score}")

R² Score on Test Dataset: 0.6613547575262211


In [16]:
coeff = ridge_model.coef_
coeff

array([ -0.4825441 ,   3.74601838,  -0.73583331,   0.54199933,
        -9.89014109,   5.80114296,  -5.06099736,  -9.45015598,
         4.73124885, -23.51321982,  11.31863371,   0.49450664,
        -0.89196134,   0.89196134])

In [20]:
most_imp = np.argmax(np.abs(coeff)) #the feature index with the largest absolute coefficient
least_imp = np.argmin(np.abs(coeff)) # the feature index with the smallest absolute coefficient

In [21]:
most_imp

np.int64(9)

In [22]:
least_imp

np.int64(0)

In [27]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1]
}


In [28]:
sgd = SGDRegressor(random_state=42)

In [33]:
grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1
)

In [34]:
grid_search.fit(x_train, y_train)

In [35]:
best_model = grid_search.best_estimator_
best_param = grid_search.best_params_

In [36]:
best_model

In [37]:
best_param

{'alpha': 0.001, 'penalty': 'l2', 'tol': 0.0001}

In [41]:
y_pred = best_model.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on Test Dataset: {mae}")

Mean Absolute Error on Test Dataset: 3.8131121797994014


In [48]:
pipe = Pipeline([
    ('pca', PCA()),
    ('lasso', Lasso())
])

In [55]:
params = {
    'pca__n_components': [0.9, 0.95],
    'lasso__alpha': [10, 1, 0.01, 0.001]
}

In [56]:
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1
)

In [57]:
grid.fit(x_train, y_train)

In [59]:
best_pipe = grid.best_estimator_
best_pipe

In [62]:
best_params = grid.best_params_
best_params

{'lasso__alpha': 0.01, 'pca__n_components': 0.95}

In [63]:
y_pr = best_pipe.predict(x_test)

In [65]:
r2_score_test = r2_score(y_test, y_pred)
r2_score_test

0.6613421396890455

In [68]:
pca_model = best_pipe.named_steps['pca']
var_exp_comp1 = pca_model.explained_variance_ratio_[0]
var_exp_comp1

np.float64(0.6993757201670407)

In [72]:
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, interaction_only=False)),
    ('lasso',Lasso(alpha=1, warm_start=True, random_state=0))
])

In [73]:
pipeline.fit(x_train, y_train)

In [74]:
y_predict = pipeline.predict(x_test)

In [78]:
r2_score_tests = r2_score(y_test, y_predict)
r2_score_tests

0.157678032410551

In [80]:
lr = LinearRegression()

In [83]:
rfe = RFE(estimator=lr, n_features_to_select=x_train.shape[1]-1)
#X_train.shape[1]: This gives the total number of features (columns) in the training dataset X_train.
#X_train.shape[1] - 1: By subtracting 1, we are specifying that one feature will be eliminated, and the remaining features will be retained.
rfe.fit(x_train, y_train)

In [84]:
eliminated_feature_index = list(rfe.support_).index(False)
eliminated_feature_index

2

In [85]:
rfe.support_
# False implies that column has been eliminated => 2nd index

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])