## 1. Import necessary libraries 

In [2]:
# import pandas
import pandas as pd


## 2.Load the dataset

In [4]:
# Load the dataset test.csv
df = pd.read_csv('test.csv')
print("Shape of dataset is: ", df.shape)
print("first 5 rows of the data:")
print(df.head())

Shape of dataset is:  (50, 42)
first 5 rows of the data:
   Unnamed: 0         0         1         2         3         4         5  \
0           0 -0.217681  0.357015 -0.622700 -0.828995 -0.493001  0.899600   
1           1 -1.773032  2.644343  1.958347  0.308051  0.496699  0.253740   
2           2  0.484733 -0.180480 -0.252354  0.113270 -1.563191  0.298753   
3           3  1.441273 -1.519370  0.404982 -0.926930  0.917862  1.266911   
4           4  1.091310  1.227669 -1.555896  0.558327  0.833529  1.672572   

          6         7         8  ...        31        32        33        34  \
0  0.610370  0.747294  1.586017  ...  0.681953 -0.589365 -0.151785 -0.310267   
1 -0.343192  0.269127 -1.072743  ... -2.135674 -0.465310 -2.530288  3.137749   
2 -0.668144  0.919229 -0.645964  ... -0.955123  0.883110  0.122670  0.423599   
3 -1.024388 -3.241267  0.504987  ...  0.199060  2.122156 -0.474945 -0.600217   
4 -0.920674  0.538756 -0.581681  ...  2.319330 -1.993736  0.034083  0.393318   


## 3. Split the dependent variables(y) and independent variables(X)

In [6]:
#
X = df.drop('label', axis=1)
y = df['label']
print("Dependent Variable: ", y)
print("Independent Variable: ", X)

Dependent Variable:  0      75.003235
1      20.496200
2       6.252874
3    -107.491031
4    -332.121927
5    -113.377817
6     285.022171
7     199.555146
8     -71.168129
9     -38.082855
10     -9.052701
11    233.596035
12    112.938833
13   -353.175564
14    -45.947598
15     85.377728
16   -140.225808
17    -28.761833
18    -63.034713
19    -78.584249
20     94.149798
21    241.396866
22     23.597307
23    -49.101465
24   -278.680240
25     -9.692361
26    322.406613
27     29.625022
28     31.545839
29    197.701888
30     90.489140
31    313.155970
32     73.999240
33   -237.232155
34     38.363780
35    243.044433
36    361.139563
37    -69.590999
38    106.895522
39   -275.413916
40     51.204852
41   -104.644539
42   -245.076902
43   -107.739257
44    -28.852411
45    121.642369
46    257.528880
47    228.979111
48   -265.592983
49    152.317382
Name: label, dtype: float64
Independent Variable:      Unnamed: 0         0         1         2         3         4         5  \


## 4. Train Test Split


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## 5. Train the model 

In [10]:
# import the LinearRegression model from sklearn
from sklearn.linear_model import LinearRegression

In [11]:
# Train the model 
linear=LinearRegression()
linear.fit(X_train,y_train)

## 6. Predicting the Model on the Test Set

In [13]:
# Predicting the dependent variable values for the test set
y_pred = linear.predict(X_test)
print("The predicted dependent variable value is: ", y_pred)

The predicted dependent variable value is:  [-323.86547401 -189.91403173   37.15752085  120.79370316  195.36548028
 -116.05609269  215.1265104   -33.0984219   122.10932595  -30.67208695]


## 7. Evaluate the model 

In [15]:
# compute mse, rmse, r2_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)

print('The mean squared error is:', mse)
print('The root mean squared error is:', rmse)
print('The mean absolute error is:', mae)
print('The r2_score (accuracy) is:', r2)

The mean squared error is: 10027.56471334102
The root mean squared error is: 100.13772872070257
The mean absolute error is: 76.93628725970743
The r2_score (accuracy) is: 0.7457602741619347


### Is the Model Overfitting? Is the RÂ² Score Low?

If your model performs very well on the training set but poorly on the test set, itâ€™s likely overfitting, meaning the model has learned the noise and details of the training data instead of the general pattern.

When the RÂ² score on the test set is low, it shows that the model isnâ€™t generalizing well to new data.

### Itâ€™s time to apply Regularization!
Regularization techniques like L1 (Lasso) and L2 (Ridge) help reduce overfitting by penalizing large coefficients, leading to a simpler and more stable model.

In [17]:
# importing the necessary libraries
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV


In [18]:
# Define Ridge object
ridge = Ridge()

In [19]:
# Define parameters
parameters = {'alpha': [1,2,3,4,5,6,7,8,9,10,11,12,13,15,18,20]}

### Create GridSearchCV
- Use ``GridSearchCV(estimator=ridge, parameters, cv=5, scoring='neg_mean_squared_error')``

In [21]:
# creating GridSearchCV
grid = GridSearchCV(estimator=ridge, param_grid=parameters, cv=5, scoring='neg_mean_squared_error')

### Fit GridSearchCV
- ``grid.fit(X_train, y_train)`` â€” this finds best alpha by cross-validation.

In [23]:
# Train the model
grid.fit(X_train,y_train)

### Evaluate on test set
- Use ``grid.predict(X_test)`` and evaluate **MSE**, **RMSE**,**MAE** **RÂ²**

In [25]:
# Predict on X_test
pred= grid.predict(X_test)

In [26]:
# Compute metrics
mse = mean_squared_error(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)
r2 = r2_score(y_test,pred)
mae = mean_absolute_error(y_test,pred)

In [27]:
print('The mean squared error is:', mse)
print('The root mean squared error is:', rmse)
print('The mean absolute error is:', mae)
print('The r2_score (accuracy) is:', r2)

The mean squared error is: 3876.915245607147
The root mean squared error is: 62.26487971246027
The mean absolute error is: 55.86889873075019
The r2_score (accuracy) is: 0.9017043621938223


_**Now see the difference how model imporves.**_ ðŸŽ‰

|

**Can you repeat same process for Lasso (L1)?** <br/>
-> Yes sir, I can!

In [31]:
# Defining Lasso Object
lasso = Lasso()

In [32]:
# Defining Parameters
parameters_lasso = {'alpha': [1,2,3,4,5,6,7,8,9,10,11,12,13,15,18,20]}

In [33]:
# creating GridSearchCV
grid_lasso = GridSearchCV(estimator=ridge, param_grid=parameters_lasso, cv=5, scoring='neg_mean_squared_error')

In [34]:
# Train the model
grid_lasso.fit(X_train,y_train)

In [35]:
# predict on X_test
pred_lasso= grid_lasso.predict(X_test)

In [36]:
# compute metrics
mse = mean_squared_error(y_test, pred_lasso)
rmse = root_mean_squared_error(y_test, pred_lasso)
r2 = r2_score(y_test,pred_lasso)
mae = mean_absolute_error(y_test,pred_lasso)

In [37]:
print('The mean squared error is:', mse)
print('The root mean squared error is:', rmse)
print('The mean absolute error is:', mae)
print('The r2_score (accuracy) is:', r2)

The mean squared error is: 3876.915245607147
The root mean squared error is: 62.26487971246027
The mean absolute error is: 55.86889873075019
The r2_score (accuracy) is: 0.9017043621938223
