## Hyper-parameter Tuning

In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os

In [20]:
# Switch the working directory to the folder that holds the CSV files, so the
# relative pd.read_csv(...) calls later in this notebook resolve.
# The location can be overridden with the PML_DATA_DIR environment variable;
# when it is unset (or empty) the original hard-coded path is used, so
# existing setups behave exactly as before.
os.chdir(os.environ.get("PML_DATA_DIR") or "D:/meridianthe4/PML/Datasets")

In [21]:
# Load the Boston data: 'medv' is the regression target and every other
# column is kept as a predictor.
boston = pd.read_csv("Boston.csv")
y = boston['medv']
X = boston.drop(columns='medv')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [22]:
# Sweep a hand-picked grid of Ridge penalties. For each alpha a fresh model is
# fitted on the training split and its mean squared error on the held-out
# split is recorded (lower is better).
# NOTE(review): alpha is compared on the same held-out split that would be
# used to report final performance.
alphas = [0.001, 0.01, 0.1, 1, 1.5, 2.5, 5, 10]
scores = []

for a in alphas:
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append([a, mean_squared_error(y_test, y_pred)])

# One row per alpha, displayed with the smallest MSE first.
df_scores = pd.DataFrame(scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
0,0.001,21.518121
1,0.01,21.524213
2,0.1,21.585116
3,1.0,22.044053
4,1.5,22.193243
5,2.5,22.377774
6,5.0,22.57618
7,10.0,22.695335


## Housing

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

In [24]:
# Load the Housing data and separate the 'price' target from the predictors.
housing = pd.read_csv("Housing.csv")
X = housing.drop(columns='price')
y = housing['price']

In [25]:
# One-hot encode every object-dtype column, dropping the first level of each,
# and pass all remaining columns through untouched. sparse_output=False gives
# a dense result, which the pandas output setting needs.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform="pandas")

# verbose_feature_names_out=False keeps the output column names free of the
# "OHE" / "remainder" transformer prefixes.
col_transformer = ColumnTransformer(
    transformers=[("OHE", ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# NOTE(review): the encoder is fitted on the full feature table, before the
# train/test split in the next cell, and X is overwritten with the encoded
# DataFrame.
X = col_transformer.fit_transform(X)

In [26]:
# Hold out 30% of the encoded rows for evaluation, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [27]:
# Evaluate Ridge over a fixed list of penalties on the current train/test
# split: fit on the training rows, predict the held-out rows, and store
# [alpha, test MSE] for each candidate.
alphas = [0.001, 0.01, 0.1, 1, 1.5, 2.5, 5, 10]
scores = []

for a in alphas:
    ridge = Ridge(alpha=a)
    y_pred = ridge.fit(X_train, y_train).predict(X_test)
    scores.append([a, mean_squared_error(y_true=y_test, y_pred=y_pred)])

# Tabulate the sweep and show it ordered from lowest to highest MSE.
df_scores = pd.DataFrame(data=scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
7,10.0,269840800.0
6,5.0,269993300.0
5,2.5,270874500.0
4,1.5,271468800.0
3,1.0,271833300.0
2,0.1,272624500.0
1,0.01,272714400.0
0,0.001,272723500.0


In [28]:
# Fixed-step grid: values starting at 0.001 and advancing by 0.5 while they
# stay below 15 (the stop value itself is excluded).
np.arange(0.001, 15, step=0.5)

array([1.0000e-03, 5.0100e-01, 1.0010e+00, 1.5010e+00, 2.0010e+00,
       2.5010e+00, 3.0010e+00, 3.5010e+00, 4.0010e+00, 4.5010e+00,
       5.0010e+00, 5.5010e+00, 6.0010e+00, 6.5010e+00, 7.0010e+00,
       7.5010e+00, 8.0010e+00, 8.5010e+00, 9.0010e+00, 9.5010e+00,
       1.0001e+01, 1.0501e+01, 1.1001e+01, 1.1501e+01, 1.2001e+01,
       1.2501e+01, 1.3001e+01, 1.3501e+01, 1.4001e+01, 1.4501e+01])

In [29]:
# Fixed-count grid: 20 evenly spaced values from 0.001 to 15, both endpoints
# included.
np.linspace(start=0.001, stop=15, num=20)

array([1.00000000e-03, 7.90421053e-01, 1.57984211e+00, 2.36926316e+00,
       3.15868421e+00, 3.94810526e+00, 4.73752632e+00, 5.52694737e+00,
       6.31636842e+00, 7.10578947e+00, 7.89521053e+00, 8.68463158e+00,
       9.47405263e+00, 1.02634737e+01, 1.10528947e+01, 1.18423158e+01,
       1.26317368e+01, 1.34211579e+01, 1.42105789e+01, 1.50000000e+01])

In [30]:
# Finer sweep: 30 evenly spaced Ridge penalties between 0.001 and 15
# (endpoints included). Each candidate is fitted on the training split and
# scored by mean squared error on the held-out split.
alphas = np.linspace(0.001, 15, num=30)
scores = []

for a in alphas:
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append([a, mean_squared_error(y_test, y_pred)])

# One row per alpha, best (smallest MSE) first.
df_scores = pd.DataFrame(scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score', ascending=True)

Unnamed: 0,alpha,score
15,7.759103,269712300.0
16,8.27631,269718000.0
14,7.241897,269722900.0
17,8.793517,269738900.0
13,6.72469,269751200.0
18,9.310724,269773900.0
12,6.207483,269798700.0
19,9.827931,269822000.0
11,5.690276,269866800.0
20,10.345138,269882500.0


## Exp Salary

In [31]:
# Load the salaries data; 'Salary' is the target and the remaining columns
# are the predictors.
sals = pd.read_csv("Exp_Salaries.csv")
X = sals.drop(columns='Salary')
y = sals['Salary']

In [32]:
# Encoder for the object-dtype columns: dense one-hot output with the first
# level of each column dropped, returned as a pandas DataFrame.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe = ohe.set_output(transform="pandas")

# Apply the encoder to the object-dtype columns only; every other column is
# passed through as-is, and output column names carry no transformer prefix.
col_transformer = ColumnTransformer(
    [("OHE", ohe, make_column_selector(dtype_include=object))],
    verbose_feature_names_out=False,
    remainder='passthrough',
)
col_transformer = col_transformer.set_output(transform="pandas")

# NOTE(review): fitting happens on the whole feature table, ahead of the
# train/test split in the following cell; X is replaced by the encoded frame.
X = col_transformer.fit_transform(X)

In [33]:
# 70/30 train/test partition of the encoded features and target; the seed is
# fixed so the same rows are held out on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=25,
)

In [34]:
# Sweep 30 evenly spaced Ridge penalties from 0.001 to 15 on the current
# train/test split, collecting [alpha, held-out MSE] for every candidate.
alphas = np.linspace(start=0.001, stop=15, num=30)
scores = []

for a in alphas:
    ridge = Ridge(alpha=a)
    y_pred = ridge.fit(X_train, y_train).predict(X_test)
    scores.append([a, mean_squared_error(y_true=y_test, y_pred=y_pred)])

# Summarise the sweep as a table ordered by increasing MSE.
df_scores = pd.DataFrame(data=scores, columns=['alpha', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,alpha,score
0,0.001,13384010.0
1,0.518207,14573820.0
2,1.035414,15804700.0
3,1.552621,16856820.0
4,2.069828,17729900.0
5,2.587034,18456290.0
6,3.104241,19067250.0
7,3.621448,19587640.0
8,4.138655,20036410.0
9,4.655862,20427830.0
