# Linear Models

Build a baseline a linear regression models to predict scores.

RMSE is tha main metric for model evaluation.

## Imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
data = pd.read_csv('../data/merged_data.csv')
data.drop(columns='Unnamed: 0', inplace=True)
data

Unnamed: 0,wave_height,dominant_period,avg_period,dominant_wave_direction_sin,dominant_wave_direction_cos,wave_height_1.0_h,dominant_period_1.0_h,avg_period_1.0_h,dominant_wave_direction_sin_1.0_h,dominant_wave_direction_cos_1.0_h,...,wave_height_9.0_h,dominant_period_9.0_h,avg_period_9.0_h,dominant_wave_direction_sin_9.0_h,dominant_wave_direction_cos_9.0_h,score,wind_speed,gust_speed,wind_direction_sin,wind_direction_cos
0,2.56,11.76,9.82,-0.681998,0.731354,2.52,13.33,9.60,-0.669131,0.743145,...,2.94,13.33,10.56,-0.719340,0.694658,6.440000,0.60,1.05,0.410719,0.911762
1,2.46,13.33,9.98,-0.681998,0.731354,2.40,13.33,9.99,-0.669131,0.743145,...,2.80,14.29,10.01,-0.731354,0.681998,7.266667,0.28,0.82,0.671721,-0.740805
2,2.20,12.50,9.79,-0.681998,0.731354,2.56,11.76,9.82,-0.681998,0.731354,...,2.71,13.33,9.84,-0.681998,0.731354,7.223333,0.20,0.78,0.972776,-0.231748
3,2.33,11.76,9.75,-0.681998,0.731354,2.46,13.33,9.98,-0.681998,0.731354,...,2.67,13.33,9.67,-0.694658,0.719340,15.180000,0.28,0.70,0.829038,0.559193
4,2.48,11.76,10.13,-0.529919,0.848048,2.20,12.50,9.79,-0.681998,0.731354,...,2.65,14.29,9.92,-0.719340,0.694658,4.786667,0.52,0.90,-0.923210,0.384295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2.00,13.00,11.00,-0.681998,0.731354,1.90,13.00,11.00,-0.882948,0.469472,...,2.10,13.00,10.80,-0.669131,0.743145,17.300000,0.00,0.60,0.000000,1.000000
344,2.00,13.00,11.30,-0.819152,0.573576,1.90,13.00,10.60,-0.766044,0.642788,...,1.80,13.00,10.70,-0.719340,0.694658,6.700000,0.70,1.54,-0.961262,-0.275637
345,2.10,13.00,11.40,-0.819152,0.573576,2.00,13.00,11.00,-0.681998,0.731354,...,1.80,13.00,10.70,-0.587785,0.809017,9.630000,1.40,2.20,-0.743145,0.669131
346,2.10,13.00,11.20,-0.838671,0.544639,2.00,13.00,11.30,-0.819152,0.573576,...,1.90,13.00,10.80,-0.743145,0.669131,9.915000,2.18,2.70,-0.719340,0.694658


## X, y

In [4]:
# split data into input and output columns
X = data.drop(columns="score")
y = data["score"]

In [31]:
X.columns

Index(['wave_height', 'dominant_period', 'avg_period',
       'dominant_wave_direction_sin', 'dominant_wave_direction_cos',
       'wave_height_1.0_h', 'dominant_period_1.0_h', 'avg_period_1.0_h',
       'dominant_wave_direction_sin_1.0_h',
       'dominant_wave_direction_cos_1.0_h', 'wave_height_1.5_h',
       'dominant_period_1.5_h', 'avg_period_1.5_h',
       'dominant_wave_direction_sin_1.5_h',
       'dominant_wave_direction_cos_1.5_h', 'wave_height_3.0_h',
       'dominant_period_3.0_h', 'avg_period_3.0_h',
       'dominant_wave_direction_sin_3.0_h',
       'dominant_wave_direction_cos_3.0_h', 'wave_height_6.0_h',
       'dominant_period_6.0_h', 'avg_period_6.0_h',
       'dominant_wave_direction_sin_6.0_h',
       'dominant_wave_direction_cos_6.0_h', 'wave_height_9.0_h',
       'dominant_period_9.0_h', 'avg_period_9.0_h',
       'dominant_wave_direction_sin_9.0_h',
       'dominant_wave_direction_cos_9.0_h', 'wind_speed', 'gust_speed',
       'wind_direction_sin', 'wind_directio

## Standard Scaling

In [5]:
ss = StandardScaler()
X_sc = ss.fit_transform(X)

## Train, test split

In [6]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, random_state=1331)

## Baseline

In [7]:
y_baseline_preds = np.full_like(y, y.mean()) 
baseline_rmse = mean_squared_error(y, y_baseline_preds, squared=False)
baseline_rmse

3.1675113889842983

## Create list of dicts to save model results

In [8]:
all_model_results = []


### Add baseline score

In [9]:
baseline_results = {
    "Model Name": "Baseline (y_mean)",
    "Score Type": 'Baseline RMSE',
    "Score": baseline_rmse,
    "Observations": "",
}
all_model_results.append(baseline_results)
all_model_results

[{'Model Name': 'Baseline (y_mean)',
  'Score Type': 'Baseline RMSE',
  'Score': 3.1675113889842983,
  'Observations': ''}]

## Linear Regression

In [10]:
# Instantiate model
linreg = LinearRegression()

# Fit on training set
linreg.fit(X_train, y_train)

# Score on training and testing set
print(f"R-squared score on train set: {linreg.score(X_train, y_train)}")
print(f"R-squared score on test set: {linreg.score(X_test, y_test)}")

R-squared score on train set: 0.22190748542397942
R-squared score on test set: 0.049404168666461246


In [11]:
# Review strongest coefficients
pd.DataFrame(linreg.coef_, X.columns).sort_values(by=0, ascending=False).head()

Unnamed: 0,0
avg_period_1.0_h,1.450338
wave_height_1.5_h,0.842378
dominant_wave_direction_cos,0.783223
dominant_wave_direction_sin_6.0_h,0.707868
avg_period_9.0_h,0.697566


In [12]:
# Review strongest coefficients
pd.DataFrame(linreg.coef_, X.columns).sort_values(by=0, ascending=False).tail()

Unnamed: 0,0
dominant_wave_direction_cos_6.0_h,-0.644084
dominant_wave_direction_sin_3.0_h,-0.817508
wave_height_1.0_h,-1.011872
avg_period,-1.051983
wave_height_3.0_h,-1.228831


### Linear Regression RMSE

In [13]:
linreg_train_rmse = mean_squared_error(y_train, linreg.predict(X_train), squared=False)
linreg_train_rmse

2.8378314204310477

In [14]:
linreg_test_rmse = mean_squared_error(y_test, linreg.predict(X_test), squared=False)
linreg_test_rmse

2.9195332929845854

### Save model result for final comparisson

In [15]:
linreg_results = [
    {
        "Model Name": "Linear Regression",
        "Score Type": "Train RMSE",
        "Score": linreg_train_rmse,
        "Observations": "",
    },
    {
        "Model Name": "Linear Regression",
        "Score Type": "Test RMSE",
        "Score": linreg_test_rmse,
        "Observations": "",
    },
]

for result in linreg_results:
    all_model_results.append(result)


## Lasso Regression with Grid Search CV

In [16]:
# Empty dict for results
lasso_results = {}

# Instantiate model
lasso = Lasso()

# Define GS params
lasso_params = {"alpha": [0.01, 0.1, 1, 10, 100]}

# Instantiate GridSearch
lasso_gs = GridSearchCV(lasso, lasso_params)

# Fit on training set
lasso_gs.fit(X_train, y_train)

# Score on training and testing set
print(f"R-squared score on train set: {lasso_gs.score(X_train, y_train)}")
print(f"R-squared score on test set: {lasso_gs.score(X_test, y_test)}")


R-squared score on train set: 0.17437225944521462
R-squared score on test set: 0.11306336319152299


In [17]:
# Get Lasso best alpha
lasso_alpha = lasso_gs.best_params_
lasso_alpha

{'alpha': 0.1}

In [18]:
# Review coefficients
lasso_coef = pd.DataFrame(lasso_gs.best_estimator_.coef_, X.columns).sort_values(by=0, ascending=False)
lasso_coef = lasso_coef[lasso_coef[0] != 0]
lasso_coef

Unnamed: 0,0
avg_period_9.0_h,0.45346
avg_period_6.0_h,0.284672
wind_direction_cos,0.261481
dominant_wave_direction_cos,0.260206
wind_direction_sin,0.080331
avg_period_1.0_h,0.026621
wind_speed,-0.013706
dominant_wave_direction_cos_6.0_h,-0.219899
dominant_period_1.5_h,-0.392921
dominant_period_6.0_h,-0.452037


### Lasso Regression RMSE

In [19]:
lasso_train_rmse = mean_squared_error(y_train, lasso_gs.best_estimator_.predict(X_train), squared=False)
lasso_train_rmse

2.9232308445850683

In [20]:
lasso_test_rmse = mean_squared_error(y_test, lasso_gs.best_estimator_.predict(X_test), squared=False)
lasso_test_rmse

2.8200822609798704

### Save model result for final comparisson

In [21]:
lasso_results = [
    {
        "Model Name": "Lasso Regression",
        "Score Type": "Train RMSE",
        "Score": lasso_train_rmse,
        "Observations": lasso_alpha,
    },
    {
        "Model Name": "Lasso Regression",
        "Score Type": "Test RMSE",
        "Score": lasso_test_rmse,
        "Observations": lasso_alpha,
    },
]

for result in lasso_results:
    all_model_results.append(result)


### Save model pkl for streamlit

In [22]:
# With help from 6.07 lesson
import pickle
with open("../saved-models/saved-lasso.pkl", "wb") as file:
    pickle.dump(lasso_gs.best_estimator_, file)

## Ridge Regression with Grid Search CV

In [23]:
# Empty dict for results
ridge_results = {}

# Instantiate model
ridge = Ridge()

# Define GS params
ridge_params = {'alpha': [100, 200, 500, 1_000]}

# Instantiate GridSearch
ridge_gs = GridSearchCV(ridge, ridge_params)

# Fit on training set
ridge_gs.fit(X_train, y_train)

# Score on training and testing set
print(f"R-squared score on train set: {ridge_gs.score(X_train, y_train)}")
print(f"R-squared score on test set: {ridge_gs.score(X_test, y_test)}")




R-squared score on train set: 0.15691405240848066
R-squared score on test set: 0.10476007102332119


In [24]:
# Get Ridge best alpha
ridge_alpha = ridge_gs.best_params_
ridge_alpha

{'alpha': 200}

In [25]:
# Review coefficients
ridge_coef = pd.DataFrame(ridge_gs.best_estimator_.coef_, X.columns).sort_values(by=0, ascending=False)
ridge_coef = ridge_coef[ridge_coef[0] != 0]
ridge_coef

Unnamed: 0,0
avg_period_9.0_h,0.250481
avg_period_6.0_h,0.191522
wind_direction_cos,0.166979
dominant_wave_direction_cos,0.153214
avg_period_1.0_h,0.12114
wind_direction_sin,0.104503
avg_period_1.5_h,0.098757
dominant_wave_direction_sin,0.083246
dominant_wave_direction_sin_6.0_h,0.035414
dominant_wave_direction_cos_3.0_h,0.013261


### Ridge Regression RMSE

In [26]:
ridge_train_rmse = mean_squared_error(y_train, ridge_gs.best_estimator_.predict(X_train), squared=False)
ridge_train_rmse

2.953975571786676

In [27]:
ridge_test_rmse = mean_squared_error(y_test, ridge_gs.best_estimator_.predict(X_test), squared=False)
ridge_test_rmse

2.833251983216231

### Save model result for final comparisson

In [28]:
ridge_results = [
    {
        "Model Name": "Ridge Regression",
        "Score Type": "Train RMSE",
        "Score": ridge_train_rmse,
        "Observations": ridge_alpha,
    },
    {
        "Model Name": "Ridge Regression",
        "Score Type": "Test RMSE",
        "Score": ridge_test_rmse,
        "Observations": ridge_alpha,
    },
]

for result in ridge_results:
    all_model_results.append(result)
all_model_results

[{'Model Name': 'Baseline (y_mean)',
  'Score Type': 'Baseline RMSE',
  'Score': 3.1675113889842983,
  'Observations': ''},
 {'Model Name': 'Linear Regression',
  'Score Type': 'Train RMSE',
  'Score': 2.8378314204310477,
  'Observations': ''},
 {'Model Name': 'Linear Regression',
  'Score Type': 'Test RMSE',
  'Score': 2.9195332929845854,
  'Observations': ''},
 {'Model Name': 'Lasso Regression',
  'Score Type': 'Train RMSE',
  'Score': 2.9232308445850683,
  'Observations': {'alpha': 0.1}},
 {'Model Name': 'Lasso Regression',
  'Score Type': 'Test RMSE',
  'Score': 2.8200822609798704,
  'Observations': {'alpha': 0.1}},
 {'Model Name': 'Ridge Regression',
  'Score Type': 'Train RMSE',
  'Score': 2.953975571786676,
  'Observations': {'alpha': 200}},
 {'Model Name': 'Ridge Regression',
  'Score Type': 'Test RMSE',
  'Score': 2.833251983216231,
  'Observations': {'alpha': 200}}]

# Linear Regression Results

In [29]:
pd.DataFrame(all_model_results)

Unnamed: 0,Model Name,Score Type,Score,Observations
0,Baseline (y_mean),Baseline RMSE,3.167511,
1,Linear Regression,Train RMSE,2.837831,
2,Linear Regression,Test RMSE,2.919533,
3,Lasso Regression,Train RMSE,2.923231,{'alpha': 0.1}
4,Lasso Regression,Test RMSE,2.820082,{'alpha': 0.1}
5,Ridge Regression,Train RMSE,2.953976,{'alpha': 200}
6,Ridge Regression,Test RMSE,2.833252,{'alpha': 200}


## Save results to csv

In [30]:
pd.DataFrame(all_model_results).to_csv('../model-results/linreg.csv')