# Package Management

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import StandardScaler

In [13]:
from sklearn.datasets import fetch_california_housing

# Data Load

In [15]:
data = fetch_california_housing(as_frame = True)
df = data.frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [16]:
df.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
8841,3.5659,24.0,3.634125,1.07168,3148.0,1.384345,34.09,-118.38,2.813
19950,3.5611,12.0,6.175373,1.084577,2191.0,2.725124,36.23,-119.34,0.902
19643,6.1324,20.0,8.22093,1.325581,282.0,3.27907,37.54,-120.82,1.648
6085,5.2405,36.0,6.335,0.955,640.0,3.2,34.09,-117.87,2.2
18669,2.6875,16.0,5.303523,1.105691,865.0,2.344173,36.98,-121.91,2.333


# Data Prep

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [18]:
X = df.drop(columns = 'MedHouseVal', axis = 1)
y = df['MedHouseVal']

In [19]:
X.shape, y.shape

((20640, 8), (20640,))

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 8), (4128, 8), (16512,), (4128,))

# Scaling

## Scaling data

In [25]:
scaler = StandardScaler()
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [26]:
scaler.fit(X_train)
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [27]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
pd.DataFrame(X_train_scaled).describe() #Data is scaled keeping mean ~ 0 and std ~ 1

Unnamed: 0,0,1,2,3,4,5,6,7
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-6.411753000000001e-17,-1.678244e-17,1.790127e-16,-6.89371e-16,0.0,1.032766e-17,3.38575e-15,1.776787e-15
std,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003
min,-1.775438,-2.190766,-1.904386,-1.762117,-1.251913,-0.2076846,-1.447697,-2.377207
25%,-0.6900689,-0.8417859,-0.4118373,-0.2081645,-0.560634,-0.05770769,-0.8018107,-1.110749
50%,-0.1758995,0.03108328,-0.08350905,-0.109416,-0.228186,-0.02415892,-0.6473597,0.5346501
75%,0.4686502,0.6658972,0.2621376,0.008455177,0.263449,0.01580865,0.9720351,0.783953
max,5.839268,1.856173,57.16655,56.64727,30.127428,107.1164,2.951816,2.628794


## Train Models on Scaled data

In [29]:
lr_scaled = LinearRegression()
lr_scaled.fit(X_train_scaled, y_train)

y_pred_lr_scaled =  lr_scaled.predict(X_test_scaled)
lr_rmse_scaled = root_mean_squared_error(y_test, y_pred_lr_scaled)
lr_rmse_scaled

0.7455813830127764

In [30]:
r_scaled = Ridge(alpha = 0.1)
r_scaled.fit(X_train_scaled, y_train)

y_pred_r_scaled = r_scaled.predict(X_test_scaled)
r_rmse_scaled = root_mean_squared_error(y_test, y_pred_r_scaled)
r_rmse_scaled

0.7455789118982765

In [31]:
l_scaled = Lasso(alpha = 0.1)
l_scaled.fit(X_train_scaled, y_train)

y_pred_l_scaled = l_scaled.predict(X_test_scaled)
l_rmse_scaled = root_mean_squared_error(y_test, y_pred_l_scaled)
l_rmse_scaled

0.8243961598848472

# Pipeline

## Package Management

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso

## Pipeline Creation

In [33]:
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LinearRegression())
])
pipe_lr

0,1,2
,steps,"[('scaler', ...), ('lr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [34]:
pipe_ridge = Pipeline([
    ('scaler',StandardScaler()),
    ('ridge',Ridge(alpha=0.1))
])
pipe_ridge

0,1,2
,steps,"[('scaler', ...), ('ridge', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [35]:
pipe_lasso = Pipeline([
    ('scaler',StandardScaler()),
    ('lasso',Lasso(alpha = 0.1))
])
pipe_lasso

0,1,2
,steps,"[('scaler', ...), ('lasso', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


## Model Training

In [36]:
pipe_lr.fit(X_train, y_train)
pipe_ridge.fit(X_train, y_train)
pipe_lasso.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('lasso', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


## Prediction

In [39]:
lr_pred = pipe_lr.predict(X_test)
ridge_pred = pipe_ridge.predict(X_test)
lasso_pred = pipe_lasso.predict(X_test)

## Evaluation

In [40]:
rmse_lr = root_mean_squared_error(y_test, lr_pred)
rmse_ridge = root_mean_squared_error(y_test, ridge_pred)
rmse_lasso = root_mean_squared_error(y_test, lasso_pred)

rmse_lr, rmse_ridge, rmse_lasso

(0.7455813830127764, 0.7455789118982765, 0.8243961598848472)

# Saving Pipeline and Model

In [43]:
import joblib
joblib.dump(pipe_lr, "Models/lr_model.pkl")
joblib.dump(pipe_ridge, "Models/ridge_model.pkl")
joblib.dump(pipe_lasso, "Models/lasso_model.pkl")

['Models/lasso_model.pkl']