# Linear Regression

## Ashwin Shammy Mathew

In [45]:
import pandas as pd     #importing libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
# NOTE(review): with no category/module arguments this suppresses every warning
# raised after this point, including deprecation warnings from the libraries
# imported above; consider narrowing the filter.
warnings.filterwarnings('ignore')

### Q1

In [2]:
# Download/load the California Housing bunch and wrap its feature matrix
# (dataset.data) in a DataFrame labelled with the bundled feature names.
dataset = fetch_california_housing()
data = pd.DataFrame(data=dataset.data, columns=list(dataset.feature_names))
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
# Dimensions of the feature frame as a (rows, columns) tuple.
data.shape

(20640, 8)

In [4]:
# Per-column dtype and non-null count, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [6]:
# Count missing entries per column; all zeros means nothing needs imputing.
data.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [20]:
# Separate the regression target (MedInc) from the remaining seven columns,
# which become the model inputs.
# NOTE(review): MedInc is one of the dataset's own feature columns - confirm
# that it is the intended prediction target.
y = data['MedInc']
x = data.drop(columns='MedInc')

In [22]:
# Min-max scale every feature column into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down; fitting it on the training rows only would keep
# test-set minima/maxima out of the preprocessing step.
normalisation = MinMaxScaler()

# fit_transform returns a bare NumPy array, so rebuild the frame with the
# original column labels and index; otherwise the features are shown as
# anonymous 0..6 and can no longer be identified downstream.
x = pd.DataFrame(normalisation.fit_transform(x), columns=x.columns, index=x.index)
x.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,0.541951,0.032488,0.022629,0.039869,0.001914,0.328572,0.476125
std,0.246776,0.017539,0.014049,0.03174,0.008358,0.226988,0.199555
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.333333,0.025482,0.019943,0.021974,0.001398,0.147715,0.253984
50%,0.54902,0.031071,0.021209,0.032596,0.001711,0.182784,0.583665
75%,0.705882,0.036907,0.022713,0.048264,0.002084,0.549416,0.631474
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The California Housing dataset does not contain any null or missing values; hence, no missing-value handling is required.

Encoding was not required because the California Housing dataset contains only numerical features and does not include any categorical variables. Therefore, all columns are already suitable for machine learning algorithms without additional transformation.

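Although the dataset contains some outliers, they were not removed. Instead, min-max normalisation (`MinMaxScaler`) was applied, which rescales every feature to the [0, 1] range. Note that min-max scaling does not reduce the influence of extreme values: the outliers define the range, so most observations are compressed into a narrow band (visible in the very small means and quartiles above).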

### Q2 and Q3

In [24]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [25]:
# Ordinary least-squares linear regression, fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=x_train, y=y_train)

In [26]:
# Linear-regression predictions for the held-out rows; echo the array.
y_pred = lr_model.predict(X=x_test)
y_pred

array([2.3460855 , 3.2357584 , 1.9983766 , ..., 4.38837594, 4.85350324,
       2.83417634])

In [29]:
# Evaluation metrics (MAE, MSE, R^2) of the linear regression on the test split.
mae, mse, r2 = [
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
]

In [30]:
# Mean absolute error from the preceding metrics cell (linear regression).
mae

0.7980739765409184

In [31]:
# Mean squared error from the preceding metrics cell (linear regression).
mse

1.4572951896262405

In [32]:
# R^2 score from the preceding metrics cell (linear regression).
r2

0.5933626198838573

In [35]:
# Decision tree regressor fitted on the training split.
# The seed is fixed (same value as the train/test split and the random forest
# in this notebook) so that re-running the notebook rebuilds the same tree and
# reproduces the reported metrics; without it the tree can differ between runs.
dt = DecisionTreeRegressor(random_state = 42)     #decision tree
dt.fit(x_train, y_train)

In [40]:
# Decision-tree predictions for the held-out rows; echo the array.
dt_pred = dt.predict(X=x_test)
dt_pred

array([1.1728, 3.5391, 4.3986, ..., 5.2447, 4.4063, 2.105 ])

In [41]:
# Evaluation metrics (MAE, MSE, R^2) of the decision tree on the test split.
# These reuse the names mae/mse/r2, replacing the previously stored values.
mae, mse, r2 = [
    metric(y_test, dt_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
]

In [42]:
# Mean absolute error from the preceding metrics cell (decision tree).
mae

0.8723844685848503

In [43]:
# Mean squared error from the preceding metrics cell (decision tree).
mse

1.613125319468585

In [44]:
# R^2 score from the preceding metrics cell (decision tree).
r2

0.549880450867365

In [46]:
# Random forest of 200 trees; the fixed seed makes the ensemble reproducible.
rf = RandomForestRegressor(
    random_state=42,
    n_estimators=200,
)

In [48]:
# Train the random forest on the training split.
rf.fit(X=x_train, y=y_train)

In [49]:
# Random-forest predictions for the held-out rows.
rf_pred = rf.predict(X=x_test)

In [50]:
# Evaluation metrics (MSE, MAE, R^2) of the random forest on the test split.
# These reuse the names mse/mae/r2, replacing the previously stored values.
mse, mae, r2 = [
    metric(y_test, rf_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
]

In [51]:
# Mean squared error from the preceding metrics cell (random forest).
mse

0.764349945074277

In [52]:
# Mean absolute error from the preceding metrics cell (random forest).
mae

0.5915290505725191

In [53]:
# R^2 score from the preceding metrics cell (random forest).
r2

0.7867190797242408

The best-performing model is the Random Forest: it achieves the lowest MSE (≈ 0.76) and MAE (≈ 0.59) of the three models, and the highest R² score (≈ 0.79).

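The worst-performing model is the Decision Tree: in this run it has the highest MSE (≈ 1.61) and MAE (≈ 0.87) of the three models, and the lowest R² score (≈ 0.55), slightly behind Linear Regression (MSE ≈ 1.46, MAE ≈ 0.80, R² ≈ 0.59).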