In [23]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [24]:
data = pd.read_csv("housing.csv")

## Understanding the Data 

In [25]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [27]:
data.dropna(inplace=True)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


# Preprocessing the data

In [29]:
data['total_rooms'] = np.log(data['total_rooms']+1)
data['total_bedrooms'] = np.log(data['total_rooms']+1)
data['population'] = np.log(data['population']+1)
data['households'] = np.log(data['households']+1)

data = data.join(pd.get_dummies(data.ocean_proximity)).drop(['ocean_proximity'],axis=1)

data['bedroom_ratio'] = data['total_bedrooms']/data['total_rooms']
data['household_rooms'] = data['total_rooms']/data['households']

In [30]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,bedroom_ratio,household_rooms
0,-122.23,37.88,41.0,6.781058,2.051692,5.777652,4.844187,8.3252,452600.0,False,False,False,True,False,0.302562,1.399834
1,-122.22,37.86,21.0,8.867850,2.289282,7.784057,7.037906,8.3014,358500.0,False,False,False,True,False,0.258155,1.260013
2,-122.24,37.85,52.0,7.291656,2.115250,6.208590,5.181784,7.2574,352100.0,False,False,False,True,False,0.290092,1.407171
3,-122.25,37.85,52.0,7.150701,2.098104,6.326149,5.393628,5.6431,341300.0,False,False,False,True,False,0.293412,1.325768
4,-122.25,37.85,52.0,7.395108,2.127649,6.338594,5.560682,3.8462,342200.0,False,False,False,True,False,0.287710,1.329892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,7.418181,2.130394,6.740519,5.802118,1.5603,78100.0,False,True,False,False,False,0.287185,1.278530
20636,-121.21,39.49,18.0,6.548219,2.021312,5.877736,4.744932,2.5568,77100.0,False,True,False,False,False,0.308681,1.380045
20637,-121.22,39.43,17.0,7.720905,2.165723,6.915723,6.073045,1.7000,92300.0,False,True,False,False,False,0.280501,1.271340
20638,-121.32,39.43,18.0,7.528869,2.143457,6.609349,5.857933,1.8672,84700.0,False,True,False,False,False,0.284698,1.285243


## Splitting the Data 

In [31]:
from sklearn.model_selection import train_test_split 

X = data.drop(['median_house_value'], axis = 1) # since dropping a valuse axis =1 
y = data['median_house_value']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2) 

## Applying Linear Regression 

In [33]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train,y_train)

In [34]:
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,bedroom_ratio,household_rooms
11644,-118.04,33.81,22.0,8.308446,2.230922,7.698483,6.467699,5.8527,True,False,False,False,False,0.268513,1.284606
18318,-122.11,37.44,35.0,7.609367,2.152851,6.931472,5.932245,5.6413,False,False,False,True,False,0.282921,1.282713
11075,-117.85,33.81,26.0,8.339740,2.234278,7.803027,6.656727,4.9917,True,False,False,False,False,0.267907,1.252829
7691,-118.11,33.95,34.0,7.749322,2.168976,6.848005,5.877736,6.4319,True,False,False,False,False,0.279892,1.318420
13540,-117.30,34.14,39.0,7.485492,2.138358,6.735780,5.771441,1.9432,False,True,False,False,False,0.285667,1.296988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18178,-122.00,37.36,25.0,8.170469,2.215988,7.539559,6.745236,3.4238,True,False,False,False,False,0.271219,1.211295
5451,-118.46,34.01,43.0,6.242223,1.979928,5.587249,4.644391,5.6428,True,False,False,False,False,0.317183,1.344035
4748,-118.35,34.05,52.0,7.586804,2.150227,6.971669,6.016157,3.6435,True,False,False,False,False,0.283417,1.261071
3016,-118.93,34.82,8.0,6.232448,1.978578,5.438079,4.442651,4.0332,False,True,False,False,False,0.317464,1.402867


In [35]:
y_pred = reg.predict(X_test)

In [36]:
reg.score(X_test,y_test) 

0.6663630226786089