In [5]:
# Importing important libraries
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [6]:
data = pd.read_csv('housing_data.csv')
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
#Check the unique values in the ocean_proximity column to code it
data.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [8]:
#Coding non-numeric variables
codes = [1,2,3,4,5]
names = ["NEAR BAY", "<1H OCEAN", "INLAND", "NEAR OCEAN","ISLAND"]
data['ocean_proximity1'] = data['ocean_proximity'].replace(names, codes)

#select all columns except 'ocean_proximity'
data = data.loc[:, data.columns!='ocean_proximity']

In [9]:
#Check if there are missing values
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity1        0
dtype: int64

In [10]:
#Dealing with missing data -- Using Imputation (Mean as a strategy)
from sklearn.impute import SimpleImputer
train_most_frequent = data.copy()
#setting strategy to 'mean' to impute by the mean
mean_imputer = SimpleImputer(strategy='mean')# strategy can also be mean or median 
train_most_frequent.iloc[:,:] = mean_imputer.fit_transform(train_most_frequent)
data = train_most_frequent.iloc[:,:]

#check all missing values have been dealt with 
data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity1      0
dtype: int64

In [11]:
#Splitting data into train and test data

#Using the whole data set
#train, test = train_test_split(data, test_size=0.2, random_state=42)

#Using the X,Y method (separating X and y first)
X = data[['housing_median_age','total_rooms','total_bedrooms', 'population','households','median_income']]
y = data[["median_house_value"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
#Running the model
model = LinearRegression().fit(X_train, y_train)

#Assessing the model
model.score(X_train, y_train)
#model.score(X_test, y_test)

0.5700818543102877

In [13]:
predict = model.predict(X_test)
actual = y_test.values[0]

print('Predicted Value :',predict[0])
print('Actual Value :', actual)

Predicted Value : [100859.76709752]
Actual Value : [47700.]


### Using statsmodel 

In [14]:
# Import models and utility functions
import statsmodels.api as sm 

In [15]:
model = sm.OLS(y_train, X_train).fit()

print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.896
Model:                            OLS   Adj. R-squared (uncentered):              0.896
Method:                 Least Squares   F-statistic:                          2.361e+04
Date:                Mon, 07 Feb 2022   Prob (F-statistic):                        0.00
Time:                        23:54:24   Log-Likelihood:                     -2.0914e+05
No. Observations:               16512   AIC:                                  4.183e+05
Df Residuals:                   16506   BIC:                                  4.183e+05
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------