In [38]:
import numpy as np
import pandas as pd
# Load the California housing training set; the relative path means the CSV
# must sit in the notebook's working directory.
df = pd.read_csv('california_housing_train.csv')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [39]:
# Reduce the frame to total_rooms / population / households and re-attach the
# bedroom counts under the shorter name `bedrooms` as the last column.
bedrooms = df['total_bedrooms']
df = df.drop(
    columns=[
        'longitude',
        'latitude',
        'median_income',
        'housing_median_age',
        'median_house_value',
        'total_bedrooms',
    ]
).assign(bedrooms=bedrooms)
df

Unnamed: 0,total_rooms,population,households,bedrooms
0,5612.0,1015.0,472.0,1283.0
1,7650.0,1129.0,463.0,1901.0
2,720.0,333.0,117.0,174.0
3,1501.0,515.0,226.0,337.0
4,1454.0,624.0,262.0,326.0
...,...,...,...,...
16995,2217.0,907.0,369.0,394.0
16996,2349.0,1194.0,465.0,528.0
16997,2677.0,1244.0,456.0,531.0
16998,2672.0,1298.0,478.0,552.0


In [40]:
# Convert the reduced frame to a NumPy array so columns can be sliced by position.
df_np = df.to_numpy()
df_np.shape

(17000, 4)

In [41]:
# The first three columns are the inputs; the remaining column is the target.
# Slicing with `3:` (rather than indexing with `3`) keeps the target 2-D.
x_train = df_np[:, :3]
y_train = df_np[:, 3:]
x_train.shape, y_train.shape

((17000, 3), (17000, 1))

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [43]:
# Fit scikit-learn's LinearRegression with x_train as inputs and y_train as the target.
model = LinearRegression()
model.fit(x_train, y_train)


In [44]:
# Show the target values so they can be compared with the predictions.
y_train

array([[1283.],
       [1901.],
       [ 174.],
       ...,
       [ 531.],
       [ 552.],
       [ 300.]])

In [45]:
# Predict on the same rows the model was fitted on (no held-out data is used here).
predictions = model.predict(x_train)
predictions


array([[631.82986004],
       [693.11956219],
       [130.849378  ],
       ...,
       [502.56126019],
       [522.07260277],
       [304.4499167 ]])

In [46]:
# Errors of the scikit-learn fit on the training rows, shown as (MAE, MSE).
absolute_error = mean_absolute_error(y_train, predictions)
squared_error = mean_squared_error(y_train, predictions)
absolute_error, squared_error

(39.672217650706656, 5650.631822167879)

In [47]:
def make_predictions(df, model):
    """Apply linear-model weights to a 2-D feature matrix.

    A column of ones is placed in front of the features, so ``model[0]`` acts
    as the intercept and the remaining entries multiply the feature columns
    in order.

    Parameters
    ----------
    df : 2-D array-like exposing ``.shape``
        Feature matrix with one row per sample.
    model : array-like
        Weights with one more entry (or row) than ``df`` has columns.

    Returns
    -------
    numpy.ndarray
        The dot product of the ones-augmented feature matrix with ``model``.
    """
    n_samples, n_features = df.shape

    # Build the augmented matrix: intercept column first, features after it.
    design = np.empty((n_samples, n_features + 1))
    design[:, 0] = 1.0
    design[:, 1:] = df

    return design.dot(model)


Predicting with arbitrary (not fitted) weights and checking the errors

In [48]:
# Hand-picked weights, ordered [intercept, w1, w2, w3] to line up with the
# ones column that make_predictions puts in front of the features.
test_model = np.array([1, 1/4, 1/3, 1.2])
print(test_model)
random_predictions= make_predictions(x_train, test_model)
random_predictions

[1.         0.25       0.33333333 1.2       ]


array([2308.73333333, 2845.43333333,  432.4       , ..., 1632.11666667,
       1675.26666667, 1048.66666667])

In [49]:
# Same two metrics for the hand-picked weights, shown as (MAE, MSE).
# Note: this overwrites the values computed for the scikit-learn fit.
absolute_error = mean_absolute_error(y_train, random_predictions)
squared_error = mean_squared_error(y_train, random_predictions)
absolute_error, squared_error

(1200.4964323529412, 2324808.2680397546)

In [50]:
# Collect every set of predictions next to the true values for a row-by-row
# comparison.
# Work on a copy: plain assignment would only alias `df`, so the columns
# added here would also show up in `df` and change what `df.to_numpy()` and
# the positional feature/target split return if those cells are re-run.
predictions_dataframe = df.copy()
predictions_dataframe['random_predictions'] = random_predictions
predictions_dataframe['sklearn_predictions'] = predictions
predictions_dataframe['original_values'] = y_train
predictions_dataframe

Unnamed: 0,total_rooms,population,households,bedrooms,random_predictions,sklearn_predictions,original_values
0,5612.0,1015.0,472.0,1283.0,2308.733333,631.829860,1283.0
1,7650.0,1129.0,463.0,1901.0,2845.433333,693.119562,1901.0
2,720.0,333.0,117.0,174.0,432.400000,130.849378,174.0
3,1501.0,515.0,226.0,337.0,819.116667,259.462982,337.0
4,1454.0,624.0,262.0,326.0,886.900000,289.315311,326.0
...,...,...,...,...,...,...,...
16995,2217.0,907.0,369.0,394.0,1300.383333,412.047168,394.0
16996,2349.0,1194.0,465.0,528.0,1544.250000,501.086730,528.0
16997,2677.0,1244.0,456.0,531.0,1632.116667,502.561260,531.0
16998,2672.0,1298.0,478.0,552.0,1675.266667,522.072603,552.0


In [51]:
def get_best_model(x, y):
    """Compute ordinary least-squares weights for a linear model with intercept.

    A column of ones is placed in front of ``x``, so the first returned weight
    is the intercept and the remaining weights line up with the columns of
    ``x``. The result can be passed straight to ``make_predictions``.

    Parameters
    ----------
    x : 2-D array-like exposing ``.shape``, shape (n, p - 1)
        Feature matrix with one row per sample.
    y : array-like, shape (n,) or (n, k)
        Target values.

    Returns
    -------
    numpy.ndarray
        Weights of shape (p,) when ``y`` is 1-D, or (p, k) when ``y`` is 2-D.
    """
    n, n_features = x.shape

    # Design matrix: intercept column of ones followed by the features.
    new_X = np.ones(shape=(n, n_features + 1))
    new_X[:, 1:] = x

    # Solve the least-squares problem directly rather than forming
    # inv(X.T @ X). The explicit inverse squares the condition number of the
    # problem and breaks down when feature columns are collinear; lstsq stays
    # stable and returns the minimum-norm solution in that case.
    weights, _residuals, _rank, _singular_values = np.linalg.lstsq(
        new_X, y, rcond=None
    )
    return weights

In [52]:
# Weights from get_best_model; the first entry is the intercept, the other
# three correspond to the columns of x_train.
best_model = get_best_model(x_train,y_train)
best_model

array([[ 1.5496502 ],
       [ 0.03619529],
       [-0.03248088],
       [ 0.9748312 ]])

In [53]:
# Predict on the training rows with the weights returned by get_best_model.
best_predictions = make_predictions(x_train, best_model)
best_predictions.shape

(17000, 1)

In [54]:
# Add the predictions from get_best_model's weights so they can be compared
# row by row with the scikit-learn predictions.
predictions_dataframe['best_predictions'] = best_predictions
predictions_dataframe

Unnamed: 0,total_rooms,population,households,bedrooms,random_predictions,sklearn_predictions,original_values,best_predictions
0,5612.0,1015.0,472.0,1283.0,2308.733333,631.829860,1283.0,631.829860
1,7650.0,1129.0,463.0,1901.0,2845.433333,693.119562,1901.0,693.119562
2,720.0,333.0,117.0,174.0,432.400000,130.849378,174.0,130.849378
3,1501.0,515.0,226.0,337.0,819.116667,259.462982,337.0,259.462982
4,1454.0,624.0,262.0,326.0,886.900000,289.315311,326.0,289.315311
...,...,...,...,...,...,...,...,...
16995,2217.0,907.0,369.0,394.0,1300.383333,412.047168,394.0,412.047168
16996,2349.0,1194.0,465.0,528.0,1544.250000,501.086730,528.0,501.086730
16997,2677.0,1244.0,456.0,531.0,1632.116667,502.561260,531.0,502.561260
16998,2672.0,1298.0,478.0,552.0,1675.266667,522.072603,552.0,522.072603
