In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing pipeline: load -> shuffle -> log-transform -> standardize ->
# one-hot encode -> impute. Produces `data` (shuffled, log-transformed) and
# `scaled_data` (model-ready frame).
from sklearn.preprocessing import StandardScaler

# Fixed seed so the row order (and therefore every order-dependent result
# computed from this frame) is identical on each Restart & Run All.
RANDOM_STATE = 42

# import dataset
data = pd.read_csv("housing.csv")

# shuffle dataset (frac=1 draws every row exactly once, i.e. a permutation)
data = data.sample(frac=1, random_state=RANDOM_STATE)

# calculate ln(1 + x) of the count-like variables; log1p is the numerically
# accurate form of log(x + 1). NaNs stay NaN and are imputed further below.
log_columns = ['total_rooms', 'total_bedrooms', 'population', 'households']
for column in log_columns:
    data[column] = np.log1p(data[column])

# scale data using standard scaler
# Every column except the categorical one is standardized -- that includes
# any target column present in the frame.
# NOTE(review): the scaler is fit on all rows; if the data is later split into
# train/test folds, statistics of the held-out rows have already influenced
# the features. Confirm this is acceptable for the analysis.
columns_to_exclude = ['ocean_proximity']
numeric_columns = data.columns.difference(columns_to_exclude)
scaler = StandardScaler()
scaled_data = data.copy()
# Column replacement (rather than in-place .loc assignment) so the float
# result is stored correctly even if a source column has an integer dtype.
scaled_data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# one hot vector encoding
scaled_data = scaled_data.join(pd.get_dummies(scaled_data.ocean_proximity)).drop(['ocean_proximity'], axis=1)

# fill missing values with median
# Done after scaling: per the scikit-learn docs StandardScaler ignores NaNs
# when fitting and passes them through on transform, so the missing entries
# are still NaN here and receive the median of the scaled column.
scaled_data['total_bedrooms'] = scaled_data['total_bedrooms'].fillna(scaled_data['total_bedrooms'].median())


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
class LMS:
    """Linear model trained with the least-mean-squares (LMS) update rule.

    The model predicts ``dot(x, weights) + bias``. Training visits the samples
    one at a time and, for each sample, moves the weights and bias by
    ``learning_rate * (target - prediction)`` (scaled by the sample's feature
    values for the weights).

    Attributes:
        learning_rate: Step size applied to every per-sample update.
        n_iterations: Number of full passes over the training data.
        weights: 1-D array of feature weights; ``None`` until ``train`` runs.
        bias: Scalar intercept; ``None`` until ``train`` runs.
    """

    # initialization function for variables
    def __init__(self, learning_rate, n_iterations):
        """Store the hyper-parameters; the model is untrained until ``train``."""
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    # training function
    def train(self, X, y):
        """Fit weights and bias by per-sample LMS updates.

        Args:
            X: 2-D array-like of shape (n_samples, n_features). NumPy arrays,
                nested lists and pandas DataFrames are all accepted.
            y: 1-D array-like of n_samples targets (array, list or Series).

        Raises:
            ValueError: If ``X`` and ``y`` contain a different number of
                samples.
        """
        # Coerce to arrays so iteration yields sample rows. Iterating a pandas
        # DataFrame directly would yield column labels instead of rows.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if len(X) != len(y):
            # zip() would otherwise silently drop the unmatched samples.
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )

        # initialize weights and bias with 0
        self.weights = np.zeros(X.shape[1])
        self.bias = 0.0

        # update weights and bias
        for _ in range(self.n_iterations):
            for xi, target in zip(X, y):
                # The prediction uses the weights as updated by all previous
                # samples, so the result depends on the order of the rows.
                update = self.learning_rate * (target - self.predict(xi))
                self.weights += update * xi
                self.bias += update

    def predict(self, X):
        """Return ``dot(X, weights) + bias``.

        Args:
            X: A single sample (1-D) or a batch of samples (2-D), array-like.

        Returns:
            A scalar for a single sample, or a 1-D array with one prediction
            per row for a batch.
        """
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias



# 10-fold cross validation of the LMS model, reporting MAE and MSE on the
# training and held-out part of every fold, followed by the fold averages.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# per-fold scores, one entry appended per fold
mae_train_list, mse_train_list = [], []
mae_test_list, mse_test_list = [], []

# separate the target column from the feature columns
y = scaled_data['median_house_value']
X = scaled_data.drop(columns='median_house_value')

count = 1

for train_index, test_index in kf.split(X):
    print("Fold", count, ":")

    # select this fold's rows; features are cast to float64 because the
    # one-hot columns may be stored as bool
    X_train = X.iloc[train_index].astype('float64')
    X_test = X.iloc[test_index].astype('float64')
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # a fresh model is trained from scratch for every fold
    lms = LMS(learning_rate=0.002, n_iterations=2)
    lms.train(X_train.values, y_train)

    # predictions for both parts of the fold
    predictions_train = lms.predict(X_train.values)
    predictions_test = lms.predict(X_test.values)

    # errors on the rows the model was trained on
    mae_train = mean_absolute_error(y_train, predictions_train)
    mse_train = mean_squared_error(y_train, predictions_train)
    print("Mean Absolute Error for TRAIN set:", mae_train)
    print("Mean Sqared Error for TRAIN set:", mse_train)
    mae_train_list.append(mae_train)
    mse_train_list.append(mse_train)

    # errors on the held-out rows
    mae_test = mean_absolute_error(y_test, predictions_test)
    mse_test = mean_squared_error(y_test, predictions_test)
    print("Mean Absolute Error for TEST set:", mae_test)
    print("Mean Sqared Error for TEST set:", mse_test, "\n")
    mae_test_list.append(mae_test)
    mse_test_list.append(mse_test)

    count += 1

# print average values
summary_rows = (
    ("Average MAE for TRAIN set:", mae_train_list),
    ("Average MSE for TRAIN set:", mse_train_list),
    ("Average MAE for TEST set:", mae_test_list),
    ("Average MSE for TEST set:", mse_test_list),
)
for summary_label, fold_scores in summary_rows:
    print(summary_label, np.mean(fold_scores))

Fold 1 :
Mean Absolute Error for TRAIN set: 0.4142428567797834
Mean Sqared Error for TRAIN set: 0.33428760358352233
Mean Absolute Error for TEST set: 0.41919943382568237
Mean Sqared Error for TEST set: 0.33785171904446537 

Fold 2 :
Mean Absolute Error for TRAIN set: 0.4161687325395954
Mean Sqared Error for TRAIN set: 0.33755515737216624
Mean Absolute Error for TEST set: 0.3991429803423611
Mean Sqared Error for TEST set: 0.3133318269817109 

Fold 3 :
Mean Absolute Error for TRAIN set: 0.4140404835646354
Mean Sqared Error for TRAIN set: 0.3311817073795151
Mean Absolute Error for TEST set: 0.4289435252439454
Mean Sqared Error for TEST set: 0.3628263455109762 

Fold 4 :
Mean Absolute Error for TRAIN set: 0.4149671443052929
Mean Sqared Error for TRAIN set: 0.3362434365036737
Mean Absolute Error for TEST set: 0.4130280547585711
Mean Sqared Error for TEST set: 0.3407704079716866 

Fold 5 :
Mean Absolute Error for TRAIN set: 0.41579506364862906
Mean Sqared Error for TRAIN set: 0.3346422295648