In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

class MultiLinearRegression:
    """Ordinary least-squares linear regression fitted on a CSV dataset.

    ``load_data`` runs the whole preprocessing pipeline: one-hot encode the
    categorical columns, coerce every column to numeric and drop incomplete
    rows, remove IQR outliers from the numerical columns, ``log1p``-transform
    the target, drop features whose variance inflation factor exceeds
    ``VIF_THRESHOLD``, split into train/test sets and standardise the
    features.  ``train_model`` then fits the intercept and coefficients by
    least squares.

    Because the target is ``log1p``-transformed, the error returned by
    ``evaluate_model`` and (by default) the values returned by ``predict``
    are expressed in log space.
    """

    # Column layout used when the caller does not supply its own.
    DEFAULT_CATEGORICAL_COLUMNS = (
        'mainroad', 'guestroom', 'basement', 'hotwaterheating',
        'airconditioning', 'prefarea', 'furnishingstatus',
    )
    DEFAULT_NUMERICAL_COLUMNS = ('area', 'bedrooms', 'bathrooms', 'stories', 'parking')

    # Features with a VIF strictly above this value are dropped in load_data().
    VIF_THRESHOLD = 10

    def __init__(self, csv_file, target_column, categorical_columns=None, numerical_columns=None):
        """Store the configuration; no file is read until ``load_data``.

        Args:
            csv_file: Path (or anything ``pandas.read_csv`` accepts) of the dataset.
            target_column: Name of the column to predict.
            categorical_columns: Columns to one-hot encode.  Defaults to
                ``DEFAULT_CATEGORICAL_COLUMNS``.
            numerical_columns: Columns screened for IQR outliers.  Defaults to
                ``DEFAULT_NUMERICAL_COLUMNS``.
        """
        self.csv_file = csv_file
        self.target_column = target_column
        self.categorical_columns = list(
            self.DEFAULT_CATEGORICAL_COLUMNS if categorical_columns is None else categorical_columns
        )
        self.numerical_columns = list(
            self.DEFAULT_NUMERICAL_COLUMNS if numerical_columns is None else numerical_columns
        )
        self.data = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        # Names of the columns that survived VIF filtering, in model order.
        self.feature_names = None
        self.coefficients = None
        self.intercept = None
        self.scaler = StandardScaler()

    @staticmethod
    def _with_intercept(X: np.ndarray) -> np.ndarray:
        """Return ``X`` with a leading column of ones for the intercept term."""
        return np.c_[np.ones((X.shape[0], 1)), X]

    def remove_outliers(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Return the rows of ``df`` whose ``column`` lies within 1.5 * IQR.

        Rows are kept when the value is inside
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive).  When the IQR
        is zero only rows equal to the quartile value are kept.
        """
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

    def calculate_vif(self, X: pd.DataFrame) -> pd.DataFrame:
        """Compute the variance inflation factor of each numeric column of ``X``.

        Returns:
            A DataFrame with columns ``"Feature"`` and ``"VIF"``, one row per
            numeric-dtype column of ``X``.
        """
        # Only numeric dtypes are screened.  NOTE(review): depending on the
        # pandas version, get_dummies may emit bool columns, which this
        # selection skips - confirm that is the intended behaviour.
        X_numeric = X.select_dtypes(include=[np.number])
        values = X_numeric.to_numpy(dtype=float)
        # NOTE(review): no constant column is added before computing the VIF,
        # so each auxiliary regression is presumably fitted without an
        # intercept - confirm against the statsmodels documentation.
        vif_data = pd.DataFrame()
        vif_data["Feature"] = X_numeric.columns
        vif_data["VIF"] = [variance_inflation_factor(values, i) for i in range(values.shape[1])]
        return vif_data

    def load_data(self):
        """Read the CSV, preprocess it and populate the train/test splits.

        Side effects: sets ``data``, ``feature_names``, ``X_train``, ``X_test``,
        ``y_train``, ``y_test``, refits ``scaler`` and clears any previously
        trained coefficients.

        Raises:
            ValueError: If no rows or no feature columns remain after
                preprocessing.
        """
        data = pd.read_csv(self.csv_file)
        if self.categorical_columns:
            data = pd.get_dummies(data, columns=self.categorical_columns, drop_first=True)
        # Anything that cannot be parsed as a number becomes NaN and the row is dropped.
        data = data.apply(pd.to_numeric, errors='coerce').dropna()

        for col in self.numerical_columns:
            data = self.remove_outliers(data, col)

        if data.empty:
            raise ValueError("No valid data found after preprocessing. Please check the dataset.")

        # Work on an explicit copy so the assignment below never writes into a
        # view of the filtered frame.
        data = data.copy()
        data[self.target_column] = np.log1p(data[self.target_column])
        self.data = data

        features = data.drop(columns=[self.target_column])
        vif = self.calculate_vif(features)
        high_vif_features = vif.loc[vif["VIF"] > self.VIF_THRESHOLD, "Feature"].tolist()
        features = features.drop(columns=high_vif_features, errors='ignore')
        if features.shape[1] == 0:
            raise ValueError("No feature columns left after VIF filtering. Please check the dataset.")
        self.feature_names = list(features.columns)

        X = features.to_numpy(dtype=float)
        y = data[self.target_column].to_numpy(dtype=float)

        X_train, X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Fit the scaler on the training rows only so that test-set statistics
        # do not leak into the model; the test rows are merely transformed.
        self.X_train = self.scaler.fit_transform(X_train)
        self.X_test = self.scaler.transform(X_test)

        # Coefficients from an earlier fit no longer match the new features.
        self.coefficients = None
        self.intercept = None
        print(f"Data loaded successfully! Training samples: {len(self.X_train)}, Testing samples: {len(self.X_test)}")

    def train_model(self):
        """Fit the intercept and coefficients on the training split.

        Raises:
            ValueError: If ``load_data`` has not been called.
        """
        if self.X_train is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        self.X_train = np.asarray(self.X_train, dtype=float)
        self.y_train = np.asarray(self.y_train, dtype=float)
        X_b = self._with_intercept(self.X_train)
        # lstsq returns the same minimum-norm solution as pinv(X^T X) X^T y
        # without explicitly forming X^T X, which squares the condition number.
        theta, *_ = np.linalg.lstsq(X_b, self.y_train, rcond=None)
        self.intercept = theta[0]
        self.coefficients = theta[1:]
        print("Model trained successfully!")

    def evaluate_model(self):
        """Return the mean squared error on the test split.

        The error is measured between ``y_test`` and the model output as
        stored, i.e. in ``log1p`` space when the splits come from ``load_data``.

        Raises:
            ValueError: If the data has not been loaded or the model has not
                been trained.
        """
        if self.X_test is None or self.coefficients is None:
            raise ValueError("Model not trained. Call train_model() first.")
        X_b_test = self._with_intercept(np.asarray(self.X_test, dtype=float))
        predictions = X_b_test.dot(np.r_[self.intercept, self.coefficients])
        mse = np.mean((self.y_test - predictions) ** 2)
        return mse

    def predict(self, input_data, *, original_scale=False):
        """Predict the target for one or more unscaled feature rows.

        Args:
            input_data: A single row (1-D sequence) or several rows (2-D
                array-like) holding the features that survived preprocessing,
                in ``feature_names`` order.  A DataFrame containing all of
                ``feature_names`` is reordered/subset automatically.
            original_scale: When true, invert the ``log1p`` target transform
                with ``expm1``.  The default keeps the log-space output.

        Returns:
            A 1-D array with one prediction per input row.

        Raises:
            ValueError: If the model is not trained or the number of features
                per row does not match the fitted model.
        """
        if self.coefficients is None:
            raise ValueError("Model not trained. Call train_model() first.")
        if (
            isinstance(input_data, pd.DataFrame)
            and self.feature_names is not None
            and set(self.feature_names).issubset(input_data.columns)
        ):
            input_data = input_data[self.feature_names]
        input_data = np.asarray(input_data, dtype=float)
        if input_data.ndim == 1:
            input_data = input_data.reshape(1, -1)
        expected = len(self.coefficients)
        if input_data.ndim != 2 or input_data.shape[1] != expected:
            raise ValueError(
                f"Expected {expected} feature(s) per row, got input of shape {input_data.shape}."
            )
        input_data = self.scaler.transform(input_data)
        X_b = self._with_intercept(input_data)
        predictions = X_b.dot(np.r_[self.intercept, self.coefficients])
        return np.expm1(predictions) if original_scale else predictions

# Driver: fit the regression on the housing CSV with "price" as the target and
# report the test-set error.  The steps must run in this order: load_data()
# builds the train/test splits that train_model() and evaluate_model() consume.
mlr = MultiLinearRegression("./dataset/Housing.csv", "price")
mlr.load_data()
mlr.train_model()
# load_data() applies log1p to the target, so this MSE is in log space,
# not in squared price units.
mse = mlr.evaluate_model()
print(f"Mean Squared Error: {mse}")


Data loaded successfully! Training samples: 377, Testing samples: 95
Model trained successfully!
Mean Squared Error: 0.05929312001426214
