<a href="https://colab.research.google.com/github/mephist0isaloser/Gamma-Z-Hostel-Mess-Management-Server/blob/main/real_estate_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Real Estate Price Prediction***

# Import and intro

In [1]:
import pandas as pd

## Loading the dataset

In [3]:
# Load the raw dataset into a DataFrame.
# NOTE(review): the path is absolute ('/content/...'), so this cell only works
# where the CSV sits at exactly that location — presumably a Colab session,
# given the "Open In Colab" badge at the top; confirm before running elsewhere.
df = pd.read_csv('/content/Melbourne_housing_FULL.csv')


## Splitting the data into X and Y

In [4]:
# Target vector: the 'Price' column. Feature frame: every remaining column.
Y = df['Price']
X = df.drop(columns='Price')

# Splitting data into numerical and categorical

In [5]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, float/int columns as numerical. Columns of any other dtype end
# up in neither frame.
X_categorical = X.select_dtypes(include=['object'])
X_numerical = X.select_dtypes(include=['float', 'int'])


## Handling the categorical data

In [6]:
from sklearn.preprocessing import OrdinalEncoder

import numpy as np

# Replace every categorical column with the numeric codes produced by
# OrdinalEncoder.
# NOTE(review): the encoder is fitted on the full feature frame, i.e. before
# the train/test split performed further down — confirm this is intended.
encoder = OrdinalEncoder()
X_categorical_encoded = encoder.fit_transform(X_categorical)

# Build one feature matrix: numerical columns first, encoded categorical
# columns after them.
X_combined = np.concatenate([X_numerical, X_categorical_encoded], axis=1)


### *Handling the NaN values for X*

In [7]:
from sklearn.impute import SimpleImputer

# Fill the missing entries of the combined feature matrix column by column.
# 'mean' is SimpleImputer's default strategy; it is only spelled out here.
imputer = SimpleImputer(strategy='mean')
X_combined_imputed = imputer.fit_transform(X_combined)

### Handling the NaN for Y

In [8]:
import numpy as np
from sklearn.impute import SimpleImputer

# SimpleImputer only accepts 2-D input, so view the target as a single column.
Y_reshaped = Y.values.reshape(-1, 1)

# Replace missing target values with the mean of the observed ones
# ('mean' is SimpleImputer's default strategy, written out explicitly).
# NOTE(review): rows whose target was filled in this way are later used for
# both training and evaluation — consider dropping them from X and Y instead.
imputer = SimpleImputer(strategy='mean')
Y_imputed = imputer.fit_transform(Y_reshaped)

# Collapse the single-column result back to one dimension.
Y_imputed_1d = Y_imputed.squeeze()


## Test and Train split


In [9]:
import numpy as np

def train_test_split_custom(X, y, test_size=0.2, random_state=None):
    """Shuffle ``X`` and ``y`` together and split them into train/test parts.

    Parameters
    ----------
    X : array
        Samples along the first axis; must support indexing with an integer
        index array (e.g. a NumPy array).
    y : array
        Targets, one per sample in ``X``, indexable the same way.
    test_size : float, default 0.2
        Fraction of the samples placed in the test part; must lie in [0, 1].
    random_state : int or None, default None
        Seed for the shuffle. With an integer the split is reproducible and
        the global NumPy random generator is left untouched. With ``None`` the
        global NumPy random generator is used (and advanced).

    Returns
    -------
    X_train, X_test, y_train, y_test
        The first ``int(len(X) * (1 - test_size))`` shuffled samples form the
        training part, the rest the test part.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # A private RandomState seeded with `random_state` produces the same
    # permutation as `np.random.seed(random_state)` + `np.random.shuffle`,
    # but does not reseed the process-wide generator as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Shuffle the sample indices rather than the data itself so that X and y
    # stay aligned.
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # Everything before the split index is training data, the rest is test data.
    split_index = int(n_samples * (1 - test_size))
    train_idx = indices[:split_index]
    test_idx = indices[split_index:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Hold out 20% of the imputed samples for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split_custom(
    X_combined_imputed,
    Y_imputed_1d,
    test_size=0.2,
    random_state=100,
)

# **LinearRegression**

## Training and prediction

In [10]:
import numpy as np

def linear_regression(X, y):
    """Fit ordinary least squares and return the coefficient vector.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix without an intercept column.
    y : array of shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    theta : ndarray
        Fitted coefficients; ``theta[0]`` is the intercept, followed by one
        coefficient per column of ``X``.
    """
    # Add a column of ones to X for the intercept term
    X = np.column_stack((np.ones((len(X), 1)), X))

    # Solve the least-squares problem directly instead of inverting X.T @ X:
    # the explicit inverse fails (or returns meaningless values) when features
    # are collinear, whereas lstsq returns the minimum-norm solution and
    # matches the normal-equation result whenever X has full column rank.
    theta, *_ = np.linalg.lstsq(X, y, rcond=None)

    return theta

def predict(X, theta):
    """Evaluate the linear model ``theta`` on the samples in ``X``.

    ``X`` is the feature matrix without an intercept column; a leading column
    of ones is prepended here so that ``theta[0]`` acts as the intercept.
    Returns the matrix product of that augmented matrix with ``theta``.
    """
    intercept_column = np.ones((len(X), 1))
    design_matrix = np.column_stack((intercept_column, X))
    return design_matrix @ theta

# Fit the coefficients on the training split only.
theta = linear_regression(X_train, Y_train)

# Predict on both splits so training and test error can be compared.
Y_train_pred = predict(X_train, theta)
Y_test_pred = predict(X_test, theta)


## Evaluate the model

In [11]:
import numpy as np

def mean_squared_error_custom(y_true, y_pred):
    """Return the mean of the squared differences between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    return np.mean(residuals ** 2)

def r2_score_custom(y_true, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - SSR / SST``, where SSR is the sum of squared residuals
    and SST the sum of squared deviations of ``y_true`` from its mean.

    Edge cases
    ----------
    * Constant ``y_true`` (SST would be zero): returns 1.0 when the
      predictions are exact and 0.0 otherwise, instead of dividing by zero.
    * Empty input: R^2 is undefined, so NaN is returned.
    """
    if np.size(y_true) == 0:
        return np.nan

    ssr = np.sum((y_true - y_pred) ** 2)

    # Detect a constant target via min/max rather than `sst == 0`: rounding in
    # the mean can leave a tiny non-zero SST that would blow up the ratio.
    if np.min(y_true) == np.max(y_true):
        return 1.0 if ssr == 0 else 0.0

    sst = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ssr / sst)

def mean_absolute_error_custom(y_true, y_pred):
    """Return the mean of the absolute differences between ``y_true`` and ``y_pred``."""
    absolute_errors = np.abs(y_true - y_pred)
    return np.mean(absolute_errors)



# Score the linear model on both splits, one metric at a time, so the
# training and test values of each metric sit next to each other.

# Mean squared error
lr_train_mse = mean_squared_error_custom(Y_train, Y_train_pred)
lr_test_mse = mean_squared_error_custom(Y_test, Y_test_pred)

# Coefficient of determination
lr_train_r2 = r2_score_custom(Y_train, Y_train_pred)
lr_test_r2 = r2_score_custom(Y_test, Y_test_pred)

# Mean absolute error
lr_train_mae = mean_absolute_error_custom(Y_train, Y_train_pred)
lr_test_mae = mean_absolute_error_custom(Y_test, Y_test_pred)

In [12]:
# Collect the metrics in a one-row summary table. Building the frame from a
# record (column name -> value) keeps each metric column numeric; creating it
# from a mixed list and transposing would make every column `object` dtype.
lr_results = pd.DataFrame([{
    'Method': 'Linear regression',
    'Training MSE': lr_train_mse,
    'Training R2': lr_train_r2,
    'Training MAE': lr_train_mae,
    'Test MSE': lr_test_mse,
    'Test R2': lr_test_r2,
    'Test MAE': lr_test_mae,
}])

lr_results

Unnamed: 0,Method,Training MSE,Training R2,Training MAE,Test MSE,Test R2,Test MAE
0,Linear regression,187919374452.3128,0.415277,288941.731698,211937309665.69827,0.375671,289627.508921


In [13]:
# Display the held-out feature matrix together with the predictions the
# linear model produced for it.
X_test, Y_test_pred

(array([[4.000e+00, 1.790e+01, 3.082e+03, ..., 3.200e+01, 2.900e+01,
         2.000e+00],
        [4.000e+00, 7.900e+00, 3.079e+03, ..., 4.600e+01, 0.000e+00,
         0.000e+00],
        [5.000e+00, 4.600e+00, 3.142e+03, ..., 1.900e+01, 2.700e+01,
         5.000e+00],
        ...,
        [4.000e+00, 6.400e+00, 3.078e+03, ..., 2.200e+01, 6.000e+00,
         2.000e+00],
        [3.000e+00, 9.100e+00, 3.015e+03, ..., 8.000e+00, 1.000e+01,
         6.000e+00],
        [3.000e+00, 7.900e+00, 3.079e+03, ..., 3.700e+01, 0.000e+00,
         0.000e+00]]),
 array([ 919573.72142341, 1533027.91645931, 1714433.40855458, ...,
        1505875.62403648,  949384.44623779, 1329394.3119783 ]))

In [None]:
import pandas as pd

# Sample array
# Export the held-out feature matrix to a CSV file.
data = X_test

# Wrap the array in a DataFrame under its own name: reusing `df` here would
# overwrite the raw dataset loaded at the top of the notebook, so re-running
# any earlier cell that reads `df` would silently operate on the wrong data.
X_test_df = pd.DataFrame(data)

# Define the CSV file path (relative to the current working directory)
csv_file = 'data.csv'

# Write values only: no index column and no header row.
X_test_df.to_csv(csv_file, index=False, header=False)
