In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('/content/Real_Estate.csv')


df=df.drop(columns=['Transaction date'])



# Fill null values (example: fill with median for numerical columns)
df.fillna(df.median(numeric_only=True), inplace=True)

# Check for outliers using Z-score (consider only numerical columns)
numerical_cols = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs((df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std())

# Set a threshold for outlier detection
threshold = 3
outliers = z_scores > threshold

# Replace outliers with median values
for column in numerical_cols:
    df.loc[outliers[column], column] = df[column].median()



# Initialize the StandardScaler
# Define features and target
X = df.drop(columns=['House price of unit area'])
Y = df['House price of unit area']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# Initialize and fit the StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)





# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the test data
Y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Output the evaluation metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)


Mean Absolute Error (MAE): 7.998808819507606
Mean Squared Error (MSE): 93.93194121762492
R-squared (R2): 0.5601858402174962


In [57]:

example_row = (4.9,639.6198,8,24.975468641126400,121.4765643977470)

# Convert the row into a NumPy array and reshape it
input_array = np.array([example_row]).reshape(1, -1)

# Define a function for making predictions
def predict_house_price(input_array):
    # Scale the input data
    scaled_input = scaler.transform(input_array)

    # Make prediction
    prediction = model.predict(scaled_input)
    return prediction[0]

# Predict house price based on the hardcoded row
predicted_price = predict_house_price(input_array)

# Output the result
print(f"\nPredicted house price of unit area for the selected row: {predicted_price:.2f}")


Predicted house price of unit area for the selected row: 3894.68




In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [63]:
# Load the dataset
df = pd.read_csv('/content/Real_Estate.csv')
# Assuming df is your DataFrame
df = df.drop(columns=['Transaction date'])

In [64]:
df.tail()

Unnamed: 0,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
409,18.3,170.1289,6,24.981186,121.486798,29.09631
410,11.9,323.6912,2,24.95007,121.483918,33.871347
411,0.0,451.6419,8,24.963901,121.543387,25.255105
412,35.9,292.9978,5,24.997863,121.558286,25.28562
413,12.0,90.45606,6,24.952904,121.526395,37.580554


In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [77]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['House price of unit area'])
y = df['House price of unit area']




In [78]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [80]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'R²: {r2}')


MSE: 145.46209754507166
R²: 0.4741866928483548


In [81]:
importances = model.feature_importances_
feature_importances = pd.DataFrame(importances, index=X.columns, columns=['Importance'])
print(feature_importances.sort_values(by='Importance', ascending=False))


                                     Importance
Distance to the nearest MRT station    0.605370
Number of convenience stores           0.143083
Latitude                               0.114393
Longitude                              0.097938
House age                              0.039215


In [82]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'R²: {r2}')


MSE: 134.65583405073835
R²: 0.5132489450900213


In [83]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

randomized_search = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_dist, n_iter=10, cv=5)
randomized_search.fit(X_train, y_train)
best_model = randomized_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Model MSE: {mse}')
print(f'Best Model R²: {r2}')


Best Model MSE: 130.64551444784394
Best Model R²: 0.5277453633922502


In [84]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_scaled)


In [85]:
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Split the data
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Train the model
model = GradientBoostingRegressor()
model.fit(X_train_poly, y_train)

# Make predictions
y_pred = model.predict(X_test_poly)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE with Polynomial Features: {mse}')
print(f'R² with Polynomial Features: {r2}')


MSE with Polynomial Features: 157.27774042617358
R² with Polynomial Features: 0.43147575739308985


In [86]:

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the best model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Model MSE: {mse}')
print(f'Best Model R²: {r2}')


Best Model MSE: 132.76303538959658
Best Model R²: 0.5200909935726425
