In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load the raw housing dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Housing.csv')

In [None]:
# Add two synthetic categorical features: proximity to schools and to public transport.
# The labels are drawn at random with the given probabilities (they are not measured
# values). A seeded generator is used so that Restart & Run All reproduces the same
# columns, the same saved CSV and therefore the same downstream correlations.
rng = np.random.default_rng(42)
proximity_labels = ['Far', 'Close']
df['ProximityToSchool'] = rng.choice(proximity_labels, size=len(df), p=[0.6, 0.4])
df['ProximityToTransport'] = rng.choice(proximity_labels, size=len(df), p=[0.4, 0.6])

# Persist the augmented dataset (without the index column) for later reuse.
df.to_csv('AdditionalHousing.csv', index=False)

In [None]:
# Handle missing values: discard every column that contains at least one NaN,
# then print a summary of the remaining columns.
# NOTE(review): axis='columns' removes whole columns, not rows — confirm this is intended.
df.dropna(axis='columns', inplace=True)
df.info()

In [None]:
# One-hot encode the categorical columns so every feature is numeric.
# drop_first=True keeps k-1 indicator columns for a category with k levels.
df = pd.get_dummies(data=df, drop_first=True)
df.info()

In [None]:
# Matplotlib line plot: price on the x-axis against area on the y-axis,
# with points connected in DataFrame row order.
fig, ax = plt.subplots()
ax.plot(df['price'], df['area'])
ax.set_title('Price-Area Line Plot')
ax.set_xlabel('Price')
ax.set_ylabel('Area')
plt.show()

In [None]:
# Seaborn heatmap of the pairwise correlations between all columns.
# corr_matrix is kept as a notebook-level name because the next cell reads it.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Rank every column by its correlation with the target variable (house price),
# largest value first.
price_corr = corr_matrix.loc[:, 'price'].sort_values(ascending=False)
print(price_corr)

In [None]:
# Train a scikit-learn Linear Regression model that predicts price from two features.
feature_columns = ['area', 'bathrooms']
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions: squared error, absolute error and R^2.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

In [None]:
# Decision Tree regressor for comparison, trained and scored on the same
# train/test split as the linear model.
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Same three metrics as for the linear model, so the results are directly comparable.
r2_tree = r2_score(y_test, y_pred_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
mse_tree = mean_squared_error(y_test, y_pred_tree)

print(f"MSE: {mse_tree}")
print(f"MAE: {mae_tree}")
print(f"R2: {r2_tree}")

In [None]:
# Visual comparison of both models: predicted price against actual price.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, label='Linear Regression', color='blue')
ax.scatter(y_test, y_pred_tree, label='Decision Tree Regressor', color='green')

# Dashed red diagonal marks where predicted == actual; points nearer to it are better.
price_range = [min(y_test), max(y_test)]
ax.plot(price_range, price_range, color='red', linestyle='--')

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs Predicted Prices')
ax.legend()
plt.show()