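Download the dataset (`Housing.csv`) from https://www.kaggle.com/datasets/yasserh/housing-prices-dataset and place it in the same directory as this notebook; the first code cell reads it by that relative file name.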

In [None]:
import pandas as pd

# Read the housing-prices CSV; the path is relative, so the file must sit in
# the notebook's working directory.
data = pd.read_csv('Housing.csv')

# Quick look at what was loaded: the (rows, columns) shape, then the first rows.
print(data.shape, data.head(), sep='\n')

In [None]:
# Report how many missing values each column holds ...
print(data.isna().sum())
# ... then keep only the rows that are complete in every column.
data = data.dropna(axis=0, how='any')


In [None]:
from sklearn.preprocessing import StandardScaler

# Features are every column except the target. Selecting by name (rather than
# by position) keeps this correct even if 'price' is not the first CSV column.
X = data.drop(columns=['price'])
y = data['price']

# One-hot encode the categorical columns. drop_first=True is the right choice
# for a linear model with an intercept: keeping every level of a category
# makes its dummy columns sum to 1, which is perfectly collinear with the
# intercept (the "dummy variable trap"). The dropped level becomes the baseline.
X = pd.get_dummies(X, drop_first=True)

# Remember the encoded column layout so that new inputs can be aligned to it
# at prediction time.
original_columns = X.columns
print(original_columns)

# NOTE: both scalers below are fit on the full dataset, i.e. before the
# train/test split performed in a later cell, so test-set statistics leak into
# the scaling. For a strictly unbiased evaluation, fit them on the training
# rows only.
area_scaler = StandardScaler()
X['area'] = area_scaler.fit_transform(X[['area']])

# StandardScaler expects a 2-D array, hence the reshape to a single column.
# After this line y is an (n_rows, 1) NumPy array, no longer a Series.
price_scaler = StandardScaler()
y = price_scaler.fit_transform(y.to_numpy().reshape(-1, 1))

print(X.head())
print(y[:5])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Learned weights (one per encoded feature column) followed by the intercept.
print(model.coef_, model.intercept_, sep='\n')

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the model on the held-out split. y_test was standardized by
# price_scaler before the split, so the MSE below is on the standardized price
# scale, not in currency units.
y_pred = model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error: ", mse)
print("R2 Score: ", r2)

In [None]:
import matplotlib.pyplot as plt

# Price against area for the training rows, the held-out rows, and the model's
# predictions on the held-out rows.
#
# Fixes over the previous version of this cell:
#   * the series labelled "Predicted" was actually the *actual* test prices;
#     real model predictions are now plotted and every series is labelled for
#     what it is;
#   * 'area' was standardized earlier in the notebook, so it is converted back
#     to original units to match the "Area" axis label.
area_train = area_scaler.inverse_transform(X_train[['area']]).ravel()
area_test = area_scaler.inverse_transform(X_test[['area']]).ravel()

price_train = price_scaler.inverse_transform(y_train).ravel()
price_test = price_scaler.inverse_transform(y_test).ravel()
# reshape(-1, 1) guarantees the 2-D input inverse_transform requires.
price_test_pred = price_scaler.inverse_transform(
    model.predict(X_test).reshape(-1, 1)
).ravel()

fig, ax = plt.subplots()
ax.scatter(area_train, price_train, color='lightgray', label="Train (actual)")
ax.scatter(area_test, price_test, color='blue', label="Test (actual)")
ax.scatter(area_test, price_test_pred, color='red', label="Test (predicted)")
ax.set_xlabel("Area")
ax.set_ylabel("Price")
ax.set_title("Actual vs. predicted price by area")
ax.legend()
plt.show()



In [None]:
# How to give a new input to the model

def predict(features):
    """Predict the price of one house from a dict of raw feature values.

    Args:
        features: Mapping of raw column name to a single scalar value
            (numbers for numeric columns, strings for categorical ones).
            It is turned into a one-row DataFrame.

    Returns:
        The predicted price, converted back from the standardized scale with
        ``price_scaler``.

    Relies on the notebook-level ``original_columns``, ``area_scaler``,
    ``price_scaler`` and ``model`` created in earlier cells.
    """
    row = pd.get_dummies(pd.DataFrame(features, index=[0]))
    # Align the one-row frame with the training-time column layout: dummy
    # columns absent from this row are added as False, and columns the model
    # never saw are dropped.
    row = row.reindex(columns=original_columns, fill_value=False)
    # Apply the same 'area' scaling that was used on the training features.
    row['area'] = area_scaler.transform(row[['area']])
    scaled_price = model.predict(row)
    return price_scaler.inverse_transform(scaled_price)[0][0]

# Example: raw, un-encoded description of a single house. predict() takes care
# of the encoding and scaling.
prediction = predict(dict(
    area=4400,
    bedrooms=4,
    bathrooms=3,
    stories=2,
    mainroad='no',
    guestroom='yes',
    basement='no',
    hotwaterheating='yes',
    airconditioning='yes',
    parking=2,
    prefarea='yes',
    furnishingstatus='unfurnished',
))
print(f"Prediction: {prediction}")

In [None]:
# A little demonstration of reindex.

# Two identical numeric columns to start from.
df = pd.DataFrame({
    "length": [5, 10, 15, 20, 25, 30],
    "width": [5, 10, 15, 20, 25, 30],
})

# reindex(columns=...) reorders the existing columns to match the given list
# and creates any column that is missing ('height' here), filled with
# fill_value.
target_columns = ['width', 'length', 'height']
df = df.reindex(columns=target_columns, fill_value=0)

print(df)