In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read train.csv into the working DataFrame; the later cells all operate on `df`.
df = pd.read_csv(filepath_or_buffer='train.csv')

# Left as the cell's last expression so the notebook displays the column index.
df.columns

In [None]:
# Distribution of SalePrice: histogram with a KDE curve overlaid, on a 10x5 figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='SalePrice', kde=True, ax=ax)
plt.show()

In [None]:
# Count missing values per column, most-affected columns first.
missing = df.isna().sum().sort_values(ascending=False)

# Print only the columns that actually have gaps. The full Series has one entry
# per column, so on a wide frame the zero-count rows drown out the ones that
# matter (and pandas truncates the middle of the listing once it grows past
# display.max_rows).
print(missing[missing > 0])

In [None]:
# Remove the PoolQC column from the working frame.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['PoolQC'], errors='ignore')

# Sanity check: should print False.
print('PoolQC' in df.columns)

In [None]:
# Restrict to numeric columns before computing correlations.
numeric_df = df.select_dtypes(include='number')

# Correlate every numeric column with SalePrice and keep the ten largest values.
# SalePrice itself is excluded first: it correlates perfectly with itself and
# would otherwise occupy the top slot, leaving only nine informative entries.
correlations = (
    numeric_df.drop(columns='SalePrice')
    .corrwith(df['SalePrice'])
    .sort_values(ascending=False)
    .head(10)
)

print(correlations)

In [None]:
# Scatter of OverallQual against SalePrice on a 5x5 figure.
plt.figure(figsize=(5, 5))
sns.scatterplot(x=df['OverallQual'], y=df['SalePrice'])
# Title spelling corrected ("Against"); this text is rendered on the figure.
plt.title('Overall House Quality Against Sale Price')
plt.show()

In [None]:
# Scatter of GrLivArea against SalePrice, drawn on an explicit 5x5 axes.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice', ax=ax)
ax.set_title('Area Against Sale Price')
plt.show()

In [None]:
# Remove outliers to improve prediction: rows whose GrLivArea exceeds 4000
# while their SalePrice is below 300000.

print('Before:', df.shape)
is_outlier = (df['GrLivArea'] > 4000) & (df['SalePrice'] < 300000)
df = df.drop(index=df.index[is_outlier])
print('After:', df.shape)

In [None]:
# Inputs for the first model: five feature columns as `x`, SalePrice as the target `y`.

features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    '1stFlrSF',
]

x, y = df[features], df['SalePrice']

In [None]:
# Using LinearRegression first

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score


# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion
# (fit() returns the estimator, so construction and training chain).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows.
predictions = model.predict(X_test)

# Report both metrics on the held-out split.
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print('Mean Absolute Error(MAE):', mae)
print('R2 Score:', r2)

In [None]:
# Add the 'Neighborhood' column to the earlier five features to see how the
# results are affected.

features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', '1stFlrSF', 'Neighborhood']

# Take an explicit copy: this frame is modified by the cells that follow, and
# assigning into a bare column selection of `df` triggers pandas'
# SettingWithCopyWarning.
x = df[features].copy()
y = df['SalePrice']


In [None]:
# Fill missing values in the numeric columns with 0.
# fillna with a per-column mapping returns a new frame, so nothing is assigned
# into a selection of `df` (an in-place .loc assignment on such a selection can
# raise SettingWithCopyWarning). Columns absent from the mapping, i.e. the
# non-numeric ones, are left untouched.

numeric_cols = x.select_dtypes(include='number').columns
x = x.fillna({col: 0 for col in numeric_cols})

In [None]:
# One-hot encode the text-valued Neighborhood column so the model receives numbers;
# drop_first=True omits the indicator column for the first category.
x = pd.get_dummies(data=x, columns=['Neighborhood'], drop_first=True)

print('New Shape:', x.shape)


In [None]:
# Trying LinearRegression once again after adding a new column('Neighborhood')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score


# Same 80/20 split and seed as the first model, now on the encoded feature frame.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Refit a fresh linear regression on the new training portion.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows.
predictions = model.predict(X_test)

# Report the metrics for the extended feature set.
new_mae = mean_absolute_error(y_test, predictions)
new_r2 = r2_score(y_test, predictions)
print('New Mean Absolute Error(MAE):', new_mae)
print('New R2 Score:', new_r2)

In [None]:
# Using RandomForest to see if results can be improved any further

from sklearn.ensemble import RandomForestRegressor

# Train a 100-tree random forest on the split left in scope by the preceding
# cell; random_state fixes the forest's randomness so results repeat.
# fit() returns the estimator, so construction and training chain.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Report the same two metrics used for the linear models.
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
print('RF Mean Absolute Error(MAE):', rf_mae)
print('RF R2 Score:', rf_r2)