In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
data = pd.read_csv("../data/train.csv")

In [16]:
df = data

In [17]:
# Display basic information about the dataset and Missing Value Information 
df_info = df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [18]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [26]:
df_description = df.describe(include="all")
print(df_description)

                 Id   MSSubClass MSZoning  LotFrontage        LotArea Street  \
count   1460.000000  1460.000000     1460  1460.000000    1460.000000   1460   
unique          NaN          NaN        5          NaN            NaN      2   
top             NaN          NaN       RL          NaN            NaN   Pave   
freq            NaN          NaN     1151          NaN            NaN   1454   
mean     730.500000    56.897260      NaN    69.863699   10516.828082    NaN   
std      421.610009    42.300571      NaN    22.027677    9981.264932    NaN   
min        1.000000    20.000000      NaN    21.000000    1300.000000    NaN   
25%      365.750000    20.000000      NaN    60.000000    7553.500000    NaN   
50%      730.500000    50.000000      NaN    69.000000    9478.500000    NaN   
75%     1095.250000    70.000000      NaN    79.000000   11601.500000    NaN   
max     1460.000000   190.000000      NaN   313.000000  215245.000000    NaN   

       Alley LotShape LandContour Utili

In [20]:
df_missing_values = df.isnull().sum()
print(df_missing_values)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [27]:
# Filling missing numerical values with the median of the respective column
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

df[num_cols] = df[num_cols].apply(lambda x: x.fillna(x.median()))

In [28]:
#To check if the missing numerical values have been filled 
print(df.isnull().sum())

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64


In [34]:
# Fill missing categorical values with 'None' 
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna('None')

In [35]:
#To check if the missing categorical values have been filled
print(df_missing_values)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


### Categorical Encoding and Feature scaling

In [32]:
#Seperating the target variable from the features
X = df.drop(columns=['SalePrice', 'Id'])
y = df['SalePrice']


In [33]:
#Identifying categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [36]:
#Defining Transformations
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features), #To standardize the numerical features
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) #To one-hot encode the categorical features

])

In [37]:
#Applying transformations to the dataset
X_transformed = preprocessor.fit_transform(X)

In [38]:
#Shape of the transformed dataset
print(X_transformed.shape)

(1460, 303)


# Feature Engineering : Creating Two New Features

In [39]:
#Age of the House in Years
df["HouseAge"] = 2025 - df["YearBuilt"]

#Total Livable Area in the House (above the basement)
df["TotalArea"] = df["GrLivArea"] + df["TotalBsmtSF"]

#Adding Two New Features 
X["HouseAge"] = df["HouseAge"]
X["TotalArea"] = df["TotalArea"]



Reasoning for Creation of Features Above
HouseAge: Older houses migh be lower in price due to wear nd tear, while newer houses could be more valuable
TotalArea : A larger livable area would have a positive correlation with price

In [None]:
#Updating Feature List
numerical_features.extend(["HouseAge", "TotalArea"])

In [None]:
#Reapply transformations to include the new features
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

Training of Models

In [45]:
#Initializing Models
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate =0.1, random_state=42)

In [46]:
#Perform cross-validation and store results
models = [lr, rf, gbr]
cv_results = {}

In [None]:
for name, model in models.items():
    scores = cross_val_score(model, X_transformed, y, cv=5, scoring='neg_mean_squared_error')
    cv_results[name] = scores

cv_results = {}

In [82]:
# Initialize models
lr_simple = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)


In [54]:
#Loading the Traning Dataset
df_test = pd.read_csv("../data/train.csv")


In [55]:
#Store the Test IDs for submissiion
test_ids = df_test['Id']

In [56]:
#Drop the ID column 
X_test = df_test.drop(columns=['Id'])

In [70]:
# Recreate feature-engineered columns
X_test["HouseAge"] = 2025 - X_test["YearBuilt"]
X_test["TotalArea"] = X_test["GrLivArea"] + X_test["TotalBsmtSF"]

In [71]:
#Handling Missing Values 
X_test[numerical_features] = X_test[numerical_features].apply(lambda x: x.fillna(x.median()))
X_test[categorical_features] = X_test[categorical_features].fillna('None')


In [87]:
#Applying the transformations to the test dataset
X_test_transformed = preprocessor.transform(X_test)


In [88]:
X_test_transformed = preprocessor.fit_transform(X_test)

In [90]:
# Predict house prices using the trained Random Forest model
y_test_pred = rf.predict(X_test_transformed)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Modelling, Evaluation and Cross Validation

In [73]:
# Initialize models
lr_simple = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)


In [74]:
 # Simple linear regression using the first feature
lr_simple.fit(X_transformed[:, [0]], y) 
y_pred_simple = lr_simple.predict(X_transformed[:, [0]])

In [75]:
rf.fit(X_transformed, y)  # Random Forest
y_pred_rf = rf.predict(X_transformed)

In [76]:
gbr.fit(X_transformed, y)  # Gradient Boosting
y_pred_gbr = gbr.predict(X_transformed)

In [77]:
# Evaluation function
def evaluate_model(model_name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"MSE: {mse:.4f}")
    print(f"R-squared: {r2:.4f}\n")

# Evaluate all models
evaluate_model("Simple Linear Regression", y, y_pred_simple)
evaluate_model("Random Forest", y, y_pred_rf)
evaluate_model("Gradient Boosting", y, y_pred_gbr)


Simple Linear Regression Performance:
MSE: 6261986323.2575
R-squared: 0.0071

Random Forest Performance:
MSE: 125378412.2365
R-squared: 0.9801

Gradient Boosting Performance:
MSE: 209387129.1452
R-squared: 0.9668

