In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer

In [24]:
# Read the training and test splits from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Report (rows, columns) for each frame.
print("Training Data Shape:", df_train.shape)
print("Testing Data Shape:", df_test.shape)


Training Data Shape: (1200, 81)
Testing Data Shape: (260, 80)


In [25]:
# Keep the test-set 'Id' values so they can be attached to the submission later.
test_ID = df_test.loc[:, 'Id']

In [26]:
# Remove the 'Id' column from both frames (the test ids were saved beforehand).
df_train.drop(columns=['Id'], inplace=True)
df_test.drop(columns=['Id'], inplace=True)

# Pull the target (HotelValue) out of the training features:
# pop() returns the column and removes it from df_train in a single step.
y_train_raw = df_train.pop('HotelValue')

In [27]:

# Number of missing entries in every training column.
missing_counts = df_train.isnull().sum().astype('int64')
print(missing_counts)

PropertyClass           0
ZoningCategory          0
RoadAccessLength      223
LandArea                0
RoadType                0
                     ... 
ExtraFacilityValue      0
MonthSold               0
YearSold                0
DealType                0
DealCondition           0
Length: 79, dtype: int64


Dropping the columns in which most values are NaN

In [11]:

# Missing-value counts for four columns that are candidates for removal.
missing_counts = df_train[
    ['PoolQuality', 'ExtraFacility', 'ServiceLaneType', 'BoundaryFence']
].isnull().sum().astype('int64')

print(missing_counts)

PoolQuality        1194
ExtraFacility      1154
ServiceLaneType    1125
BoundaryFence       963
dtype: int64


In [28]:

# Columns removed from the training features.
# NOTE(review): 'FacadeType' is dropped as well, although its missing count
# is not part of the four-column check in the previous cell -- confirm it
# belongs in this list.
# NOTE(review): only df_train is changed here; df_test still contains these
# columns at this point -- verify they are removed before prediction.
cols_to_drop = [
    'PoolQuality',
    'ExtraFacility',
    'ServiceLaneType',
    'BoundaryFence',
    'FacadeType',
]
df_train = df_train.drop(cols_to_drop, axis='columns')


For some columns only a few values are missing, so it is better to remove those rows, since the missing entries could confuse our model

In [29]:
# How many training rows lack a FacadeArea or ElectricalSystem value?
missing_counts = (
    df_train.loc[:, ['FacadeArea', 'ElectricalSystem']].isna().sum().astype(np.int64)
)
print(missing_counts)

FacadeArea          7
ElectricalSystem    1
dtype: int64


In [30]:
# Remove the few training rows that have no FacadeArea or ElectricalSystem value.
df_train.dropna(subset=['FacadeArea', 'ElectricalSystem'], inplace=True)

# The target was split off before this row filter, so it still holds the
# dropped rows. Re-select it by the surviving index labels to keep features
# and target aligned row-for-row (safe to re-run: selecting the same labels
# again is a no-op).
y_train_raw = y_train_raw.loc[df_train.index]
assert len(y_train_raw) == len(df_train), "features and target are out of sync"

In [31]:
# Re-check the same two columns after the row removal; both counts should be 0.
missing_counts = df_train[['FacadeArea', 'ElectricalSystem']].isnull().sum().astype('int64')

print(missing_counts)

FacadeArea          0
ElectricalSystem    0
dtype: int64


Impute NaN values with 'None' for features where missing means absence

In [32]:
# Basement-related columns
basement_cols = [
    'BasementHeight',
    'BasementCondition',
    'BasementExposure',
    'BasementFacilityType1',
    'BasementFacilityType2',
]

# Garage/Parking related columns (excluding ParkingConstructionYear)
parking_cat_cols = [
    'ParkingType',
    'ParkingFinish',
    'ParkingQuality',
    'ParkingCondition',
]

# Every column where a missing entry is treated as "feature absent":
# the two groups above plus the lounge quality rating.
none_fill_cols = basement_cols + parking_cat_cols + ['LoungeQuality']

# Replace NaN with the literal category 'None' in all of them in one assignment.
# NOTE(review): only df_train is filled here -- verify df_test receives the
# same treatment before prediction.
df_train[none_fill_cols] = df_train[none_fill_cols].fillna('None')

Checking for duplicates

In [34]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df_train.duplicated(keep='first').sum()

np.int64(0)

Possible inconsistencies

Inconsistency,Rows Found,Action
RenovationYear < ConstructionYear
YearSold < ConstructionYear
YearSold < RenovationYear