In [377]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # Used if you were splitting here


In [378]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

print("Training Data Shape:", df_train.shape)
print("Testing Data Shape:", df_test.shape)


Training Data Shape: (1200, 81)
Testing Data Shape: (260, 80)


In [379]:
# Save 'Id' columns for submission
test_ID = df_test['Id']

In [380]:
# Drop the original 'Id' column from the dataframes
df_train.drop("Id", axis=1, inplace=True)
df_test.drop("Id", axis=1, inplace=True)


Outlier Variable Values in Training Data Removal

In [381]:
# Remove samples based on Target Value and key predictors
X_train_raw = df_train.drop(columns=['HotelValue'])
y_train_raw = df_train['HotelValue']
initial_row_count = len(df_train)

# 1. Target-based cleaning: Remove extreme values (bottom 0.1% and top 0.1%)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# 2. Predictor-based cleaning
if 'UsableArea' in df_train.columns:
    outlier_mask &= (df_train['UsableArea'] < 4000)

if 'OverallQuality' in df_train.columns and 'UsableArea' in df_train.columns:
    outlier_mask &= ~((df_train['OverallQuality'] < 3) & (df_train['UsableArea'] > 3000))

# Apply the mask to *df_train* as well
df_train = df_train[outlier_mask].copy()

# Update X_train and y_train after cleaning
X_train = df_train.drop(columns=['HotelValue']).copy()
y_train = df_train['HotelValue'].copy()

print(f"Rows removed due to extreme outliers: {initial_row_count - len(df_train)}")



Rows removed due to extreme outliers: 6


In [382]:
# # 2. Predictor-based cleaning (Common for this type of dataset)
# # Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
# if 'UsableArea' in X_train_raw.columns:
#     outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# # Remove properties with poor OverallQuality and high UsableArea (often errors)
# if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
#     outlier_mask &= ~((X_train_raw['OverallQuality'] < 3) & (X_train_raw['UsableArea'] > 3000))

# # Apply the mask to both features and target
# X_train = X_train_raw[outlier_mask].copy()
# y_train = y_train_raw[outlier_mask].copy()

# df_train = df_train[outlier_mask].copy()


# print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")

In [383]:

missing_counts = df_train.isna().sum().astype(np.int64)
print(missing_counts)

PropertyClass         0
ZoningCategory        0
RoadAccessLength    223
LandArea              0
RoadType              0
                   ... 
MonthSold             0
YearSold              0
DealType              0
DealCondition         0
HotelValue            0
Length: 80, dtype: int64


Clearing the columns with many NaN values

In [384]:

missing_counts = (
    df_train[['ExtraFacility', 'ServiceLaneType', 'BoundaryFence','LoungeQuality','FacadeType']]
    .isna()
    .sum()
    .astype(np.int64)
)

print(missing_counts)

ExtraFacility      1148
ServiceLaneType    1119
BoundaryFence       960
LoungeQuality       558
FacadeType          699
dtype: int64


In [385]:
missing_counts = (
    df_test[['ExtraFacility', 'ServiceLaneType', 'BoundaryFence','LoungeQuality','FacadeType']]
    .isna()
    .sum()
    .astype(np.int64)
)

print(missing_counts)

ExtraFacility      252
ServiceLaneType    244
BoundaryFence      216
LoungeQuality      130
FacadeType         170
dtype: int64


In [386]:


print(df_train.shape)
print(df_test.shape)
df_train.drop(columns=[
    'ExtraFacility', 'ServiceLaneType', 
    'BoundaryFence', 'FacadeType','LoungeQuality'
],errors='ignore', inplace=True)

df_test.drop(columns=[
    'ExtraFacility', 'ServiceLaneType', 
    'BoundaryFence', 'FacadeType','LoungeQuality'
],errors='ignore', inplace=True)

print(df_train.shape)
print(df_test.shape)


(1194, 80)
(260, 79)
(1194, 75)
(260, 74)


For some columns very less amount of data is missing so its better to remove those rows as it will confuse our model

In [387]:

print(df_train[['ElectricalSystem']].isna().sum().astype(np.int64))
print(df_test[['ElectricalSystem']].isna().sum().astype(np.int64))


ElectricalSystem    1
dtype: int64
ElectricalSystem    0
dtype: int64


In [388]:
df_train.dropna(subset=['ElectricalSystem'], inplace=True)

Checking for duplicates

In [389]:
print(df_train.duplicated().sum())
print(df_test.duplicated().sum())

0
0


Possible incosistencies :


RenovationYear < ConstructionYear

YearSold < ConstructionYear


In [390]:
df_cleaned1 = df_train[
    (df_train['RenovationYear'] >= df_train['ConstructionYear']) &
    (df_train['YearSold'] >= df_train['ConstructionYear'])
].copy()

print("Original rows:", len(df_train))
print("Cleaned rows:", len(df_cleaned1))
print("Rows removed:", len(df_train) - len(df_cleaned1))

df_train = df_cleaned1


df_cleaned2 = df_test[
    (df_test['RenovationYear'] >= df_test['ConstructionYear']) &
    (df_test['YearSold'] >= df_test['ConstructionYear'])
].copy()

print("Original rows:", len(df_test))
print("Cleaned rows:", len(df_cleaned2))
print("Rows removed:", len(df_test) - len(df_cleaned2))

df_test = df_cleaned2

Original rows: 1193
Cleaned rows: 1193
Rows removed: 0
Original rows: 260
Cleaned rows: 260
Rows removed: 0


Creating temporal features and removing original features

For Basement

In [391]:
# Merge Basement Features into Weighted Quality Score
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

# Fill NaN values
df_train['BasementFacilitySF1'] = df_train['BasementFacilitySF1'].fillna(0)
df_train['BasementFacilitySF2'] = df_train['BasementFacilitySF2'].fillna(0)

# Map types to scores
df_train['Type1_Score'] = df_train['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
df_train['Type2_Score'] = df_train['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)

# Calculate weighted quality score
df_train['TotalBasementScore'] = (df_train['Type1_Score'] * df_train['BasementFacilitySF1']) + (df_train['Type2_Score'] * df_train['BasementFacilitySF2'])
df_train['BasementFinishedSF'] = df_train['BasementFacilitySF1'] + df_train['BasementFacilitySF2']

# Drop original basement facility columns
df_train.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                 'BasementFacilitySF1', 'BasementFacilitySF2',
                 'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)


Merging basement features...


In [392]:
# Fill NaN values
df_test['BasementFacilitySF1'] = df_test['BasementFacilitySF1'].fillna(0)
df_test['BasementFacilitySF2'] = df_test['BasementFacilitySF2'].fillna(0)

# Map types to scores
df_test['Type1_Score'] = df_test['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
df_test['Type2_Score'] = df_test['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)

# Calculate weighted quality score
df_test['TotalBasementScore'] = (df_test['Type1_Score'] * df_test['BasementFacilitySF1']) + (df_test['Type2_Score'] * df_test['BasementFacilitySF2'])
df_test['BasementFinishedSF'] = df_test['BasementFacilitySF1'] + df_test['BasementFacilitySF2']

# Drop original basement facility columns
df_test.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                 'BasementFacilitySF1', 'BasementFacilitySF2',
                 'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

For Pool

In [393]:
print("Engineering Pool features...")
# Define a quality map for PoolQuality. 
# 'None' (or NaN) = 0, 'Fa' (Fair) = 1, 'Ex' (Excellent) = 4.
# Added 'TA' (Typical) and 'Gd' (Good) as they are common.
pool_quality_map = {
    'None': 0,
    'Fa': 1,
    'Ex': 2,
}

# Fill NaN values first. 'PoolArea' NaNs mean 0 area.
df_train['SwimmingPoolArea'] = df_train['SwimmingPoolArea'].fillna(0)
df_train['PoolQuality'] = df_train['PoolQuality'].fillna('None')

# Map quality strings to numeric scores
df_train['PoolQuality_Score'] = df_train['PoolQuality'].map(pool_quality_map).fillna(0)

# Create the new feature by multiplying quality by area
df_train['TotalPoolScore'] = df_train['PoolQuality_Score'] * df_train['SwimmingPoolArea']

# Now drop the original columns since they are combined
df_train.drop(columns=['PoolQuality', 'SwimmingPoolArea','PoolQuality_Score'],
         errors='ignore', inplace=True)

Engineering Pool features...


In [394]:
# Fill NaN values first. 'PoolArea' NaNs mean 0 area.
df_test['SwimmingPoolArea'] = df_test['SwimmingPoolArea'].fillna(0)
df_test['PoolQuality'] = df_test['PoolQuality'].fillna('None')

# Map quality strings to numeric scores
df_test['PoolQuality_Score'] = df_test['PoolQuality'].map(pool_quality_map).fillna(0)

# Create the new feature by multiplying quality by area
df_test['TotalPoolScore'] = df_test['PoolQuality_Score'] * df_test['SwimmingPoolArea']

# Now drop the original columns since they are combined
df_test.drop(columns=['PoolQuality', 'SwimmingPoolArea','PoolQuality_Score'],
         errors='ignore', inplace=True)

For Open Porch/Veranda

In [395]:
# Merge Porch/Veranda Features
print("Merging porch features...")

df_train['TotalPorchArea'] = (
    df_train['OpenVerandaArea'].fillna(0)+
    df_train['EnclosedVerandaArea'].fillna(0) + 
    df_train['SeasonalPorchArea'].fillna(0) + 
    df_train['ScreenPorchArea'].fillna(0)
)
df_train.drop(columns=['OpenVerandaArea','EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
         errors='ignore', inplace=True)

Merging porch features...


In [396]:
df_test['TotalPorchArea'] = (
    df_test['OpenVerandaArea'].fillna(0)+
    df_test['EnclosedVerandaArea'].fillna(0) + 
    df_test['SeasonalPorchArea'].fillna(0) + 
    df_test['ScreenPorchArea'].fillna(0)
)
df_test.drop(columns=['OpenVerandaArea','EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
         errors='ignore', inplace=True)

In [397]:

df_train['TotalOutdoorArea'] = (df_train['TerraceArea'] + df_train['TotalPorchArea']).fillna(0)
df_train['TotalSF'] = (df_train['GroundFloorArea'] + df_train['UpperFloorArea'] + df_train['ParkingArea'] + df_train['TotalOutdoorArea']).fillna(0)

df_train['TotalBaths'] = (df_train['FullBaths'] + 0.5 * df_train['HalfBaths'] +
                    df_train['BasementFullBaths'] + 0.5 * df_train['BasementHalfBaths']).fillna(0)

df_train['OverallScore'] = (df_train['OverallQuality'] + df_train['OverallCondition']) / 2.0 # Assumes these columns were NOT dropped

# --- 2. Temporal Features ---
df_train['Age'] = df_train['YearSold'] - df_train['ConstructionYear']
df_train['YearsSinceRemodel'] = df_train['YearSold'] - df_train['RenovationYear']
df_train['YearsSinceRemodel'] = np.where(df_train['YearsSinceRemodel'] < 0, 0, df_train['YearsSinceRemodel'])
df_train.loc[df_train['RenovationYear'] == df_train['ConstructionYear'], 'YearsSinceRemodel'] = df_train['Age']

# --- 3. Interaction Feature (Example) ---
df_train['Qual_x_GroundSF'] = df_train['OverallQuality'] * df_train['GroundFloorArea'] # Assumes these columns were NOT dropped



In [398]:
df_test['TotalOutdoorArea'] = (df_test['TerraceArea'] + df_test['TotalPorchArea']).fillna(0)
df_test['TotalSF'] = (df_test['GroundFloorArea'] + df_test['UpperFloorArea'] + df_test['ParkingArea'] + df_test['TotalOutdoorArea']).fillna(0)

df_test['TotalBaths'] = (df_test['FullBaths'] + 0.5 * df_test['HalfBaths'] +
                    df_test['BasementFullBaths'] + 0.5 * df_test['BasementHalfBaths']).fillna(0)

df_test['OverallScore'] = (df_test['OverallQuality'] + df_test['OverallCondition']) / 2.0 # Assumes these columns were NOT dropped

# --- 2. Temporal Features ---
df_test['Age'] = df_test['YearSold'] - df_test['ConstructionYear']
df_test['YearsSinceRemodel'] = df_test['YearSold'] - df_test['RenovationYear']
df_test['YearsSinceRemodel'] = np.where(df_test['YearsSinceRemodel'] < 0, 0, df_test['YearsSinceRemodel'])
df_test.loc[df_test['RenovationYear'] == df_test['ConstructionYear'], 'YearsSinceRemodel'] = df_test['Age']

# --- 3. Interaction Feature (Example) ---
df_test['Qual_x_GroundSF'] = df_test['OverallQuality'] * df_test['GroundFloorArea'] # Assumes these columns were NOT dropped

# --- 4. Feature Reduction/Drop ---
drop_cols = ['GroundFloorArea', 'UpperFloorArea', 
                'ConstructionYear', 'RenovationYear', 
                'FullBaths', 'HalfBaths','ParkingArea',
                'BasementFullBaths', 'BasementHalfBaths',
                'TerraceArea', 'OverallQuality', 'OverallCondition','MonthSold','YearSold']
df_train.drop(columns=drop_cols, inplace=True)
df_test.drop(columns=drop_cols, inplace=True)


Giving New Ordinal Mapping to Parameters for Better comparing

Parking Features New Mapping

In [399]:
# --- NEW ORDINAL PARKING MAPPING ---
# Define maps for ordinal parking features
quality_map_5pt = {
    'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0
}
parking_finish_map = {
    'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0
}

# Overwrite categorical columns with their new numerical scores

# Impute and map Quality
df_train['ParkingQuality'] = df_train['ParkingQuality'].fillna('None').map(quality_map_5pt).fillna(0)

# Impute and map Condition
df_train['ParkingCondition'] = df_train['ParkingCondition'].fillna('None').map(quality_map_5pt).fillna(0)

# Impute and map Finish
df_train['ParkingFinish'] = df_train['ParkingFinish'].fillna('None').map(parking_finish_map).fillna(0)

In [400]:
# Impute and map Quality
df_test['ParkingQuality'] = df_test['ParkingQuality'].fillna('None').map(quality_map_5pt).fillna(0)

# Impute and map Condition
df_test['ParkingCondition'] = df_test['ParkingCondition'].fillna('None').map(quality_map_5pt).fillna(0)

# Impute and map Finish
df_test['ParkingFinish'] = df_test['ParkingFinish'].fillna('None').map(parking_finish_map).fillna(0)

New Conditional Mapping

In [401]:
# --- NEW PROPERTY FUNCTIONALITY MAPPING ---
# This feature represents deductions from 'Typical'
functionality_map = {
    'Typ': 7,  # Typical
    'Min1': 6, # Minor Deductions 1
    'Min2': 5, # Minor Deductions 2
    'Mod': 4,  # Moderate Deductions
    'Maj1': 3, # Major Deductions 1
    'Maj2': 2, # Major Deductions 2
    'Sev': 1,  # Severely Damaged
    'None': 0  # Assuming 'None' is worse than 'Sev' or not applicable
}
##--- NEW EXTERIOR QUALITY/CONDITION MAPPING ---
df_train['ExteriorQuality'] = df_train['ExteriorQuality'].fillna('None').map(quality_map_5pt).fillna(0)
df_train['ExteriorCondition'] = df_train['ExteriorCondition'].fillna('None').map(quality_map_5pt).fillna(0)

# BasementCondition (uses 5-point map)
df_train['BasementCondition'] = df_train['BasementCondition'].fillna('None').map(quality_map_5pt).fillna(0)

# BasementExposure (custom map)
exposure_map = {
    'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0
}
df_train['BasementExposure'] = df_train['BasementExposure'].fillna('None').map(exposure_map).fillna(0)

# --- NEW KITCHEN/HEATING QUALITY MAPPING ---
df_train['KitchenQuality'] = df_train['KitchenQuality'].fillna('None').map(quality_map_5pt).fillna(0)
df_train['HeatingQuality'] = df_train['HeatingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
# --- END NEW KITCHEN/HEATING SECTION ---
# Impute and map PropertyFunctionality. 
# Use fillna('Typ') if 'None' should be treated as 'Typical'
df_train['PropertyFunctionality'] = df_train['PropertyFunctionality'].fillna('None').map(functionality_map).fillna(0)

In [402]:

##--- NEW EXTERIOR QUALITY/CONDITION MAPPING ---
df_test['ExteriorQuality'] = df_test['ExteriorQuality'].fillna('None').map(quality_map_5pt).fillna(0)
df_test['ExteriorCondition'] = df_test['ExteriorCondition'].fillna('None').map(quality_map_5pt).fillna(0)

# BasementCondition (uses 5-point map)
df_test['BasementCondition'] = df_test['BasementCondition'].fillna('None').map(quality_map_5pt).fillna(0)

df_test['BasementExposure'] = df_test['BasementExposure'].fillna('None').map(exposure_map).fillna(0)

# --- NEW KITCHEN/HEATING QUALITY MAPPING ---
df_test['KitchenQuality'] = df_test['KitchenQuality'].fillna('None').map(quality_map_5pt).fillna(0)
df_test['HeatingQuality'] = df_test['HeatingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
# --- END NEW KITCHEN/HEATING SECTION ---
# Impute and map PropertyFunctionality. 
# Use fillna('Typ') if 'None' should be treated as 'Typical'
df_test['PropertyFunctionality'] = df_test['PropertyFunctionality'].fillna('None').map(functionality_map).fillna(0)

Normalize the skewed HotelValue

In [403]:
df_train['HotelValue_Log'] = np.log1p(y_train)


Pre Processed CSVs

In [404]:
df_train.to_csv('preprocesssed_train.csv',index=False)
df_test.to_csv('preprocesssed_test.csv',index=False)

print(df_train.shape)
print(df_test.shape)

pd.set_option('display.max_rows',None)

missing_values = df_train.isna().sum().astype(np.int64)

print(missing_values)

pd.reset_option('display.max_rows')

(1193, 63)
(260, 61)
PropertyClass                0
ZoningCategory               0
RoadAccessLength           223
LandArea                     0
RoadType                     0
PlotShape                    0
LandElevation                0
UtilityAccess                0
PlotConfiguration            0
LandSlope                    0
District                     0
NearbyTransport1             0
NearbyTransport2             0
PropertyType                 0
HotelStyle                   0
RoofDesign                   0
RoofMaterial                 0
ExteriorPrimary              0
ExteriorSecondary            0
FacadeArea                   7
ExteriorQuality              0
ExteriorCondition            0
FoundationType               0
BasementHeight              29
BasementCondition            0
BasementExposure             0
BasementUnfinishedSF         0
BasementTotalSF              0
HeatingType                  0
HeatingQuality               0
CentralAC                    0
ElectricalSystem  

In [405]:
# Load your currently saved processed data
# X_train = pd.read_csv('preprocesssed_train.csv')
# X_test = pd.read_csv('preprocesssed_test.csv')
# y_train_log = X_train['HotelValue_Log']
# y_train = X_train['HotelValue']
# X_train.drop(columns=['HotelValue_Log', 'HotelValue'], inplace=True)

# print("--- Columns in X_train ---")
# # Use the .tolist() method for a clean, simple list output
# print(X_train.columns.tolist())

# print("\n--- Columns in X_test ---")
# print(X_test.columns.tolist())

# numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
# categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# numerical_features_test = X_test.select_dtypes(include=np.number).columns.tolist()
# categorical_features_test = X_test.select_dtypes(include=['object', 'category']).columns.tolist

# print("\n--- 3. Feature Engineering Complete ---")
# print(f"Number of numerical features: {len(numerical_features)}")
# print(f"Number of categorical features: {len(categorical_features)}")
# print(f"Final training features shape: {X_train.shape}")

In [406]:
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer

# # Numerical Transformer: Impute, Scale, and add Polynomial Features
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()), 
#     # The degree=2 poly features were commented out in your previous code to speed things up. 
#     # I'll keep them commented unless performance is a concern.
#     # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
# ])


# # Categorical Transformer: Impute and One-Hot Encode
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ],
#     remainder='drop'
# )

# preprocessor.fit(X_train)

# X_train_transformed = preprocessor.transform(X_train)
# X_train_processed_df = pd.DataFrame(X_train_transformed)

# X_test_transformed = preprocessor.transform(X_test)
# X_test_processed_df = pd.DataFrame(X_test_transformed)

# # Reattach the target to the training data
# X_train_processed_df['HotelValue_Log'] = y_train_log.values 
# X_train_processed_df['HotelValue'] = y_train.values 

# X_train_processed_df.fillna(0, inplace=True) 
# X_test_processed_df.fillna(0, inplace=True)

# # --- 4. Final Save ---
# X_train_processed_df.to_csv('final_processed_train.csv', index=False)
# X_test_processed_df.to_csv('final_processed_test.csv', index=False)



In [407]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


X_train = pd.read_csv('preprocesssed_train.csv')
X_test = pd.read_csv('preprocesssed_test.csv')
y_train_log = X_train['HotelValue_Log']
y_train = X_train['HotelValue']
X_train.drop(columns=['HotelValue_Log', 'HotelValue'], inplace=True)

# 1. Combine for Unified One-Hot Encoding
combined_df = pd.concat([X_train, X_test], axis=0, ignore_index=True)

# 2. Convert all remaining object columns (Nominal Categorical) to dummy variables
X_combined_encoded = pd.get_dummies(combined_df, drop_first=True)

# 3. Separate back into train/test sets
# Note: The shapes of these DataFrames should now be identical (except for the row count)
X_train_final = X_combined_encoded.iloc[:len(X_train)]
X_test_final = X_combined_encoded.iloc[len(X_train):]

# 4. Final Sanity Check and NaN Guard (Essential Fix)
# This step ensures no NaNs survive encoding/scaling and caused your previous ValueError.
X_train_final.fillna(0, inplace=True)
X_test_final.fillna(0, inplace=True)

# 5. Scaling (Fit on Train, Transform on Test)
scaler = StandardScaler()

# Fit only on TRAIN and transform both
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# Convert back to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_final.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_final.columns)

# 6. Final Save (Attach target to train set)
X_train_scaled_df['HotelValue_Log'] = y_train_log.values 
 
X_train_scaled_df.to_csv('final_processed_train.csv', index=False)
X_test_scaled_df.to_csv('final_processed_test.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_final.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final.fillna(0, inplace=True)


In [409]:
df_train = pd.read_csv('final_processed_train.csv')
df_test = pd.read_csv('final_processed_test.csv')

print(df_train.shape)
print(df_test.shape)

pd.set_option('display.max_rows',None)

missing_values = df_train.isna().sum().astype(np.int64)

print(missing_values)

pd.reset_option('display.max_rows')


(1193, 183)
(260, 182)
PropertyClass                0
RoadAccessLength             0
LandArea                     0
FacadeArea                   0
ExteriorQuality              0
ExteriorCondition            0
BasementCondition            0
BasementExposure             0
BasementUnfinishedSF         0
BasementTotalSF              0
HeatingQuality               0
LowQualityArea               0
UsableArea                   0
GuestRooms                   0
Kitchens                     0
KitchenQuality               0
TotalRooms                   0
PropertyFunctionality        0
Lounges                      0
ParkingConstructionYear      0
ParkingFinish                0
ParkingCapacity              0
ParkingQuality               0
ParkingCondition             0
ExtraFacilityValue           0
TotalBasementScore           0
BasementFinishedSF           0
TotalPoolScore               0
TotalPorchArea               0
TotalOutdoorArea             0
TotalSF                      0
TotalBaths      