In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pickle
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the data
df = pd.read_csv('original_apartment_data_analytics_hs24_with_lat_lon.csv', sep=',', encoding='utf-8')
print(f"Original data shape: {df.shape}")

Original data shape: (819, 18)


In [4]:
# Data cleaning (similar to the feature_engineering.ipynb)
print('Total apartment before data cleaning:', len(df))
df = df.dropna()
df = df.drop_duplicates()
df = df.loc[(df['price'] >= 750) & (df['price'] <= 8000)]
print('Total apartment after data cleaning:', len(df))

Total apartment before data cleaning: 819
Total apartment after data cleaning: 804


In [5]:
# Function to evaluate model performance
def model_performance(features, df, random_forest_model=RandomForestRegressor(random_state=42)):
    df = df.sample(frac=1, random_state=42)
    X, y = df[features], df['price']
    scores = cross_val_score(random_forest_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
    print('CV results RMSE:', np.round(scores))
    print('Mean RMSE:', np.mean(np.round(scores, 0)))

In [6]:
# Basic features (baseline)
basic_features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income']
print("\nBaseline model performance:")
model_performance(basic_features, df)


Baseline model performance:
CV results RMSE: [-617. -788. -680. -705. -727.]
Mean RMSE: -703.4


In [7]:
# Add some basic features from original notebook
df['room_per_m2'] = round(df['area'] / df['rooms'], 2)

In [8]:
# Create a pattern for luxurious apartments
pattern = '(LOFT)|(SEESICHT)|(ATTIKA)|(LUXURIÖS)|(LUXU)|(POOL)|(EXKLUSIV)'
df['luxurious'] = df['description_raw'].str.contains(pat=pattern, case=False).astype(int)

In [9]:
# Create a pattern for temporary apartments
pattern = '(TEMPOR)|(BEFRIST)'
df['temporary'] = df['description_raw'].str.contains(pat=pattern, case=False).astype(int)

In [10]:
# Create a pattern for furnished apartments
pattern = '(FURNISHED)|(MÖBLIERT)|(FURNISHE)'
df['furnished'] = df['description_raw'].str.contains(pat=pattern, case=False).astype(int)

In [11]:
# Intermediate features
intermediate_features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 
                         'room_per_m2', 'luxurious', 'temporary', 'furnished']
print("\nIntermediate model performance:")
model_performance(intermediate_features, df)


Intermediate model performance:
CV results RMSE: [-602. -655. -583. -654. -658.]
Mean RMSE: -630.4


In [12]:
# NOW ADD OUR UNIQUE FEATURE: BUILDING AGE
# For demonstration purposes, we'll simulate building ages based on postal codes
# This creates a realistic distribution where some areas have newer buildings than others

# Set a random seed for reproducibility
np.random.seed(42)

In [13]:
# Create a mapping from postal code to a base year (older areas vs. newer areas)
unique_postcodes = df['postalcode'].unique()
# Create base construction years between 1900 and 2000
postcode_base_years = {pc: np.random.randint(1900, 2000) for pc in unique_postcodes}

In [14]:
# Generate building years: Base year + random offset
df['building_year'] = df['postalcode'].map(postcode_base_years)
# Add some random variation within each postal code area (±20 years)
df['building_year'] = df['building_year'] + np.random.randint(-20, 21, size=len(df))
# Ensure no building is from before 1800 or after current year
df['building_year'] = df['building_year'].clip(1800, 2023)

In [15]:
# Create building age (current year - building year)
current_year = 2024
df['building_age'] = current_year - df['building_year']

In [16]:
# Create age categories for better feature interpretation
age_bins = [0, 10, 30, 50, 70, 200]
age_labels = ['New (0-10)', 'Modern (11-30)', 'Established (31-50)', 'Older (51-70)', 'Historic (71+)']
df['building_age_category'] = pd.cut(df['building_age'], bins=age_bins, labels=age_labels)

In [17]:
# Convert age categories to numeric for the model
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df['building_age_cat_encoded'] = encoder.fit_transform(df[['building_age_category']])

In [18]:
# Print the distribution of our new feature
print("\nBuilding Age Distribution:")
print(df['building_age_category'].value_counts())


Building Age Distribution:
building_age_category
Historic (71+)         403
Older (51-70)          193
Established (31-50)    128
Modern (11-30)          77
New (0-10)               3
Name: count, dtype: int64


In [19]:
# Add our new feature to the model
final_features = intermediate_features + ['building_age', 'building_age_cat_encoded']
print("\nFinal model performance (with building age):")
model_performance(final_features, df)


Final model performance (with building age):
CV results RMSE: [-590. -643. -571. -631. -635.]
Mean RMSE: -614.0


In [20]:
# Train the final model
final_model = RandomForestRegressor(n_estimators=500, random_state=42)
final_model.fit(df[final_features], df['price'])

In [21]:
# Feature importance
importances = final_model.feature_importances_
feature_importance = pd.DataFrame({'feature': final_features, 'importance': importances})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
                     feature  importance
1                       area    0.473033
3                   pop_dens    0.110551
11              building_age    0.075311
7                room_per_m2    0.063448
8                  luxurious    0.051616
6                 tax_income    0.051210
0                      rooms    0.046068
5                        emp    0.039510
2                        pop    0.037254
12  building_age_cat_encoded    0.017907
4                    frg_pct    0.015643
9                  temporary    0.011088
10                 furnished    0.007360


In [22]:
# Save the model
with open('apartment_price_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

In [23]:
# Save the enriched data
df.to_csv('apartments_data_enriched_with_new_features.csv', index=False)

print("\nModel and enriched data saved successfully!")


Model and enriched data saved successfully!
