### Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

### Trainieren der Modelle

In [2]:
# Load the training data (file name suggests demolished dwellings with a
# known demolition year — TODO confirm against the data description).
train_data = pd.read_csv('./new_abgebrochene_wohnungen.csv')

# Prepare features and target. 'EGID' is dropped as a non-predictive column
# (presumably a building identifier), 'Year of Demolition' is the target.
# NOTE: 'Date of Construction' deliberately stays in X as a feature.
X = train_data.drop(['EGID', 'Year of Demolition'], axis=1)
y = train_data['Year of Demolition']
construction_year = train_data['Date of Construction']

# Split features, target and construction year in ONE call so that the three
# validation pieces are row-aligned by construction. Splitting
# `construction_year` in a separate call only stays aligned as long as both
# calls use exactly the same `test_size` / `random_state`, which is easy to
# break when tuning the split.
(
    X_train, X_valid,
    y_train, y_valid,
    construction_year_train, construction_year_valid,
) = train_test_split(X, y, construction_year, test_size=0.2, random_state=42)

# Guard the invariant that later cells rely on when they zip predictions
# with `construction_year_valid`.
assert construction_year_valid.index.equals(X_valid.index)
assert y_valid.index.equals(X_valid.index)

# Initialise the models (fixed seeds for reproducibility where supported)
random_forest = RandomForestRegressor(random_state=42)
gradient_boosting = GradientBoostingRegressor(random_state=42)
linear_regression = LinearRegression()

# Train every model on the same training split
for model in (random_forest, gradient_boosting, linear_regression):
    model.fit(X_train, y_train)

# Predictions on the validation set
rf_preds = random_forest.predict(X_valid)
gb_preds = gradient_boosting.predict(X_valid)
lr_preds = linear_regression.predict(X_valid)

# Round the predictions to whole years
rf_preds_rounded = [round(num) for num in rf_preds]
gb_preds_rounded = [round(num) for num in gb_preds]
lr_preds_rounded = [round(num) for num in lr_preds]

# R^2 and MSE per model, computed on the ROUNDED predictions
rf_r2 = r2_score(y_valid, rf_preds_rounded)
gb_r2 = r2_score(y_valid, gb_preds_rounded)
lr_r2 = r2_score(y_valid, lr_preds_rounded)
rf_mse = mean_squared_error(y_valid, rf_preds_rounded)
gb_mse = mean_squared_error(y_valid, gb_preds_rounded)
lr_mse = mean_squared_error(y_valid, lr_preds_rounded)

print(f"Random Forest R^2: {rf_r2}, MSE: {rf_mse}")
print(f"Gradient Boosting R^2: {gb_r2}, MSE: {gb_mse}")
print(f"Linear Regression R^2: {lr_r2}, MSE: {lr_mse}")


Random Forest R^2: 0.2771243285063647, MSE: 28.567441860465117
Gradient Boosting R^2: 0.21168654385145402, MSE: 31.153488372093022
Linear Regression R^2: 0.027809507474016226, MSE: 38.42015503875969


### Abbruchjahr mind. 60 Jahre nach Baujahr

In [3]:
def _clamp_to_min_age(preds, years):
    """Raise each prediction to at least `construction year + 60`.

    `preds` and `years` are iterated pairwise; the result is a plain list
    with one (unrounded) value per pair.
    """
    return [max(pred, year + 60) for pred, year in zip(preds, years)]


# Correct the validation predictions: a demolition year must not lie fewer
# than 60 years after the construction year.
rf_preds_corrected = _clamp_to_min_age(rf_preds, construction_year_valid)
gb_preds_corrected = _clamp_to_min_age(gb_preds, construction_year_valid)
lr_preds_corrected = _clamp_to_min_age(lr_preds, construction_year_valid)

# Round the corrected predictions to whole years
rf_preds_rounded_corrected = list(map(round, rf_preds_corrected))
gb_preds_rounded_corrected = list(map(round, gb_preds_corrected))
lr_preds_rounded_corrected = list(map(round, lr_preds_corrected))

# R^2 and MSE per model on the corrected, rounded predictions
rf_r2_corrected = r2_score(y_valid, rf_preds_rounded_corrected)
rf_mse_corrected = mean_squared_error(y_valid, rf_preds_rounded_corrected)

gb_r2_corrected = r2_score(y_valid, gb_preds_rounded_corrected)
gb_mse_corrected = mean_squared_error(y_valid, gb_preds_rounded_corrected)

lr_r2_corrected = r2_score(y_valid, lr_preds_rounded_corrected)
lr_mse_corrected = mean_squared_error(y_valid, lr_preds_rounded_corrected)

print(f"Random Forest R^2 (corrected): {rf_r2_corrected}, MSE (corrected): {rf_mse_corrected}")
print(f"Gradient Boosting R^2 (corrected): {gb_r2_corrected}, MSE (corrected): {gb_mse_corrected}")
print(f"Linear Regression R^2 (corrected): {lr_r2_corrected}, MSE (corrected): {lr_mse_corrected}")


Random Forest R^2 (corrected): 0.00984157859395085, MSE (corrected): 39.13023255813953
Gradient Boosting R^2 (corrected): -0.026800442309937944, MSE (corrected): 40.57829457364341
Linear Regression R^2 (corrected): -0.1686215403480087, MSE (corrected): 46.18294573643411


### Prediction

In [4]:
# Load the data to predict on (file name suggests existing dwellings —
# TODO confirm) and keep the construction year for the correction below.
test_data = pd.read_csv('./new_bestehende_wohnungen.csv')
construction_year_test = test_data['Date of Construction']

# Feature matrix: every column except 'EGID'. Assumes the remaining columns
# match the training features in name and order — verify against the CSVs.
prediction_data = test_data.drop(columns=['EGID'])

# Use the best model from the validation run (Random Forest).
# NOTE(review): this is the model fitted on the 80 % training split only.
final_predictions = random_forest.predict(prediction_data)

# Enforce "demolition at least 60 years after construction", then round to
# whole years.
# NOTE(review): nothing here prevents a predicted year from lying in the
# past relative to today — confirm whether that is acceptable.
final_predictions_corrected = [
    max(pred, year + 60)
    for pred, year in zip(final_predictions, construction_year_test)
]
final_predictions_rounded = list(map(round, final_predictions_corrected))

# Attach the predictions to the loaded frame and write the result to disk
test_data['Predicted Demolition Year'] = final_predictions_rounded
test_data.to_csv('./prediction_bestehende_wohnungen.csv', index=False)

In [1]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Print a short banner describing the runtime environment (OS family,
# platform name and release, current timestamp, Python version) so the
# notebook's outputs can be traced back to the machine that produced them.
separator = '-----------------------------------'
system_name = platform.system()
system_release = platform.release()
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(system_name, '|', system_release)
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(separator)

-----------------------------------
POSIX
Darwin | 23.4.0
Datetime: 2024-05-18 16:28:22
Python Version: 3.10.9
-----------------------------------
