In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Laden Sie Ihre Trainingsdaten
train_data = pd.read_csv('/Users/mariusaffolter/Documents/200_Studium/220_Semester/24FS/Einsatz Geodaten Marketing/Project/apartment_analysis_egm/new_abgebrochene_wohnungen.csv')

# Vorbereitung der Daten
X = train_data.drop(['EGID', 'Year of Demolition'], axis=1)
y = train_data['Year of Demolition']
construction_year = train_data['Date of Construction']

# Daten in Trainings- und Validierungssätze aufteilen
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
construction_year_train, construction_year_valid = train_test_split(construction_year, test_size=0.2, random_state=42)

# Initialisierung der Modelle
random_forest = RandomForestRegressor(random_state=42)
gradient_boosting = GradientBoostingRegressor(random_state=42)
linear_regression = LinearRegression()

# Training der Modelle
random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)
linear_regression.fit(X_train, y_train)

# Vorhersagen auf dem Validierungssatz
rf_preds = random_forest.predict(X_valid)
gb_preds = gradient_boosting.predict(X_valid)
lr_preds = linear_regression.predict(X_valid)

# Korrektur der Vorhersagen
rf_preds_corrected = [max(pred, year + 100) for pred, year in zip(rf_preds, construction_year_valid)]
gb_preds_corrected = [max(pred, year + 100) for pred, year in zip(gb_preds, construction_year_valid)]
lr_preds_corrected = [max(pred, year + 100) for pred, year in zip(lr_preds, construction_year_valid)]

# Rundung der korrigierten Vorhersagen auf ganze Zahlen
rf_preds_rounded = [round(num) for num in rf_preds_corrected]
gb_preds_rounded = [round(num) for num in gb_preds_corrected]
lr_preds_rounded = [round(num) for num in lr_preds_corrected]

# Berechnung des R^2 und MSE für jedes Modell mit den korrigierten Vorhersagen
rf_r2 = r2_score(y_valid, rf_preds_rounded)
gb_r2 = r2_score(y_valid, gb_preds_rounded)
lr_r2 = r2_score(y_valid, lr_preds_rounded)
rf_mse = mean_squared_error(y_valid, rf_preds_rounded)
gb_mse = mean_squared_error(y_valid, gb_preds_rounded)
lr_mse = mean_squared_error(y_valid, lr_preds_rounded)

print(f"Random Forest R^2: {rf_r2}, MSE: {rf_mse}")
print(f"Gradient Boosting R^2: {gb_r2}, MSE: {gb_mse}")
print(f"Linear Regression R^2: {lr_r2}, MSE: {lr_mse}")

Random Forest R^2: -23.647565288158354, MSE: 974.0511627906977
Gradient Boosting R^2: -23.619711075265673, MSE: 972.9503875968992
Linear Regression R^2: -23.612178668398922, MSE: 972.6527131782946


In [23]:
# Laden Sie Ihre Testdaten
test_data = pd.read_csv('/Users/mariusaffolter/Documents/200_Studium/220_Semester/24FS/Einsatz Geodaten Marketing/Project/apartment_analysis_egm/new_bestehende_wohnungen.csv')
prediction_data = test_data.drop(['EGID'], axis=1)
construction_year_test = test_data['Date of Construction']

# Das beste Modell verwenden (Random Forest)
final_predictions = random_forest.predict(prediction_data)

# Korrektur und Rundung der Vorhersagen für den Testdatensatz
final_predictions_corrected = [max(pred, year + 100) for pred, year in zip(final_predictions, construction_year_test)]
final_predictions_rounded = [round(num) for num in final_predictions_corrected]

# Die Vorhersagen speichern oder weiter verwenden
test_data['Predicted Demolition Year'] = final_predictions_rounded
test_data.to_csv('/Users/mariusaffolter/Documents/200_Studium/220_Semester/24FS/Einsatz Geodaten Marketing/Project/apartment_analysis_egm/prediction_bestehende_wohnungen.csv', index=False)