In [278]:
import wetter_api as wt
import waldbrand
import model_eval as me
from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, r2_score
import logging
import pandas as pd
import common_paths

In [279]:
parameter_list = [
    wt.DwdObservationParameter.MONTHLY.PRECIPITATION_HEIGHT,
    wt.DwdObservationParameter.MONTHLY.CLIMATE_SUMMARY.WIND_FORCE_BEAUFORT,
    wt.DwdObservationParameter.MONTHLY.TEMPERATURE_AIR_MAX_200,
]
# Dictionary with short names as keys and full names as values
shortname_to_state = {
    "BW": "Baden-Württemberg",
    "BY": "Bayern",
    "BE": "Berlin",
    "BB": "Brandenburg",
    "HB": "Bremen",
    "HH": "Hamburg",
    "HE": "Hessen",
    "MV": "Mecklenburg-Vorpommern",
    "NI": "Niedersachsen",
    "NW": "Nordrhein-Westfalen",
    "RP": "Rheinland-Pfalz",
    "SL": "Saarland",
    "SN": "Sachsen",
    "ST": "Sachsen-Anhalt",
    "SH": "Schleswig-Holstein",
    "TH": "Thüringen",
}

# Example usage
shortname = "BB"
state = shortname_to_state.get(shortname)

In [280]:
def fetch_and_process_weather_data_test(parameter, state):
    try:
        weather_data = wt.fetch_observation_data(parameter, state)
        weather_data["date"] = pd.to_datetime(weather_data["date"])
        weather_data["Year"] = weather_data["date"].dt.year
        weather_data["Month"] = weather_data["date"].dt.month
        weather_data.sort_values(by=["Month", "Year"], inplace=True)
        weather_data.drop(
            columns=["dataset", "date", "quality", "parameter"],
            inplace=True,
        )
        weather_data["Year"] = weather_data["Year"].astype(int)
        weather_data["Month"] = weather_data["Month"].astype(int)
        parameter_column_name = f"{parameter}"
        weather_data.rename(columns={"value": parameter_column_name}, inplace=True)
        # weather_data = weather_data.groupby(["Year", "Month"]).mean().reset_index()
        return weather_data
    except Exception as e:
        logging.error(
            f"Error fetching or processing weather data for parameter {parameter}: {e}"
        )
        return pd.DataFrame()

In [281]:
def prepare_data_test(parameters, state):
    weather_data_list = [
        fetch_and_process_weather_data_test(param, state) for param in parameters
    ]
    combined_weather_data = weather_data_list[0]

    for additional_data in weather_data_list[1:]:
        combined_weather_data = pd.merge(
            combined_weather_data, additional_data, on=["Year", "Month", "station_id" ], how ='outer'
        )

    wildfire_obj = waldbrand.WildFire()
    wildfire_df = wildfire_obj.get_montly_numbers()
    state_wildfire_data = wildfire_df.loc[[state]]
    state_wildfire_data = wildfire_obj.melt_and_map_months(state_wildfire_data)
    state_wildfire_data["Year"] = state_wildfire_data["Year"].astype(int)
    state_wildfire_data["Month"] = state_wildfire_data["Month"].astype(int)
    merged_data = pd.merge(
        combined_weather_data, state_wildfire_data, on=["Year", "Month"]
    )
    weather_columns = combined_weather_data.columns.tolist()
    weather_columns.remove("Year")
    weather_columns.remove("Month")
    new_column_order = ["Year", "Month"]+ weather_columns + ["nFires"] 
    merged_data = merged_data[new_column_order]
    #merged_data.dropna(inplace=True)
    return merged_data

In [282]:
# shortname = "BW"
# state = shortname_to_state.get(shortname)
merged_data = prepare_data_test(parameter_list, state)

#merged_data = pd.read_csv(common_paths.DATA.joinpath("dwd/Brandenburg.csv"))

In [283]:
merged_data

Unnamed: 0,Year,Month,station_id,CLIMATE_SUMMARY.PRECIPITATION_HEIGHT,CLIMATE_SUMMARY.WIND_FORCE_BEAUFORT,CLIMATE_SUMMARY.TEMPERATURE_AIR_MAX_200,nFires
0,1995,1,00164,42.5,3.46,285.35,0.0
1,1995,1,00427,43.5,3.34,285.75,0.0
2,1995,1,00650,71.3,2.42,285.65,0.0
3,1995,1,00880,47.1,2.74,286.15,0.0
4,1995,1,01001,52.0,3.39,285.75,0.0
...,...,...,...,...,...,...,...
8376,2022,12,05745,53.9,,288.75,0.0
8377,2022,12,05825,45.4,2.29,289.95,0.0
8378,2022,12,06170,42.3,,291.25,0.0
8379,2022,12,06265,58.2,,290.95,0.0


In [284]:
mdf = merged_data.where(merged_data.Year>=1995).where(merged_data.Year<=2022)#.dropna()

In [285]:
mdf#.describe()

Unnamed: 0,Year,Month,station_id,CLIMATE_SUMMARY.PRECIPITATION_HEIGHT,CLIMATE_SUMMARY.WIND_FORCE_BEAUFORT,CLIMATE_SUMMARY.TEMPERATURE_AIR_MAX_200,nFires
0,1995,1,00164,42.5,3.46,285.35,0.0
1,1995,1,00427,43.5,3.34,285.75,0.0
2,1995,1,00650,71.3,2.42,285.65,0.0
3,1995,1,00880,47.1,2.74,286.15,0.0
4,1995,1,01001,52.0,3.39,285.75,0.0
...,...,...,...,...,...,...,...
8376,2022,12,05745,53.9,,288.75,0.0
8377,2022,12,05825,45.4,2.29,289.95,0.0
8378,2022,12,06170,42.3,,291.25,0.0
8379,2022,12,06265,58.2,,290.95,0.0


In [286]:
mdf.groupby(['station_id']).count()

Unnamed: 0_level_0,Year,Month,CLIMATE_SUMMARY.PRECIPITATION_HEIGHT,CLIMATE_SUMMARY.WIND_FORCE_BEAUFORT,CLIMATE_SUMMARY.TEMPERATURE_AIR_MAX_200,nFires
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
96,44,44,44,37,44,44
164,336,336,336,333,336,336
303,299,299,296,246,272,299
427,336,336,335,335,336,336
650,76,76,76,76,76,76
879,13,13,0,10,0,13
880,336,336,336,327,336,336
1001,336,336,335,320,336,336
1106,0,0,0,0,0,0
1426,0,0,0,0,0,0


In [287]:
mdf.rename(columns={'CLIMATE_SUMMARY.PRECIPITATION_HEIGHT':'pr', 'CLIMATE_SUMMARY.WIND_FORCE_BEAUFORT':'sfcWind', 'CLIMATE_SUMMARY.TEMPERATURE_AIR_MAX_200':'tasmax'}, inplace=True)

In [288]:
mdf['sfcWind'] =  0.836 * (mdf['sfcWind'] ** 1.5)

In [289]:
dffbb = pd.read_csv(common_paths.DATA.joinpath("dwd/future_Brandenburg.csv"))

In [290]:
dffbb.drop(columns=['Unnamed: 0', 'Bundesland'], inplace = True)

In [291]:
dffbb['pr'] = dffbb['pr'] * 3600 * 24 * 30

In [292]:
merge_all = pd.merge(mdf, dffbb, on=["Year", "Month", 'sfcWind', 'tasmax', 'pr'], how='outer')

Unnamed: 0,Year,Month,station_id,pr,sfcWind,tasmax,nFires
0,1995,1,00164,42.500000,5.380473,285.35000,0.0
1,1995,1,00427,43.500000,5.103005,285.75000,0.0
2,1995,1,00650,71.300000,3.147236,285.65000,0.0
3,1995,1,00880,47.100000,3.791684,286.15000,0.0
4,1995,1,01001,52.000000,5.218021,285.75000,0.0
...,...,...,...,...,...,...,...
78771,2054,1,,41.612901,4.516129,274.10160,
78772,2054,1,,51.967742,5.196774,273.57257,
78773,2054,1,,51.967742,5.316129,273.62740,
78774,2054,1,,45.483872,5.216129,273.60483,


In [293]:
merge_all.drop(columns=["station_id"], inplace=True)

In [294]:
merge_all

Unnamed: 0,Year,Month,pr,sfcWind,tasmax,nFires
0,1995,1,42.500000,5.380473,285.35000,0.0
1,1995,1,43.500000,5.103005,285.75000,0.0
2,1995,1,71.300000,3.147236,285.65000,0.0
3,1995,1,47.100000,3.791684,286.15000,0.0
4,1995,1,52.000000,5.218021,285.75000,0.0
...,...,...,...,...,...,...
78771,2054,1,41.612901,4.516129,274.10160,
78772,2054,1,51.967742,5.196774,273.57257,
78773,2054,1,51.967742,5.316129,273.62740,
78774,2054,1,45.483872,5.216129,273.60483,


In [301]:
mdf.drop(columns=["station_id"], inplace=True)

KeyError: "['station_id'] not found in axis"

In [302]:
mdf

Unnamed: 0,Year,Month,pr,sfcWind,tasmax,nFires
0,1995,1,42.5,5.380473,285.35,0.0
1,1995,1,43.5,5.103005,285.75,0.0
2,1995,1,71.3,3.147236,285.65,0.0
3,1995,1,47.1,3.791684,286.15,0.0
4,1995,1,52.0,5.218021,285.75,0.0
...,...,...,...,...,...,...
8376,2022,12,53.9,,288.75,0.0
8377,2022,12,45.4,2.897073,289.95,0.0
8378,2022,12,42.3,,291.25,0.0
8379,2022,12,58.2,,290.95,0.0


In [303]:
mdf.dropna(inplace=True)

In [304]:
mdf

Unnamed: 0,Year,Month,pr,sfcWind,tasmax,nFires
0,1995,1,42.5,5.380473,285.35,0.0
1,1995,1,43.5,5.103005,285.75,0.0
2,1995,1,71.3,3.147236,285.65,0.0
3,1995,1,47.1,3.791684,286.15,0.0
4,1995,1,52.0,5.218021,285.75,0.0
...,...,...,...,...,...,...
8370,2022,12,37.2,2.897073,290.65,0.0
8371,2022,12,40.7,2.897073,290.65,0.0
8373,2022,12,58.8,4.409306,290.45,0.0
8377,2022,12,45.4,2.897073,289.95,0.0


In [305]:
test_size=0.2
random_state=42
str_parameter_list = [str(param) for param in parameter_list]
feature_list = ["sfcWind", "pr", "tasmax"]
X = mdf[feature_list]
y = mdf["nFires"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

print(y_train)
# results = evaluate_model(X_train, X_test, y_train, y_test)

6367      0.0
6519      0.0
1895      4.0
3117     99.0
4088    165.0
        ...  
4762    116.0
6554      9.0
6590      1.0
6795      5.0
1034      1.0
Name: nFires, Length: 5296, dtype: float64


In [None]:
# How to define this?
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": list(range(5, 21, 5)),  # Smaller range for max depth
    "min_samples_leaf": [1, 2, 4],
    "min_samples_split": [2, 5, 10]
}
rf = RandomForestRegressor()

grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, cv=10, scoring="neg_mean_squared_error"
)

#searching for the right parameters using random Forest on a 10-fold Cross Validation (cv) and the nmse
grid_search.fit(X_train, y_train)

# saving best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
print(best_params)
print(best_model)
#grid_search.cv_results_
#grid_search.best_index_

In [45]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


#predict 'Anzahl Waldbrände' with the model
y_pred = best_model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"MSE: {mse}")
print(f"R2 Score: {r2}")
print(f"MAE: {mae}")

MSE: 2.1759962049335927e-06
R2 Score: 0.9988510698669202
MAE: 0.00011100569259962064
