In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import platform
import socket
from platform import python_version
from datetime import datetime

# Suppress all warnings for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

# Print the directory the notebook is running in
working_dir = os.getcwd()
print(working_dir)

In [None]:
# Read merged_data.csv into a DataFrame and preview its first rows
data_file = 'merged_data.csv'
df = pd.read_csv(data_file)
df.head()

### Prep


In [None]:
# Predictor columns and the target column
feature_columns = [
    'Temperatur (°C)',
    'Niederschlag',
    'Richtung',
    'Luftfeuchtigkeit (%Hr)',
    'Luftdruck (hPa)',
    'holiday',
]
target_column = 'AnzFahrzeuge'

# Hold out 20 % of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df[target_column],
    test_size=0.20,
    random_state=42,
)

# Preview the training features ...
print('X_train:')
print(X_train.head(), '\n')

# ... and the training target
print('y_train:')
print(y_train.head())

In [None]:
# Fit an ordinary linear regression on the training split
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict the test split and keep the errors (actual minus predicted)
y_predicted = lin_reg.predict(X_test)
y_residuals = y_test - y_predicted

# Mean squared error of the test-split predictions
mse = mean_squared_error(y_test, y_predicted)

In [None]:
# Histogram of y_residuals in 20 bins, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 4))
n, bins, patches = ax.hist(x=y_residuals, bins=20, color='blue', alpha=0.5)

# Axis labels and title
ax.set_xlabel('residuals', fontsize=10, labelpad=10)
ax.set_ylabel('frequency', fontsize=10, labelpad=10)
ax.set_title('Histogram of model residuals', fontsize=12, pad=10)

plt.show()

### Linear Regression

In [None]:
# 5-fold cross-validation of a plain linear regression on the training split.
# The scorer returns negated RMSE, so the printed values are <= 0.
linear_model = LinearRegression()

scores = cross_val_score(
    linear_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Model repr, per-fold scores, then their mean - one item per line
print(f'--{linear_model}--', scores, np.mean(scores), sep='\n')

In [None]:
# Display the hyperparameters linear_model is currently configured with
linear_model.get_params()

### Polynomial Regression

In [None]:
# Degree-2 polynomial feature expansion followed by a linear regression.
# The step names double as prefixes for hyperparameter names (e.g. in a grid search).
pipeline_steps = [
    ("polynomial_features", PolynomialFeatures(degree=2)),
    ("linear_regression", LinearRegression()),
]
polynomial_model = Pipeline(pipeline_steps)

# 5-fold cross-validation on the training split; scores are negated RMSE
scores = cross_val_score(
    polynomial_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Pipeline repr, per-fold scores, then their mean - one item per line
print(f'--{polynomial_model}--', scores, np.mean(scores), sep='\n')

In [None]:
# Display the pipeline's parameters, including the nested "<step>__<param>" entries
polynomial_model.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the "polynomial_features" step of polynomial_model
param_grid = dict(
    polynomial_features__degree=[2, 4, 6, 8],
    polynomial_features__include_bias=[True, False],
)

# Exhaustive 5-fold search scored by negated RMSE; verbose=2 logs every fit
grid_search_pl = GridSearchCV(
    estimator=polynomial_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
)
grid_search_pl.fit(X_train, y_train)

# Winning pipeline, its parameter combination and its mean CV score
print(
    grid_search_pl.best_estimator_,
    grid_search_pl.best_params_,
    grid_search_pl.best_score_,
    sep='\n',
)

### Random Forest

In [None]:
# Random forest with default hyperparameters; the seed fixes its randomness
randomforest_model = RandomForestRegressor(random_state=42)

# 5-fold cross-validation on the training split; scores are negated RMSE
scores = cross_val_score(
    randomforest_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Model repr, per-fold scores, then their mean - one item per line
print(f'--{randomforest_model}--', scores, np.mean(scores), sep='\n')

In [None]:
# Display the hyperparameters randomforest_model is currently configured with
randomforest_model.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over tree depth and the number of features tried per split.
#
# An integer max_features cannot usefully exceed the number of columns in
# X_train. The earlier candidates (7 and 9) were both above that count, so
# they were either rejected at fit time or behaved exactly like "use every
# feature" (depending on the scikit-learn version) - the search compared
# nothing on that axis. Deriving the candidates from the data keeps them valid.
n_features = X_train.shape[1]
param_grid = {
    'max_depth': [25, 30, 35],
    # half of the features vs. all of them (set removes duplicates for tiny inputs)
    'max_features': sorted({max(1, n_features // 2), n_features}),
}

# 5-fold search scored by negated RMSE (closer to 0 is better); verbose=2 logs every fit
grid_search_rf = GridSearchCV(
    randomforest_model,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# Winning estimator, its parameter combination and its mean CV score
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_params_)
print(grid_search_rf.best_score_)

### Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Single decision tree with default hyperparameters and a fixed seed
decisiontree_model = DecisionTreeRegressor(random_state=42)

# 5-fold cross-validation on the training split; scores are negated RMSE
scores = cross_val_score(
    decisiontree_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Model repr, per-fold scores, then their mean - one item per line
print(f'--{decisiontree_model}--', scores, np.mean(scores), sep='\n')


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Fresh decision tree with a fixed seed as the base estimator for the search
decisiontree_model = DecisionTreeRegressor(random_state=42)

# An integer max_features cannot usefully exceed the number of columns in
# X_train. The earlier candidate 9 was above that count, so it was either
# rejected at fit time or behaved exactly like "use every feature"
# (depending on the scikit-learn version). Deriving the candidates from the
# data keeps every grid point valid and distinct.
n_features = X_train.shape[1]
param_grid = {
    'max_depth': [90, 100, 110],
    # half of the features vs. all of them (set removes duplicates for tiny inputs)
    'max_features': sorted({max(1, n_features // 2), n_features}),
}

# 5-fold search scored by negated RMSE (closer to 0 is better); verbose=2 logs every fit
grid_search_dt = GridSearchCV(
    decisiontree_model,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
grid_search_dt.fit(X_train, y_train)

# Winning estimator, its parameter combination and its mean CV score
print(grid_search_dt.best_estimator_)
print(grid_search_dt.best_params_)
print(grid_search_dt.best_score_)


### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Gradient boosting with default hyperparameters and a fixed seed
gradientboost_model = GradientBoostingRegressor(random_state=42)

# 5-fold cross-validation on the training split; scores are negated RMSE
scores = cross_val_score(
    gradientboost_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Model repr, per-fold scores, then their mean - one item per line
print(f'--{gradientboost_model}--', scores, np.mean(scores), sep='\n')


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Fresh gradient-boosting regressor with a fixed seed as the base estimator
gradientboost_model = GradientBoostingRegressor(random_state=42)

# An integer max_features cannot usefully exceed the number of columns in
# X_train. The earlier candidates 7 and 9 were above that count, so they
# were either rejected at fit time or behaved exactly like "use every
# feature" (depending on the scikit-learn version). Deriving the candidates
# from the data keeps every grid point valid and distinct.
n_features = X_train.shape[1]
param_grid = {
    'max_depth': [10, 15, 20],
    # half, all-but-one, and all features (set removes duplicates for tiny inputs)
    'max_features': sorted({
        max(1, n_features // 2),
        max(1, n_features - 1),
        n_features,
    }),
}

# 5-fold search scored by negated RMSE (closer to 0 is better); verbose=2 logs every fit
grid_search_gb = GridSearchCV(
    gradientboost_model,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=2,
)
grid_search_gb.fit(X_train, y_train)

# Winning estimator, its parameter combination and its mean CV score
print(grid_search_gb.best_estimator_)
print(grid_search_gb.best_params_)
print(grid_search_gb.best_score_)


### Feature Analysis

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Best estimator found by the gradient-boosting grid search (grid_search_gb)
best_model = grid_search_gb.best_estimator_

# Per-feature importances of that estimator
feature_importance = best_model.feature_importances_

# Indices that order the importances from largest to smallest
sorted_indices = np.argsort(feature_importance)[::-1]

# Column names and importance values in that same order
feature_names = X_train.columns[sorted_indices]
sorted_importance = feature_importance[sorted_indices]

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(10, 6))
bar_positions = range(len(feature_names))
plt.barh(bar_positions, sorted_importance, align='center')
plt.yticks(bar_positions, feature_names)
# barh places position 0 at the bottom of the axis; flip it so the
# descending ranking reads top-down (most important feature first)
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Gradient Boosting Regressor - Feature Importance')
plt.show()


### Testing

In [None]:
# Disabled experiment: the whole cell is one bare triple-quoted string, so none
# of the code inside it runs - the notebook only echoes the text as cell output.
# The text describes fitting a default decision tree and printing train/test RMSE.
# NOTE(review): if this is re-enabled, `mean_squared_error(..., squared=False)` is
# deprecated in recent scikit-learn releases - confirm against the installed version.
'''from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Erstelle das Decision Tree-Modell
model = DecisionTreeRegressor()

# Trainiere das Modell mit den Trainingsdaten
model.fit(X_train, y_train)

# Mache Vorhersagen auf den Trainingsdaten
y_train_pred = model.predict(X_train)

# Berechne den RMSE auf den Trainingsdaten
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)

# Mache Vorhersagen auf den Testdaten
y_test_pred = model.predict(X_test)

# Berechne den RMSE auf den Testdaten
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

# Ausgabe der RMSE-Werte
print("Train RMSE:", rmse_train)
print("Test RMSE:", rmse_test)'''

In [None]:
# Disabled experiment: the whole cell is one bare triple-quoted string, so none
# of the code inside it runs - the notebook only echoes the text as cell output.
# The text describes a decision-tree grid search (depth, split/leaf sizes,
# max_features) followed by a test-set RMSE for the best estimator.
'''from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Definieren Sie den Entscheidungsbaum-Regressor
tree = DecisionTreeRegressor()

# Definieren Sie den Parametergitter
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [None, 'sqrt', 'log2']
}

# Erstellen Sie die Grid Search
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, scoring='neg_root_mean_squared_error')

# Führen Sie die Grid Search mit den Trainingsdaten durch
grid_search.fit(X_train, y_train)

# Zeigen Sie die besten Parameterkombinationen an
print("Beste Parameterkombination: ", grid_search.best_params_)

# Rufen Sie das Modell mit den besten Parametern ab
best_model = grid_search.best_estimator_

# Bewertung des besten Modells auf den Testdaten
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE mit besten Parametern: ", test_rmse)'''

In [None]:
# Disabled experiment: the whole cell is one bare triple-quoted string, so none
# of the code inside it runs - the notebook only echoes the text as cell output.
# The text describes fitting a default gradient-boosting regressor and printing
# train/test RMSE.
'''from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Definieren Sie den Gradient Boosting-Regressor
gb_model = GradientBoostingRegressor()

# Trainieren Sie das Modell mit den Trainingsdaten
gb_model.fit(X_train, y_train)

# Vorhersage auf den Trainingsdaten
y_train_pred = gb_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("Train RMSE: ", train_rmse)

# Vorhersage auf den Testdaten
y_test_pred = gb_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Test RMSE: ", test_rmse)'''

In [None]:
# Disabled experiment: the whole cell is one bare triple-quoted string, so none
# of the code inside it runs - the notebook only echoes the text as cell output.
# The text describes a gradient-boosting grid search (n_estimators,
# learning_rate, max_depth), a refit with the best parameters, and train/test RMSE.
'''from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Definieren Sie den Gradient Boosting-Regressor
gb_model = GradientBoostingRegressor()

# Definieren Sie die Parameter, die Sie optimieren möchten
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1.0],
    'max_depth': [3, 5, 7]
}

# Initialisieren Sie die GridSearchCV mit dem Gradient Boosting-Regressor und dem Parametergitter
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Führen Sie die Gittersuche durch, um die besten Parameter zu finden
grid_search.fit(X_train, y_train)

# Rufen Sie die besten Parameter ab
best_params = grid_search.best_params_
print("Beste Parameter: ", best_params)

# Trainieren Sie das Modell mit den besten Parametern
best_gb_model = GradientBoostingRegressor(**best_params)
best_gb_model.fit(X_train, y_train)

# Vorhersage auf den Trainingsdaten
y_train_pred = best_gb_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("Train RMSE: ", train_rmse)

# Vorhersage auf den Testdaten
y_test_pred = best_gb_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Test RMSE: ", test_rmse)'''