# Model Training durch Gradient Boosting

In [9]:
import pandas as pd

# Read the life-expectancy CSV into a DataFrame and preview the first rows.
DATA_PATH = '../data/life_expectancy_cleaned.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,1.0,Developing,0.544592,0.362881,0.034444,0.0,0.003659,0.653061,0.005439,...,0.03125,0.452118,0.649485,0.0,0.004889,0.026074,0.619565,0.603509,0.505274,0.487923
1,Afghanistan,0.933333,Developing,0.447818,0.373961,0.035556,0.0,0.003774,0.622449,0.002319,...,0.572917,0.453279,0.618557,0.0,0.005127,0.000253,0.630435,0.610526,0.50211,0.483092
2,Afghanistan,0.866667,Developing,0.447818,0.369806,0.036667,0.0,0.003759,0.642857,0.002027,...,0.614583,0.450377,0.639175,0.0,0.005287,0.024525,0.637681,0.617544,0.495781,0.478261
3,Afghanistan,0.8,Developing,0.440228,0.375346,0.038333,0.0,0.004014,0.673469,0.013135,...,0.666667,0.473012,0.670103,0.0,0.005608,0.002857,0.644928,0.62807,0.488397,0.47343
4,Afghanistan,0.733333,Developing,0.434535,0.379501,0.039444,0.0,0.000364,0.683673,0.0142,...,0.677083,0.435287,0.680412,0.0,0.000519,0.002302,0.655797,0.635088,0.478903,0.458937


In [10]:
# Imports for model training, evaluation, and persistence
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from joblib import dump


# One-hot encode the categorical (non-numeric) columns; drop_first=True omits
# the first dummy level of every encoded column.
# NOTE(review): the df preview shows an 'Unnamed: 0' column (presumably a row
# index written by an earlier to_csv). It is still passed to the model as a
# feature here -- confirm whether it should be dropped before training.
df_encoded = pd.get_dummies(df, drop_first=True)

# Split the data into features and target variable
X = df_encoded.drop('Life expectancy', axis=1)
y = df_encoded['Life expectancy']

# Split the data into training and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialise the model; the seed is fixed so that a Restart & Run All
# reproduces the same fitted model and metrics.
gb = GradientBoostingRegressor(random_state=42)

# Train the model
gb.fit(X_train, y_train)

# Predict on the test data
gb_preds = gb.predict(X_test)

# Compute the RMSE as the square root of the MSE. The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the root explicitly works on every version.
gb_rmse = mean_squared_error(y_test, gb_preds) ** 0.5

# Compute R^2 on the test set
gb_r2 = gb.score(X_test, y_test)
print('Gradient Boosting R-Squared:', gb_r2)

# Print the RMSE of the model
print('Gradient Boosting RMSE:', gb_rmse)

# Print the number of training and test samples
print('No. of training data:', len(X_train))
print('No. of test data:', len(X_test))

# Persist the fitted model; dump() returns the list of written file names,
# which is displayed as the cell output.
dump(gb, '../Models/gradient_boosting_model.joblib')

Random Forest R-Squared: 0.948471726443684
Gradient Boosting RMSE: 0.04009217012856987
No. of training data: 2350
No. of test data: 588


['../Models/gradient_boosting_model.joblib']