# Megatutorial 4: Regression

In [None]:
from pandas import read_csv

# Transformer/Funktionen zur Vorverarbeitung
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Estimators für die Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Metriken für die Regression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

## Daten laden

In [2]:
# Read the bike-sharing CSV; the file's first column becomes the row index.
data = read_csv(
    filepath_or_buffer="../../data/bikesharing.csv",
    index_col=0,
)

## Preprocessing

### Fehlende Werte

In [5]:
# Replace missing entries in the "hum" column with that column's median.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# hold-out split further down, so the median also reflects rows that later
# end up in the test set. Consider fitting on the training portion only.
imputer_engine = SimpleImputer(strategy="median")
hum_frame = data[["hum"]]  # double brackets: the imputer expects 2-D input
data["hum"] = imputer_engine.fit_transform(hum_frame)

### Target/Feature auswählen

In [7]:
# Columns used as model inputs.
features = [
    'season', 'holiday', 'weekday', 'weathersit', 'temp',
    'atemp', 'hum', 'windspeed', 'month'
]

# Column to predict. Kept as a one-element list, so `y` is a
# single-column DataFrame rather than a Series.
target = [
    'cnt'
]

# Take an explicit copy of the feature columns: X is modified in a later
# cell (label encoding), and writing into a bare selection of `data`
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
# `data` is affected. With .copy(), X is an independent frame.
X = data[features].copy()
y = data[target]

### Label Encoding

In [8]:
# One encoder per categorical column, kept under its own name so the fitted
# mapping (classes_) of each column stays available afterwards.
season_encoder_engine = LabelEncoder()
holiday_encoder_engine = LabelEncoder()
weekday_encoder_engine = LabelEncoder()
weathersit_encoder_engine = LabelEncoder()

# Replace the categorical columns with their integer codes.
# DataFrame.assign returns a new frame in which the named columns are
# overwritten (column order is preserved). Unlike `X.loc[:, col] = ...`,
# which writes into the existing column in place and therefore keeps its
# original dtype, the encoded columns get the integer dtype produced by the
# encoder, and no write into a selection of `data` takes place.
X = X.assign(
    season=season_encoder_engine.fit_transform(X["season"]),
    holiday=holiday_encoder_engine.fit_transform(X["holiday"]),
    weekday=weekday_encoder_engine.fit_transform(X["weekday"]),
    weathersit=weathersit_encoder_engine.fit_transform(X["weathersit"]),
)

### Hold-Out-Resampling

In [9]:
# Hold out 20 % of the rows for evaluation.
# random_state pins the shuffle so that the split, and with it every metric
# reported below, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Modelling

In [20]:
# Fit a linear regression on the training portion (fit() returns the
# estimator itself, so the fitted model can be bound in one step).
linear_model = LinearRegression().fit(X_train, y_train)
predictions = linear_model.predict(X_test)

# Score the predictions on the held-out rows.
r2 = r2_score(y_test, predictions)
rmse = root_mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("R²", r2, "RMSE", rmse, "MAE", mae)

R² 0.5159052210536125 RMSE 1383.1656222283655 MAE 1194.010257619289


In [27]:
# Regression tree limited to depth 5.
# random_state is fixed because the tree permutes the features at every
# split; without a seed, ties between equally good splits can be resolved
# differently from run to run, which changes the fitted tree and its scores.
tree_model = DecisionTreeRegressor(max_depth=5, random_state=42)
tree_model.fit(X_train, y_train)

predictions = tree_model.predict(X_test)

# Score the predictions on the held-out rows.
print(
    "R²", r2_score(y_test, predictions),
    "RMSE", root_mean_squared_error(y_test, predictions),
    "MAE", mean_absolute_error(y_test, predictions)
)

R² 0.44855730624829937 RMSE 1476.247517831682 MAE 1225.511157838044
