## Modelling

Fit a logistic regression classifier that predicts `classificatie`.

In [68]:
# import libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler

## Load data and select features

In [61]:
# Read the preprocessed dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/preprocessed_df.csv')

In [62]:
# Target: the class label the model has to predict.
y = df['classificatie']

# Features: everything except the target and the columns listed below.
# NOTE(review): judging by their names, the other dropped columns are station
# identifiers and date/time fields -- confirm against the preprocessing step.
X = df.drop(
    columns=[
        'classificatie',
        'aankomststationVerkorting',
        'vertrekstationVerkorting',
        'slice dt iso',
        'vertrekmoment_utc',
        'verkeersdatum_ams',
    ]
)

## Build model

In [75]:
# Balance the classes by randomly discarding rows from the larger classes.
# The sampler is seeded so that the retained subset -- and every metric computed
# from it -- is reproducible after a kernel restart (same seed value as the
# train/test split that follows).
# NOTE(review): resampling is done before the train/test split, so the held-out
# set is balanced as well and its accuracy will not reflect the original class
# distribution; consider resampling only the training portion.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [76]:
# Hold out 20% of the resampled rows for evaluation; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

In [77]:
# Two-stage pipeline: standardise the features, then fit a logistic regression.
# The classifier step is registered under the name 'ols'; a later cell looks it
# up by that exact name, so the label must stay as it is.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('ols', LogisticRegression()),
    ]
)

In [78]:
# Fit the scaler and the classifier on the training portion only.
pipeline.fit(X=X_train, y=y_train)

In [79]:
# Predict class labels for the held-out rows with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [80]:
# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the model:', accuracy)

Accuracy of the model: 0.5692307692307692


In [81]:
# Pull the fitted classifier out of the pipeline by its step name so its
# learned parameters can be inspected directly.
ols_model = pipeline['ols']

In [82]:
# Display the fitted coefficient matrix. In the output shown below it has
# 3 rows and 5 columns (scikit-learn lays `coef_` out as classes x features).
# The scaler runs before this step in the pipeline, so the values are on the
# standardised-feature scale, not in the original units.
ols_model.coef_

array([[ 0.94825286, -0.25855626, -0.04704712,  0.03607085,  0.14106403],
       [-0.23850929,  0.10709163,  0.07400543,  0.0516202 ,  0.01234153],
       [-0.70974357,  0.15146464, -0.02695831, -0.08769105, -0.15340555]])

In [83]:
# Display the fitted intercept (bias) terms; the output shown below holds
# three values, matching the three rows of the coefficient matrix.
ols_model.intercept_

array([-0.23231612,  0.19001118,  0.04230493])