In [None]:
# Necessary imports
import pandas as pd
from sklearn import svm, linear_model
from sklearn.utils import shuffle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
# settings to display all columns
pd.set_option("display.max_columns", None)

In [None]:
# Vehicle class labels (the values of the `classes` mapping). Used as the
# category order in count plots and iterated when resampling per class.
C = ["XS", "S", "M", "L"]

In [None]:
# Load the prepared rental dataset; the path is relative to the notebook's
# working directory.
final_df = pd.read_csv("../data/final_dataset.csv")

In [None]:
# Keep only car rentals. `.copy()` makes car_df an independent frame, so the
# column assignments made on it throughout the notebook do not write into a
# slice of final_df (which triggers pandas' SettingWithCopyWarning).
car_df = final_df[final_df["vehicleType"] == "car"].copy()

# Add the vehicle class
# Maps every model spelling that occurs in the data (capitalised and
# lower-case variants are listed separately) to one of the labels in `C`.
classes = { 'BMW 1er': "M",
 'BMW 2er Active Tourer': "M", 'BMW 2er Cabrio': "L", 'BMW I3': "M",
       'BMW X1': "L", 'BMW X2': "L", 'MINI 3-tuerer': "S", 'MINI 5-tuerer': "S",
       'MINI Cabrio': "S", 'MINI Clubman': "S", 'MINI Countryman': "S",
       'Mercedes-Benz A-Klasse': "M", 'Mercedes-Benz B-Klasse': "M",
       'Mercedes-Benz GLA': "M", 'bmw 1er': "M", 'bmw 2er active tourer': "M",
       'bmw 2er cabrio': "L", 'bmw i3': "M", 'bmw x1': "L", 'bmw x2': "L", 'mini 3-tuerer': "S",
       'mini 5-tuerer': "S", 'mini cabrio': "S", 'mini clubman': "S", 'mini countryman': "S",
       'smart fortwo 3rd generation': "XS"}

# Strict lookup on purpose: a model missing from `classes` raises KeyError
# instead of silently producing NaN.
car_df["class"] = car_df["model"].apply(lambda model: classes[model])

In [None]:
# Source columns holding the population counts per age group and per
# household type.
age_src_columns = ["Unter 18", "18 - 29","30 - 49","50 - 64","65 und älter"]
hh_src_columns = ["Einpersonenhaushalte (Singlehaushalte)", "Paare ohne Kind(er)", "Alleinerziehende Elternteile", "Paare mit Kind(ern)", "Mehrpersonenhaushalte ohne Kernfamilie"]

# Working frame for feature engineering. It is copied explicitly because
# calculate_dist adds columns to it; writing into a bare column slice of car_df
# raises pandas' SettingWithCopyWarning.
tmp_df = car_df[[*age_src_columns, *hh_src_columns, "distance", "class", "Bezirk"]].copy()

# Calculate the distribution classes
def calculate_dist(columns: list[str], prefix: str, df=None):
  """Add per-row share columns `<prefix>_1 .. <prefix>_n` to a frame.

  Each new column is the corresponding source column divided by the row-wise
  sum of all `columns`, so the shares of one row add up to 1.

  Args:
    columns: source column names, in the order the shares are numbered.
    prefix: name prefix for the generated columns.
    df: frame to modify in place; defaults to the notebook-level `tmp_df`.

  Rows whose source columns sum to 0 produce NaN/inf shares.
  """
  frame = tmp_df if df is None else df
  # Named `total` so the builtin `sum` is not shadowed.
  total = frame[columns].sum(axis=1)
  for i, c in enumerate(columns, start=1):
    frame[f"{prefix}_{i}"] = frame[c] / total

# Derive the age_* and hh_* share columns on tmp_df.
for source_columns, share_prefix in ((age_src_columns, "age"), (hh_src_columns, "hh")):
  calculate_dist(source_columns, share_prefix)

# Training table: drop the raw count columns and the district name, then
# discard every row that still contains a missing value.
learning_df = tmp_df.drop(columns=[*age_src_columns, *hh_src_columns, "Bezirk"])
learning_df = learning_df.dropna()

In [None]:
# District-level table for the simulation stage: keep only the district name
# and the derived share columns.
bezirk_df = tmp_df.drop(columns=[*age_src_columns, *hh_src_columns, "class", "distance"])
# Non-null row counts per district, smallest first (sorted by the age_1 count).
rows_per_district = bezirk_df.groupby("Bezirk").count().sort_values("age_1")
print(rows_per_district)
# Collapse to one row per district holding the mean of every share column.
bezirk_df = bezirk_df.groupby("Bezirk").mean()

# Aaand normalize columns
def normalize_columns(columns, df=None):
  """Rescale `columns` in place so that they sum to 1 in every row.

  Args:
    columns: names of the columns forming one group of shares.
    df: frame to modify in place; defaults to the notebook-level `bezirk_df`.

  Note: a later cell of this notebook defines another `normalize_columns`
  that works on `districts`; once that cell has run, it replaces this one.
  """
  frame = bezirk_df if df is None else df
  # Computed once, before any column is overwritten. Named `total` so the
  # builtin `sum` is not shadowed.
  total = frame[columns].sum(axis=1)
  for c in columns:
    frame[c] = frame[c] / total

# Normalise the five age shares and the five household shares as two
# separate groups.
for share_prefix in ("age", "hh"):
  normalize_columns([f"{share_prefix}_{i}" for i in range(1, 6)])


In [None]:
# English column headers for the exported table: ten entries, passed as the
# header for bezirk_df's columns (five age shares, then five household shares).
header = ["Under 18 Years", "18-29 Years", "30-49 Years", "50-64 Years", "Over 65 Years", "Single Household", "Pairs", "Single Parents", "Parents with children", "Multiperson household"]
# bezirk_df[[f"hh_{i}" for i in range(1, 6)]].sum(axis=1)
# Write the district table as a LaTeX longtable with two-decimal floats into
# the paper's appendix directory (path relative to the working directory).
bezirk_df.to_latex("../paper/Appendices/district-table.tex", header=header, longtable=True, label="table:Districts", float_format="%.2f")

In [None]:
bezirk_df

In [None]:
car_df.info()

In [None]:
# Districts used by the simulation cells further down.
simulation_districts = ["Pankow", "Reinickendorf", "Friedrichshain-Kreuzberg", "Charlottenburg-Wilmersdorf"]

# Start timestamps of the car_df rows whose "Bezirk" is one of those districts.
relevant_data = car_df.loc[car_df["Bezirk"].isin(simulation_districts), "datetime_start"]

# errors='coerce' turns unparseable timestamps into NaT instead of raising;
# utc=True converts every value to UTC.
r = pd.to_datetime(relevant_data,errors='coerce', utc=True)
# One row per rental: calendar day and hour of day (both taken from the UTC
# timestamp) plus the row's original index label as "id".
hour_df = pd.DataFrame(data={"day": r.dt.date, "hour": r.dt.hour, "id": relevant_data.index})

In [None]:
hour_df

In [None]:
# Number of rentals in every (day, hour) bucket that has at least one rental;
# buckets without rentals do not appear and therefore do not count as zeros.
hd = hour_df.groupby(["day", "hour"]).count()
# palette = sns.light_palette("#786fa6")
# sns.barplot(x=hd.index, y=hd.values, palette=palette
# Average of those per-day counts for each hour of the day, rounded to whole
# rentals. Reused later as the demand input of the simulation.
daily_demand = hd.groupby("hour").mean().round()

# Five-step purple palette; `palette` is a notebook-level name reused by later
# plotting cells.
palette = sns.color_palette(["#3B3659", "#4C4672", "#5C558B", "#6F67A2", "#8781B1"])
sns.barplot(data=daily_demand, x=daily_demand.index, y="id", palette=palette)
plt.xlabel("Hour of the day")
plt.ylabel("Average number of rentals")

In [None]:
# Rental start/end timestamps parsed as timezone-aware UTC datetimes. With the
# default error handling an unparseable value raises here.
start = pd.to_datetime(car_df["datetime_start"], utc=True)
end = pd.to_datetime(car_df["datetime_end"], utc=True)

In [None]:
# Distance travelled per minute of rental.
# `.dt.total_seconds()` is required: `.dt.seconds` returns only the seconds
# component of each timedelta (0..86399) and silently drops whole days, which
# inflates the rate for every rental lasting 24 hours or longer.
diff = car_df["distance"] / (end - start).dt.total_seconds()
# Per-second rate -> per-minute rate.
diff = diff * 60

In [None]:
diff.replace([np.inf, -np.inf], np.nan).dropna().mean()

In [None]:
car_df["distance"].mean()

In [None]:
# Figure out the availability of classes
# Number of distinct vehicle ids per class.
unique_cars = car_df.groupby("class")["id"].nunique()
# Number of training rows (data points) per class.
num_of_dp = learning_df.groupby("class")["class"].count()
# Label of the class with the fewest training rows.
min_class = num_of_dp.idxmin()


def truncate_shuffle(c, limit=80000, random_state=None):
  """Return up to `limit` randomly chosen training rows of class `c`.

  Also prints diagnostics: the class label, its row count, its number of
  unique vehicles, the fleet-size ratio relative to the smallest class, the
  row count that ratio would suggest, and the shape of the returned frame.

  Args:
    c: class label, one of the values in the "class" column.
    limit: maximum number of rows to keep (the original fixed cap was 80000).
    random_state: forwarded to sklearn's `shuffle`; pass an int for a
      reproducible selection.

  Returns:
    A shuffled DataFrame with at most `limit` rows of class `c`.
  """
  num = num_of_dp[c]
  unique = unique_cars[c]

  # Fleet size of this class relative to the class with the fewest data points.
  scale = unique / unique_cars[min_class]
  # Row count proportional to fleet size; only reported, not used as the cap.
  truncated_num = round(scale * num_of_dp[min_class])

  a = learning_df.loc[learning_df["class"] == c, :]
  # sklearn.utils.shuffle returns a shuffled copy and leaves its argument
  # untouched, so the result has to be assigned. Without this the "random"
  # sample was simply the first rows in their original order.
  a = shuffle(a, random_state=random_state)
  a = a[:limit]

  print(c, num, unique, scale, truncated_num, a.shape)
  return a

# learning_df = pd.concat([truncate_shuffle(c) for c in C])

In [None]:
learning_df.groupby("class").count()

In [None]:
# Training a classifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

# Features: every column of learning_df except the target.
X = learning_df.loc[:, learning_df.columns != "class"]
# Target: the vehicle class label as a Series.
Y = learning_df.loc[:, learning_df.columns == "class"]["class"]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# MinMaxScaler rescales each feature using the range seen during fit, then
# ComplementNB is trained on the scaled features.
# The pipeline is fitted on the bare ndarray (`.values`), so it stores no
# feature names.
clf = make_pipeline(MinMaxScaler(), ComplementNB())
clf.fit(X_train.values, y_train)

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Gradient-boosted model on the same training split.
# Current XGBoost releases require integer class labels 0..n_classes-1 and
# reject string labels such as "XS"/"S"/"M"/"L", so the target is encoded
# first. Use `xgb_label_encoder.inverse_transform(...)` to map predictions
# back to class names.
xgb_label_encoder = LabelEncoder()
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train.values, xgb_label_encoder.fit_transform(y_train))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Multi-layer perceptron on standardised features; fixed seed, at most 1200
# training iterations. Fitted on the DataFrame, so feature names are recorded.
nn_clf = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, max_iter=1200))
nn_clf.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Logistic regression trained with SGD on standardised features;
# class_weight="balanced" compensates for the uneven class sizes.
# `loss="log_loss"` is the current name of the logistic loss: the former alias
# "log" was deprecated in scikit-learn 1.1 and removed in 1.3, where it makes
# this cell fail. Requires scikit-learn >= 1.1.
sgd_clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3, class_weight="balanced", loss="log_loss"))

sgd_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.inspection import permutation_importance

# Test the classifier
# `clf` was fitted on a bare ndarray (X_train.values), so it is evaluated on
# X_test.values as well. Passing the DataFrame makes scikit-learn warn about
# unexpected feature names on every predict call, and permutation_importance
# issues many such calls.
X_test_values = X_test.values
y_pred = clf.predict(X_test_values)
print(classification_report(y_test,y_pred))

# Mean drop in score when each feature column is shuffled, in X's column order.
imps = permutation_importance(clf, X_test_values, y_test)
print(imps.importances_mean)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
# Test the classifier
# Same held-out split as above, this time scoring the SGD pipeline. Note that
# `y_pred` is overwritten with the SGD predictions.
y_pred = sgd_clf.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
from simulation import Simulation

# Reciprocal of the rounded average rental count, one value per hour of day.
# NOTE(review): an hour whose rounded average is 0 yields inf here — confirm
# that Simulation copes with that.
p = 1 / daily_demand["id"]
# Share rows of bezirk_df restricted to the simulated districts.
station_data = bezirk_df.loc[bezirk_df.index.isin(simulation_districts), :]



In [None]:
# Sweep alpha over np.arange(0.01, 0.05, 0.002) with capacity fixed at 5,
# keeping (alpha, Simulation) pairs. One Simulation is constructed per value.
# NOTE(review): with a float step, np.arange may or may not include a value
# next to the 0.05 end point — check the sweep length if that matters.
alpha_r = [(alpha, Simulation(station_data=station_data, p=p, alpha=alpha, capacity=5, pred=clf.predict_proba)) for alpha in np.arange(0.01, 0.05, 0.002)]

In [None]:
# One row per alpha run, holding the run's td / urr / pi attributes.
alpha_df = pd.DataFrame(
    data=[[alpha_value, sim.td, sim.urr, sim.pi] for alpha_value, sim in alpha_r],
    columns=["alpha", "TD", "URR", "PI"],
)
# Round away float noise from np.arange so the axis tick labels stay short.
alpha_df["alpha"] = alpha_df["alpha"].round(3)

In [None]:
# Sweep the capacity over 2, 4, ..., 48 with alpha fixed at 0.05, keeping
# (capacity, Simulation) pairs. One Simulation is constructed per value.
capacity_r = [(c, Simulation(station_data=station_data, p=p, alpha=0.05, capacity=c, pred=clf.predict_proba)) for c in np.arange(2, 50, 2)]

In [None]:
# One row per capacity run, holding the run's td / urr / pi attributes.
capacity_df = pd.DataFrame(
    data=[[capacity_value, sim.td, sim.urr, sim.pi] for capacity_value, sim in capacity_r],
    columns=["Capacity", "TD", "URR", "PI"],
)

In [None]:
# Quick look: all three metrics of the capacity sweep drawn onto the same axes.
sns.pointplot(x=capacity_df["Capacity"], y=capacity_df["TD"], palette=palette)
sns.pointplot(x=capacity_df["Capacity"], y=capacity_df["URR"], palette=palette)
sns.pointplot(x=capacity_df["Capacity"], y=capacity_df["PI"], palette=palette)

In [None]:
def metric_plot(df, index, xlabel=None):
  """Plot the URR, PI and TD columns of `df` against `df[index]`.

  Draws three side-by-side point plots (URR, PI, TD from left to right) on a
  new 21x7 inch figure.

  Args:
    df: result frame with the columns "URR", "PI", "TD" and `index`.
    index: name of the column used on the x axis.
    xlabel: x-axis label for all three panels; defaults to `index`.
  """
  if xlabel is None:
    xlabel = index

  fig, axes = plt.subplots(1, 3, figsize=(21, 7), constrained_layout=True)
  colors = ["#4C4672", "#5C558B", "#6F67A2"]

  # The metrics are read from `df` itself. Previously they always came from
  # the global capacity_df, so the alpha sweep showed capacity results.
  for axis, metric, color in zip(axes, ("URR", "PI", "TD"), colors):
    sns.pointplot(x=df[index], y=df[metric], color=color, ax=axis)
  # Rotate the x tick labels so dense sweeps stay readable.
  fig.autofmt_xdate()
  for axis in axes:
    axis.set_xlabel(xlabel)


In [None]:
metric_plot(alpha_df, "alpha", "Substitution effect α") # "α"

In [None]:
# metric_plot takes the x-axis label as its third argument; omitting it raised
# a TypeError.
metric_plot(capacity_df, "Capacity", "Capacity")

In [None]:
# Single simulation run whose per-station histories are plotted below.
s = Simulation(station_data=station_data, p=p, alpha=0.003, capacity=30, pred=clf.predict_proba)

# 2x2 grid: one panel per station. zip stops at the shorter sequence, so at
# most four stations are drawn.
fig, axs = plt.subplots(2, 2, figsize=(16, 9), constrained_layout=True )
for station, ax in zip(s.stations, axs.flatten()):
  # ax = axs[i]
  # This is just for the ide
  ax: plt.Axes = ax
  ax.set_title(station.name)
  ax.set_xlim(0, 24)
  ax.set_xlabel("Simulation time")
  ax.set_ylabel("Store Delta State")
  # history_df maps a key to a frame with an "amount" column; one line is
  # drawn per key. The frame index is divided by 60 before plotting
  # (presumably minutes -> hours, matching the 0..24 x-limit — TODO confirm).
  for (c, df) in station.history_df.items():
    sns.lineplot(x=df.index / 60, y=df["amount"].values, ax=ax)
  # Legend entries follow the iteration order of history_df's keys.
  ax.legend(labels=station.history_df.keys())

In [None]:
s.print_metrics()

In [None]:
sgd_clf.predict_proba([[5, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]])
  

In [None]:
X

In [None]:
def age_class(i, name):
  """Store column `name` divided by "Einheiten insgesamt" as car_df["age_<i>"]."""
  car_df[f"age_{i}"] = car_df[name].div(car_df["Einheiten insgesamt"])

# age_1 .. age_5, in ascending order of age group.
for position, source_column in enumerate(
    ["Unter 18", "18 - 29", "30 - 49", "50 - 64", "65 und älter"], start=1):
  age_class(position, source_column)


In [None]:
# Source columns of the three monthly net income brackets.
income_curr = [
    "Erwerbstätige / Monatliches Nettoeinkommen unter 900",
    "Erwerbstätige / Monatliches Nettoeinkommen 900 – 1500",
    "Erwerbstätige / Monatliches Nettoeinkommen 1500 und mehr",
]

# Row-wise total over the three brackets; denominator of the shares below.
car_df["income_total"] = car_df[income_curr].sum(axis=1)

def income_class(i, name):
  """Store column `name` divided by "income_total" as car_df["income_<i>"]."""
  car_df[f"income_{i}"] = car_df[name].div(car_df["income_total"])

# income_1 .. income_3, in the order of income_curr.
for position, source_column in enumerate(income_curr, start=1):
  income_class(position, source_column)

In [None]:
car_df.head()

In [None]:
# NOTE(review): this cell repeats the model -> class mapping and the "class"
# assignment from the earlier "Add the vehicle class" cell with identical
# contents; it recomputes the same column.
classes = { 'BMW 1er': "M",
 'BMW 2er Active Tourer': "M", 'BMW 2er Cabrio': "L", 'BMW I3': "M",
       'BMW X1': "L", 'BMW X2': "L", 'MINI 3-tuerer': "S", 'MINI 5-tuerer': "S",
       'MINI Cabrio': "S", 'MINI Clubman': "S", 'MINI Countryman': "S",
       'Mercedes-Benz A-Klasse': "M", 'Mercedes-Benz B-Klasse': "M",
       'Mercedes-Benz GLA': "M", 'bmw 1er': "M", 'bmw 2er active tourer': "M",
       'bmw 2er cabrio': "L", 'bmw i3': "M", 'bmw x1': "L", 'bmw x2': "L", 'mini 3-tuerer': "S",
       'mini 5-tuerer': "S", 'mini cabrio': "S", 'mini clubman': "S", 'mini countryman': "S",
       'smart fortwo 3rd generation': "XS"}

# Raises KeyError for any model that is not a key of `classes`.
car_df["class"] = car_df["model"].apply(lambda model: classes[model])

In [None]:
# Names of the derived share columns on car_df: age_1..age_5 and
# income_1..income_3.
age_columns = [f"age_{i}" for i in range(1, 6)]
income_columns = [f"income_{i}" for i in range(1, 4)]

# Feature frame for the age/income classifier: the eight shares plus the label.
fdf = car_df[[*age_columns, *income_columns, "class"]]

In [None]:
# "XS" is overpopulated
# Rentals per vehicle class. `order=C` lists class labels, so the x variable
# has to be the "class" column; with x="model" none of the categories matched.
sns.countplot(data=car_df, x="class", order=C)

In [None]:
sns.displot(car_df, x="distance")

In [None]:
# palette = sns.light_palette("#786fa6")
ax = sns.boxplot(data=learning_df, palette=palette)


In [None]:
# Lower-cased model name, so spelling variants such as "BMW X1" / "bmw x1"
# collapse into one category. The vectorised `.str.lower()` replaces the
# per-row lambda and passes missing values through instead of raising.
car_df["model_lc"] = car_df["model"].str.lower()


In [None]:

plt.figure(figsize=(8,4)) # this creates a figure 8 inch wide, 4 inch high

# These two statements were fused onto one line, which is a SyntaxError.
# Note that this rebinds the notebook-level `palette` to a light purple gradient.
palette = sns.light_palette("#786fa6")
# Rentals per lower-cased model, most frequent model first.
ax = sns.countplot(data=car_df, x="model_lc", palette=palette, order=car_df["model_lc"].value_counts().index)
ax.set_ylim(0, 1200000)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
# Plain integers on the y axis instead of scientific notation.
plt.ticklabel_format(style='plain', axis='y')
plt.xlabel("Model class name")
plt.ylabel("Number of rentals")
plt.show()

In [None]:
car_df["model_lc"].value_counts()[1:].sum()

In [None]:
# Fleet size: number of distinct vehicle ids per lower-cased model, largest
# fleet first.
fleet_size = (
    car_df[["model_lc", "id"]]
    .groupby("model_lc")
    .nunique()
    .sort_values(by="id", ascending=False)
)

ax = sns.barplot(x=fleet_size.index, y=fleet_size["id"].to_numpy(), palette=palette)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.xlabel("Model class name")
plt.ylabel("Number of unique vehicles")
plt.show()


In [None]:
fleet_size

In [None]:
fleet_ids = fleet_size["id"]
fleet_size.max(), fleet_size.min(), fleet_size.sum(), fleet_size

In [None]:
# Smallest per-class count of non-null values over all feature columns; used
# as the common sample size per class.
count = fdf.groupby("class").count().min(axis=1).min()

def truncate_class(c):
  """Return `count` randomly chosen rows of fdf that belong to class `c`."""
  a = fdf[fdf["class"] == c]
  # sklearn.utils.shuffle returns a shuffled copy and leaves its argument
  # untouched, so the result has to be assigned. Without this the sample was
  # just the first `count` rows.
  a = shuffle(a)
  return a[:count]

# Undersample every class to the same size.
# NOTE(review): rows with NaN are dropped only after truncation, so the
# classes can end up slightly unequal again — confirm this is acceptable.
fdf = pd.concat([truncate_class(c) for c in C])
fdf.dropna(inplace=True)

In [None]:
# now not anymore
sns.countplot(data=fdf, x="class", order=C)

In [None]:

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

# Rebinds the notebook-level X / Y (built from learning_df earlier in the
# notebook) to the eight age/income shares and the labels of the balanced fdf.
X = fdf[[*age_columns, *income_columns]]
Y = fdf["class"]

# Oversample using SMOTE
# sm = SMOTE(random_state=42)
# x_smote, y_smote = sm.fit_resample(X, Y)

# enn = EditedNearestNeighbours()
# x_train_enn, y_train_enn = enn.fit_resample(X, Y)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Rebinds the notebook-level `clf`, which earlier held the
# MinMaxScaler + ComplementNB pipeline. Cells that pass `clf.predict_proba`
# use this model if they are re-run after this cell.
clf = GaussianNB()
clf.fit(X_train, y_train)
# Class labels in the column order of predict_proba's output.
clf_classes = clf.classes_

In [None]:
clf_classes

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
 
# Test the classifier
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
print(classification_report(y_test,y_pred))
# print(accuracy_score(y_test,y_pred))
# print("Precision Score : ",precision_score(y_test,y_pred, 
#                                            pos_label='positive',
#                                            average='micro'))
# print("Recall Score : ",recall_score(y_test,y_pred, 
#                                            pos_label='positive',
#                                            average='micro'))

In [None]:
import seaborn as sns


In [None]:
# Mean age and income shares per district ("Bezirk"), one row per district.
districts = car_df[["Bezirk", *age_columns, *income_columns]].groupby("Bezirk")
districts = districts.mean()
districts

In [None]:
# Figure out and normalize data per sector
def normalize_columns(columns):
  """Rescale `columns` of `districts` in place so each row sums to 1 over them.

  Note: this redefines the `normalize_columns` declared earlier in the
  notebook (the one operating on `bezirk_df`).
  """
  row_totals = districts[columns].sum(axis=1)
  districts[columns] = districts[columns].div(row_totals, axis=0)

normalize_columns(age_columns)
normalize_columns(income_columns)

In [None]:
districts

In [None]:
def predice_choices(d, alpha):
  """Return the classes whose probability is within `alpha` of the best one.

  Args:
    d: one feature vector accepted by `clf.predict_proba`.
    alpha: tolerance below the highest predicted probability.

  Returns:
    List of labels from `clf_classes`, in `clf_classes` order, whose predicted
    probability is at least (maximum probability - alpha). The probability
    vector is also printed.
  """
  p = clf.predict_proba([d])[0]
  print(p)
  threshold = p[p.argmax()] - alpha
  return [label for position, label in enumerate(clf_classes) if p[position] >= threshold]


def predict_district(name: str, alpha: float):
  """Run predice_choices on the age/income shares of district `name`."""
  feature_columns = [*age_columns, *income_columns]
  district_features = districts.loc[name, feature_columns]
  return predice_choices(district_features, alpha)
  

print([(on, predict_district(on, 0.04)) for on in districts.index])



In [None]:
# Parse the start timestamps of the first 50000 rows; errors="raise" aborts on
# any unparseable value.
# NOTE(review): unlike the other to_datetime calls in this notebook, this one
# does not pass utc=True — confirm the hours below are meant to be in the
# timestamps' own offset.
dt = pd.to_datetime(car_df["datetime_start"][0:50000], errors="raise")  
dt

In [None]:
# Number of rides per start hour among the first 50000 rows (non-null "id"
# values per hour group).
v = car_df[0:50000].groupby(dt.dt.hour).count()["id"]
sns.barplot(x=v.index, y=v.values, palette=palette)

plt.xlabel("Hour of the day")
plt.ylabel("Rides")

In [None]:
dt

In [None]:
predice_choices([0, 1, 0, 0, 0, 0, 0, 1], 0.1), clf_classes

In [None]:
predice_choices([0, 0, 0, 1, 0, 0, 1, 0], 0.1), clf_classes

In [None]:
predice_choices([0, 0, 0.3, 0, 0, 0, 0, 1], 0.1), clf_classes

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Cross-validated accuracy of XGBoost on the age/income features (default
# number of folds). Current XGBoost releases reject string class labels, so
# the target is encoded to integers 0..n_classes-1 first.
cross_val_score(XGBClassifier(), X_train, LabelEncoder().fit_transform(y_train))

In [None]:
times = car_df[["datetime_start", "datetime_end"]]
times.min(), times.max()

In [None]:
car_df["provider"].unique()

In [None]:
car_df