**Forecast of tram delays in Kraków** (https://www.kaggle.com/c/tram-predict-delay-ds1/submissions)

In [None]:
# 1 - import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
import math

# Seed NumPy's global random generator so NumPy-based randomness repeats between runs.
np.random.seed(0)

In [None]:
# 2 - load data
# Read the train set and then the test set from their HDF5 files.
df_train, df_test = (
    pd.read_hdf(h5_path)
    for h5_path in ('input/tram.train.h5', 'input/tram.test.h5')
)

In [None]:
# 3 - concatenate train and test
# Stack both sets row-wise into a single frame; the bare last expression displays its shape.
df = pd.concat(objs=[df_train, df_test], axis=0)
df.shape

(308152, 11)

In [None]:
# 4 - change and create features

# Integer codes for the "stop_name" and "direction" columns.
df["stop_name_cat"] = pd.factorize(df["stop_name"])[0]
df["direction_cat"] = pd.factorize(df["direction"])[0]
# Hour of day: characters 11-12 of the "planned_time" string, converted to int.
df["planned_time_hours"] = df["planned_time"].map(lambda stamp: int(stamp[11:13]))

# Missing "seq_num" values become 0.
df["seq_num"] = df["seq_num"].fillna(value=0)
# Integer codes for "vehicle_id".
df["vehicle_id_cat"] = pd.factorize(df["vehicle_id"])[0]

# Additive combinations of "number" with the other encoded columns.
number_col = df["number"]
df["seq_num_pl_numb"] = number_col + df["seq_num"] + df["direction_cat"]
df["number_pl_stop_name"] = number_col + df["stop_name_cat"]
df["number_pl_direction_stop_name"] = number_col + df["direction_cat"] + df["stop_name_cat"]

# Model features: every numeric column that is not an identifier, the target or a raw source column.
black_list = ["id", 'delay', "stop_name", "direction", "vehicle_id", "trip_id", "planned_time", "datetime"]
numeric_cols = df.select_dtypes(include="number").columns
feats = [col for col in numeric_cols if col not in black_list]
print(feats)

['stop', 'number', 'seq_num', 'stop_name_cat', 'direction_cat', 'planned_time_hours', 'vehicle_id_cat', 'seq_num_pl_numb', 'number_pl_stop_name', 'number_pl_direction_stop_name']


In [None]:
# 4 - create datasets train and test
from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler

# Rows with a known "delay" form the training set; rows where it is missing form the test set.
has_target = df["delay"].notnull()
df_train = df[has_target].copy()
df_test = df[~has_target].copy()

# Target variable: seconds -> minutes.
df_train["delay"] = df_train["delay"].div(60)
y_train = df_train["delay"].to_numpy()

X_train = df_train[feats].to_numpy()
X_test = df_test[feats].to_numpy()

# Scale each feature by its maximum absolute value; the scaler is fitted on the training rows only.
ss = MaxAbsScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# 5 - learning model
# 3-fold cross-validation of the XGBoost regressor, scored with negative mean absolute error.
model = xgb.XGBRegressor(
    max_depth=10,
    n_estimators=50,
    random_state=0,
)
scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=3,
    scoring="neg_mean_absolute_error",
)
# Mean and standard deviation of the fold scores (displayed as the cell result).
scores.mean(), scores.std()

(-0.8232544994131864, 0.010511846057428447)

In [None]:
# 6 - predict
# Fit a fresh regressor on the full training set, then predict the test rows.
model = xgb.XGBRegressor(max_depth=10, n_estimators=50, random_state=0).fit(X_train, y_train)
test_predictions = model.predict(X_test)
df_test["delay"] = test_predictions

In [None]:
# 7 - post-process predictions: whole non-negative minutes, then minutes -> seconds
# Negative predictions are clamped to 0 and the rest are rounded to whole minutes.
# clip/round handle the whole column at once instead of calling a Python lambda per row;
# the cast keeps the column float64 whatever dtype the model returned.
df_test["delay"] = df_test["delay"].clip(lower=0).round(0).astype("float64")
df_test["delay"] = df_test["delay"]*60     # convert target variable: minutes -> seconds
print(df_test["delay"])

47215       0.0
47216       0.0
47217     540.0
47218      60.0
47219     660.0
          ...  
308141    180.0
308142      0.0
308143    180.0
308144     60.0
308145      0.0
Name: delay, Length: 132166, dtype: float64


In [None]:
# 8 - save results
# Write only the "id" and "delay" columns, without the index column.
submission = df_test[["id", "delay"]]
submission.to_csv('result_xgboost.csv', index=False)

In [None]:
# create other features - 1

# One Hot Encoding for "stop_name", "direction" and "vehicle_id", in that order:
# each column's indicator columns (prefixed with the column name) are appended to df.
for source_col in ("stop_name", "direction", "vehicle_id"):
    df_dummies = pd.get_dummies(df[source_col], prefix=source_col)
    df = pd.concat([df, df_dummies], axis=1)

# create new feature "planned_day_week" = weekday number (pandas dayofweek: Monday=0 ... Sunday=6)
# create new feature "weekend" = 0 if workday, 1 if weekend, and use One Hot Encoding
df["planned_time_d"] = pd.to_datetime( df["planned_time"] )
df['planned_day_week'] = df['planned_time_d'].dt.dayofweek

# dayofweek > 4 is Saturday or Sunday, so those rows get weekend = 1 and workdays get 0,
# matching the feature's name and description.
df["weekend"] = (df["planned_day_week"] > 4).astype(int)
df_dummies = pd.get_dummies (df.weekend, prefix = 'weekend')
df = pd.concat ( [df, df_dummies] , axis = 1)

# create new feature "count_stop" = number of rows sharing the same "trip_id"
# (NOTE(review): presumably one row per stop, so this is the stop count of the trip - confirm)
trip_groups = df.groupby('trip_id')
df['count_stop'] = trip_groups['trip_id'].transform('count')

# create new feature "planned_period_time" = time period: 1 - rush hours / 0 - not rush hours
df["planned_time_hours"] = df["planned_time"].apply(lambda x: int(x[11:13]))  # hour digits of the timestamp string

# Off-peak hours are <= 7, >= 19 and 10..15 inclusive; every other hour (8-9, 16-18) counts as rush hour.
# The mask is computed for the whole column at once instead of per row.
hours = df["planned_time_hours"]
off_peak = (hours <= 7) | (hours >= 19) | ((hours >= 10) & (hours <= 15))
df["planned_period_time"] = (~off_peak).astype(int)

# One Hot Encoding for "planned_period_time"
df_dummies = pd.get_dummies (df.planned_period_time, prefix = 'planned_period_time')
df = pd.concat ( [df, df_dummies] , axis = 1)

# create new feature "seq_numb_difference" = "seq_num" of the current row minus "seq_num"
# of the NEXT row of the same "trip_id" (diff(-1)); the last row of each trip gets NaN.
# NOTE(review): the original description said "previous" row - if that is the intent,
# diff(1) is the call to use; confirm before relying on the sign.
df['seq_numb_difference'] = df.groupby('trip_id')['seq_num'].diff(-1)

In [None]:
# create other features - 2
# create new feature "delay_shift" = the "delay" of the previous row of the same trip
# (NaN on the first row of every trip); the result is built in df_res.
pd.options.mode.chained_assignment = None   # silence pandas' chained-assignment warning

lst_trip = df["trip_id"].unique()
df["delay_shift"] = df["delay"]

# Build df_res in one vectorised pass instead of filtering the whole frame and
# re-concatenating once per trip.
# Rows are grouped by trip in order of first appearance and keep their original order
# inside each trip; rows without a trip_id are left out. Each row appears exactly once.
trip_codes = df["trip_id"].factorize()[0]      # -1 marks a missing trip_id
has_trip = trip_codes >= 0
row_order = np.argsort(trip_codes[has_trip], kind="stable")
df_res = df[has_trip].iloc[row_order].copy()
# Shift "delay" down by one row within each trip; to_numpy() assigns by position.
df_res["delay_shift"] = df_res.groupby("trip_id", sort=False)["delay"].shift(periods=1).to_numpy()