# Tekmovanje

In [1]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
def createLabel(data):
	return data['Route'].astype(str) + "-|-" + data['Route Direction'] + "-|-" + data['Route description'] #+ "-|-" + data['First station'] + "-|-" + data['Last station']

In [3]:
train = pd.read_csv('train.csv', sep='\t')
test = pd.read_csv('test.csv', sep='\t')

train['label'] = createLabel(train)
test['label'] = createLabel(test)

weather_data = pd.read_csv('weather_data.csv', sep='\t', index_col=0)
prazniki = pd.read_csv('prazniki_prosti.txt', sep='\t', index_col=0, header=None)
pocitnice = pd.read_csv('pocitnice.txt', sep='\t', index_col=0, header=None)

In [24]:
len(set(train['Driver ID'].values))

430

Podatkom dodajmo ključe, ki enolično identificirajo linijo.

Na koncu prikažemo število učnih podatkov za vsako linijo iz testnih podatkov.

In [5]:
train_routes = {s for s in train['label']}
test_routes = {s for s in test['label']}

s = sorted([(l, len(train['label'][train['label'] == l]), len(test['label'][test['label'] == l])) for l in test_routes], key=lambda x: x[1])
print(len(s))
s

121


[('1-|-MESTNI LOG - VIŽMARJE-|-  VIŽMARJE; sejem', 0, 827),
 ('1-|-VIŽMARJE - MESTNI LOG-|-  MESTNI LOG; sejem', 0, 865),
 ('30-|-MEDVODE - VODICE-|-  VODICE', 38, 100),
 ('30-|-VODICE - MEDVODE-|-  MEDVODE', 41, 85),
 ('14-|-B BOKALCE - GARAŽA-|-  GARAŽA', 91, 5),
 ('27-|-NS RUDNIK - GARAŽA-|-  GARAŽA', 104, 2),
 ('2-|-ZELENA JAMA - GARAŽA-|-  GARAŽA', 105, 6),
 ('1-|-N BAVARSKI DVOR - VIŽMARJE-|-N VIŽMARJE', 107, 7),
 ('27-|-LETALIŠKA - GARAŽA-|-  GARAŽA', 130, 3),
 ('3-|-N ŠKOFLJICA - BAVARSKI DVOR-|-N BAVARSKI DVOR', 135, 10),
 ('8-|-GAMELJNE - BAVARSKI DVOR-|-  BAVARSKI DVOR', 148, 10),
 ('27-|-K KOLOSEJ - BLEIWEISOVA-|-K BAVARSKI DVOR', 170, 9),
 ('15-|-MEDVODE - STANEŽIČE-|-  STANEŽIČE', 185, 140),
 ('3-|-L TOVARNA LEK - BAVARSKI DVOR-|-L BAVARSKI DVOR', 192, 6),
 ('19-|- I TOMAČEVO - GARAŽA-|-  GARAŽA', 198, 5),
 ('27-|-K BLEIWEISOVA - KOLOSEJ-|-K KOLOSEJ', 199, 22),
 ('15-|-STANEŽIČE - MEDVODE-|-  MEDVODE', 201, 138),
 ('1-|-N VIŽMARJE - BAVARSKI DVOR-|-  BAVARSKI DVOR', 203, 

# Prepare the data

In [72]:
def convert(data):
	d = pd.DataFrame()
	d['departure'] = pd.to_datetime(data['Departure time'])
	d['weekday'] = d['departure'].dt.weekday
	d['month'] = d['departure'].dt.month
	d['hour'] = d['departure'].dt.hour
	d['minute_of_day'] = d['departure'].dt.hour*60 + d['departure'].dt.minute
	d['second_of_day'] = d['departure'].dt.hour*3600 + d['departure'].dt.minute*60 + d['departure'].dt.second
	d['day_of_year'] = d['departure'].dt.dayofyear
	d['driver'] = data['Driver ID']
	d['driver_l'] = np.where(d['driver'] < 254, d['driver'], 254)
	d['driver_h'] = np.where(d['driver'] >= 255, d['driver']-255, 254)
	d['weather_key'] = ((d['departure'].copy().round('30min') - pd.to_datetime(date(1800,1,1))).dt.total_seconds()//60).astype(int)
	d['holiday'] = d['departure'].dt.strftime('%Y-%m-%d').isin(prazniki.index).astype(int)
	d['school_holiday'] = d['departure'].dt.strftime('%Y-%m-%d').isin(pocitnice.index).astype(int)
	d['label'] = data['First station'] + "---" + data['Last station']

	# seconds on the circle
	sec = (d["second_of_day"] / (24*60*60)) * 2 * np.pi
	d["sec_circ_x"] = np.cos(sec)
	d["sec_circ_y"] = np.sin(sec)

	# month on the circle
	month = (d["month"] / 12) * 2 * np.pi
	d["month_circ_x"] = np.cos(month)
	d["month_circ_y"] = np.sin(month)

	# day of year on the circle
	day = (d["day_of_year"] / 365) * 2 * np.pi
	d["day_circ_x"] = np.cos(day)
	d["day_circ_y"] = np.sin(day)

	# merge weather data
	d = pd.merge(d, weather_data, left_on='weather_key', right_index=True, how="left")
	d['freezing'] = (d['t2m'] < 0).astype(int)
	return d

In [73]:
def get_X_train_test(train, test):
	d = convert(train.append(test))
	t = d[["driver", "weekday", "month", "freezing", "school_holiday", "holiday", "sec_circ_x", "sec_circ_y", "day_circ_x", "day_circ_y", "padavine"]].values
	# t = d[["weekday", "month", "freezing", "school_holiday", "holiday", "second_of_day", "day_of_year", "t2m", "padavine", "veter_hitrost"]].values

	return t[:len(train)], t[len(train):], d[:len(train)]

def get_y_train(data):
	d = pd.DataFrame()
	d['departure'] = pd.to_datetime(data['Departure time'])	
	d['arrival'] = pd.to_datetime(data['Arrival time'])
	d['duration'] = (d['arrival'] - d['departure']).dt.total_seconds()

	return d['duration'].to_numpy()


def get_result(data, prediction):
	result = (pd.to_datetime(data['Departure time']) + pd.to_timedelta(prediction, unit='s')).round('1ms')
	return result

## Average ✔

In [64]:
prediction = pd.DataFrame()
prediction['label'] = test['label']
prediction['start'] = test['Departure time']
prediction['duration'] = 0

for l in test_routes:
	X_train, X_test, t = get_X_train_test(train[train['label'] == l], test[test['label'] == l])
	y_train = get_y_train(train[train['label'] == l])
	
	y_pred = 0
	if(len(X_train) > 0):
		y_pred = np.mean(y_train)
	else:
		# y_pred = np.mean(get_y_train(train))
		s = 0
		n = 0
		for l2 in train_routes:
			if(l2 != l):
				s += np.mean(get_y_train(train[train['label'] == l2]))
				n += 1
		y_pred = s / n

	prediction.loc[prediction['label'] == l, 'duration'] = get_result(test[test['label'] == l], y_pred)

	

# Create models

In [74]:
X_train_all, X_test_all, t = get_X_train_test(train, test)
y_train_all = get_y_train(train)

In [76]:
def make_prediction(train, test):
	prediction = pd.DataFrame()
	prediction['label'] = test['label']
	prediction['start'] = test['Departure time']
	prediction['duration'] = 0

	routes = {s for s in test['label']} 

	for l in routes:
		# X_train, X_test, t = get_X_train_test(train[train['label'] == l], test[test['label'] == l])
		# y_train = get_y_train(train[train['label'] == l])
		X_train = X_train_all[train['label'] == l]
		X_test = X_test_all[test['label'] == l]
		y_train = y_train_all[train['label'] == l]
		
		y_pred = np.ones(len(X_test)) * 34 * 60
		if(len(X_train) > 0): # 
			reg = GradientBoostingRegressor(max_depth=13).fit(X_train, y_train)
			y_pred = reg.predict(X_test)
		else:
			y_pred = np.mean(get_y_train(train)) * np.ones(len(X_test))

		prediction.loc[prediction['label'] == l, 'arrival_prediction'] = get_result(test[test['label'] == l], y_pred)

	prediction['arrival_prediction_round'] = pd.to_datetime(prediction['arrival_prediction']).round('1ms')

	return prediction

In [None]:
for i in range(1, 12):
	t_test = train[pd.to_datetime(train['Departure time']).dt.month == i]
	t_train = train[pd.to_datetime(train['Departure time']).dt.month != i]

	prediction = make_prediction(t_test, t_test)
	error = (pd.to_datetime(t_test['Arrival time']) - pd.to_datetime(prediction['arrival_prediction_round'])).dt.total_seconds().values

	print(i, np.mean(np.abs(error)))

```
1 101.68681694581282
2 111.40611644182377
3 114.87555887309112
4 116.97508007887225
5 128.51011340919575
6 108.35978541671118
7 91.69127673127448
8 92.97712371649138
9 115.06750550144667
10 127.08306864785148
11 116.07500394192104
```

In [77]:
prediction = make_prediction(train, test)

prediction[['arrival_prediction_round']].to_csv('result_t3_8.txt', index=False, header=False)

In [235]:
prediction = pd.DataFrame()
prediction['label'] = test['label']
prediction['start'] = test['Departure time']
prediction['duration'] = 0

MAEs = []

for l in test_routes:
	X_train, X_test, t = get_X_train_test(train[train['label'] == l], test[test['label'] == l])
	y_train = get_y_train(train[train['label'] == l])
	
	y_pred = np.ones(len(X_test)) * 34 * 60
	if(len(X_train) > 0):
		N = len(X_train) // 10
		X_t_train = X_train[:-N]
		X_t_test = X_train[-N:]
		y_t_train = y_train[:-N]
		y_t_test = y_train[-N:]

		reg_t = Ridge(alpha=3).fit(X_t_train, y_t_train)
		y_pred_t = reg_t.predict(X_t_test)
		mae = np.mean(np.abs(y_pred_t - y_t_test))
		MAEs.append(mae)

		reg = Ridge(alpha=3).fit(X_train, y_train)
		y_pred = reg.predict(X_test)
	else:
		y_pred = np.mean(get_y_train(train)) * np.ones(len(X_test))

	prediction.loc[prediction['label'] == l, 'duration'] = get_result(test[test['label'] == l], y_pred)

prediction['duration_round'] = pd.to_datetime(prediction['duration']).round('1ms')

np.mean(MAEs)

120.85968138140302

In [224]:
np.mean(MAEs)

120.85968138140302

In [236]:
prediction[['duration_round']].to_csv('result_t_4o.txt', index=False, header=False)

In [80]:
prediction[prediction['label'] == 'Tbilisijska---VIŽMARJE']

Unnamed: 0,label,duration,duration_round
3301,Tbilisijska---VIŽMARJE,2012-12-11 04:35:47,2012-12-11 04:35:47
3302,Tbilisijska---VIŽMARJE,2012-12-11 06:01:19,2012-12-11 06:01:19
3303,Tbilisijska---VIŽMARJE,2012-12-11 07:42:38,2012-12-11 07:42:38
3304,Tbilisijska---VIŽMARJE,2012-12-11 09:31:09,2012-12-11 09:31:09
3307,Tbilisijska---VIŽMARJE,2012-12-11 15:41:49,2012-12-11 15:41:49
...,...,...,...
22963,Tbilisijska---VIŽMARJE,2012-12-17 12:21:10,2012-12-17 12:21:10
22964,Tbilisijska---VIŽMARJE,2012-12-17 13:54:04,2012-12-17 13:54:04
22965,Tbilisijska---VIŽMARJE,2012-12-17 15:26:02,2012-12-17 15:26:02
22966,Tbilisijska---VIŽMARJE,2012-12-17 17:10:31,2012-12-17 17:10:31


In [82]:
test[test['label'] == 'Tbilisijska---VIŽMARJE']

Unnamed: 0,Registration,Driver ID,Route,Route Direction,Route description,First station,Departure time,Last station,Arrival time,label
3301,LJ LPP-165,51,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-11 04:01:47.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
3302,LJ LPP-165,51,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-11 05:27:19.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
3303,LJ LPP-165,51,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-11 07:08:38.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
3304,LJ LPP-165,51,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-11 08:57:09.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
3307,LJ LPP-165,229,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-11 15:07:49.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
...,...,...,...,...,...,...,...,...,...,...
22963,LP LPP-392,75,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-17 11:47:10.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
22964,LP LPP-392,75,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-17 13:20:04.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
22965,LP LPP-392,75,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-17 14:52:02.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
22966,LP LPP-392,75,1,MESTNI LOG - VIŽMARJE,VIŽMARJE; sejem,Tbilisijska,2012-12-17 16:36:31.000,VIŽMARJE,?,Tbilisijska---VIŽMARJE
