#install

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pomegranate
!pip install dipy



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import collections
from collections import Counter
import pickle
from pomegranate import *

#read data

In [4]:
test_trips = '/content/drive/My Drive/2021 Route Prediction/Project-1/Source-Code/data/3.1_cluster_on_trips/trips_for_lebel/lebeled/testtrips_2019_8-12.csv'
train_trips = '/content/drive/My Drive/2021 Route Prediction/Project-1/Source-Code/data/3.1_cluster_on_trips/trips_for_lebel/lebeled/testtrips_2019_1-7.csv'
train_path = '/content/drive/My Drive/2021 Route Prediction/Project-1/Source-Code/model/3.1'
road_data_path = '/content/drive/My Drive/2021 Route Prediction/Project-1/Source-Code/data/route_network/clustered'

In [5]:
df_train_trips = pd.read_csv(train_trips, parse_dates=['time_stamp'])
df_test_trips = pd.read_csv(test_trips, parse_dates=['time_stamp'])

In [6]:
def day_month_trip(df) :
  df['day'] = df['time_stamp'].apply(lambda x: x.day)
  df['month'] = df['time_stamp'].apply(lambda x: x.month)

In [7]:
day_month_trip(df_train_trips)
day_month_trip(df_test_trips)

In [8]:
df_train_trips = df_train_trips[['trip_id', 'sp', 'day', 'month','route_cluster']]
df_test_trips = df_test_trips[['trip_id', 'sp', 'day', 'month','route_cluster']]

#get rid of mini cluster

In [9]:
pkl_filename = train_path+"/QBcluster_baseline_treshold12.pkl"
with open(pkl_filename, 'rb') as file:
    clusters = pickle.load(file)

print("Nb. clusters:", len(clusters))
print("Cluster sizes:", map(len, clusters))
print("Small clusters:", collections.Counter(clusters < 2)[True], end=' ')
print("Small clusters:", clusters < 2)
print("Streamlines indices of the first cluster:\n", clusters[0].indices)
# print("Centroid of the last cluster:\n", clusters[-1].centroid)

is_mini_cluster = (clusters == 1)

mini_cluster = []
for ci in range(0, len(is_mini_cluster)):
  if is_mini_cluster[ci]:
    mini_cluster.append(ci)
print(len(mini_cluster)) 

df_train_trips.loc[df_train_trips.route_cluster.isin(mini_cluster), 'route_cluster'] = -1
df_train_trips.loc[df_train_trips.route_cluster == -1].sp.value_counts()

df_test_trips.loc[df_test_trips.route_cluster.isin(mini_cluster), 'route_cluster'] = -1
df_test_trips.loc[df_test_trips.route_cluster == -1].sp.value_counts()

Nb. clusters: 1193
Cluster sizes: <map object at 0x7f4169b575d0>
Small clusters: 758 Small clusters: [ True False False ...  True  True  True]
Streamlines indices of the first cluster:
 [0]
758


197     136
209      89
217      83
379      76
192      75
       ... 
1578      1
1538      1
5624      1
5616      1
2047      1
Name: sp, Length: 1446, dtype: int64

In [10]:
df_train_trips.route_cluster.value_counts()

-1      10240
 203      409
 772      242
 2        216
 100      198
        ...  
 993       20
 210       20
 776       20
 261       20
 961       18
Name: route_cluster, Length: 436, dtype: int64

In [11]:
df_test_trips.route_cluster.value_counts()

-1      8375
 203     461
 772     276
 100     188
 2       182
        ... 
 961      12
 399      12
 257      12
 975      12
 39        9
Name: route_cluster, Length: 414, dtype: int64

#X and Label

In [12]:
state_names = []
uniq_r_cluster = df_train_trips.route_cluster.unique()
for cluster in uniq_r_cluster:
  state_names.append(str(cluster))
print(len(state_names))
state_names[:5]


436


['-1', '1', '2', '3', '4']

In [13]:
X_train = []
labels = []
uniq_trips = df_train_trips.trip_id.unique()
for trip_id in uniq_trips:
  trip_train = df_train_trips[df_train_trips.trip_id==trip_id]
  X_train.append(trip_train.sp.to_numpy().astype(str))
  # labels.append(['None-start']+[str(l) for l in trip.route_cluster.to_list()])
  labels.append([str(l) for l in trip_train.route_cluster.to_list()])
print(X_train[0][:5])
print(labels[0][:5])
print(len(X_train))
print(len(labels))

['5072' '1683' '1683' '1683' '1683']
['-1', '-1', '-1', '-1', '-1']
1377
1377


In [14]:
X_test = []
label_test = []
uniq_trips = df_test_trips.trip_id.unique()
for trip_id in uniq_trips:
  trip_test = df_test_trips[df_test_trips.trip_id==trip_id]
  X_test.append(trip_test.sp.to_numpy().astype(str))
  label_test.append([str(l) for l in trip_test.route_cluster.to_list()])
print(X_test[0][:5])
print(labels[0][:5])
print(len(X_test))
print(len(label_test))

['1806' '1693' '1693' '987' '651']
['-1', '-1', '-1', '-1', '-1']
1163
1163


In [15]:
import random
random.seed(42)

X2_train = X_train.copy()
labels2 = labels.copy()

temp = list(zip(X2_train, labels2))
random.shuffle(temp)
X2_train, labels2= zip(*temp)
X2_train = list(X2_train)
labels2 = list(labels2)

print(X_train[0][:5])
print(X2_train[0][:5])
print(len(X_train))
print(len(X2_train))

print(labels[0][:5])
print(labels2[0][:5])
print(len(labels))
print(len(labels2))

['5072' '1683' '1683' '1683' '1683']
['2232' '2233' '2233' '1494' '2674']
1377
1377
['-1', '-1', '-1', '-1', '-1']
['-1', '-1', '-1', '-1', '-1']
1377
1377


In [16]:
road_df = pd.read_csv(road_data_path+"/route_network_with_cluster.csv")

cluster_df = road_df[['cluster', 'center_lat', 'center_lon']]
cluster_df = cluster_df.drop_duplicates()
cluster_df.rename(columns = {'cluster':'sp', 'center_lat':'lat', 'center_lon':'lon'}, inplace = True)

sp_list = set(map(str, cluster_df['sp'].tolist()))

#algorithm

In [17]:
def score_acc(model, seq, label):
  p = []
  prob,path = model.viterbi(seq)
  for s in path[1:]:
    p.append(s[1].name)

  tp = 0
  for _state_name, _label in zip(p, label):
    if _state_name == _label:
      tp+=1
  total = len(label)
  return tp/total


def score_hitAtK(model,seq,label):
  p = []
  k = 3
  tp = 0
  seq_total = len(seq)

  prob,path = model.viterbi(seq)
  for s in path[1:]:
    p.append(s[1].name)

  for i in range(0,seq_total):
    if i > seq_total - k:
      predict, label_list = p[i], seq[seq_total-k:]
    else:
      predict, label_list = p[i], seq[i:i+k]

    if predict in label_list:
      tp += 1

  return tp/seq_total

def score_top_k(model,seq,label):
  p = []
  k = 3
  tp = 0
  seq_total = len(seq)

  prob,path = model.viterbi(seq)
  for s in path[1:]:
    p.append(s[1].name)

  for i in range(0, len(seq_total)-k):
    predict, label_list = p[i:i+k], seq[i:i+k]

    count = 0
    for s in predict:
      if s in label_list:
        count += 1

    tp += count/k

  return tp/(seq_total - k)

def top_k_score(model, seq, label):
  p = []
  prob, path = model.viterbi(seq)
  for s in path[1:]:
    p.append(s[1].name)
  
  K = 5
  tp = 0
  total = len(seq)
  for i in range(0, len(seq)):
    try:
      points, label_list = p[i:i+K], label[i:i+K]
      count_total = K
    except:
      points, label_list = p[i:], label[i:]
      count_total = len(p[i:])
    count = 0
    for s in points:
      if s in label_list:
        count += 1
    tp += count/count_total
  return tp/total

def score_auc(model, seq, label):
  p = []
  prob, path = model.viterbi(seq)
  for s in path[1:]:
    p.append(s[1].name)

  hit_list = []
  true_count=0
  for pp, l in zip(p, label):
    value = (pp == l)
    if value:
      true_count += 1
    hit_list.append(value)
  
  n1=0
  false_count=0
  for i, true in enumerate(hit_list):
    if true==True:
      for not_true in hit_list[i:]:
        if not not_true:
          n1 += 1
    else:
      false_count += 1

  if true_count == 0:
    return 0
  if false_count == 0:
    return 1
  n = true_count*false_count
  return n1/n
  

#train

In [18]:
x_train = X_train
label_train = labels

uniq_sp = set()
for seq in x_train:
  for sp in seq:
    uniq_sp.add(sp)

uniq_label = set()
for label in label_train:
  for uniq in list(set(label)):
    uniq_label.add(uniq)

new_label = []
for label in label_train:
  new_label.append(['None-start']+label)
label_train = new_label

unseen_sp = sp_list - uniq_sp

x_train.append(np.array(list(unseen_sp)))
label_train.append(['-1'] * len(unseen_sp))

print("sp_list : ",len(sp_list))
print("uniq_sp : ",len(uniq_sp))
print("unseen_sp : ",len(unseen_sp))

sp_list :  8054
uniq_sp :  2451
unseen_sp :  5603


In [19]:
model = HiddenMarkovModel.from_samples(
    DiscreteDistribution, 
    n_components=len(uniq_label), 
    X=x_train, 
    labels=label_train,
    algorithm='labeled',
    state_names=list(uniq_label), 
    inertia=0.001,
    # max_iterations=10,
    max_iterations=1,
    n_jobs=-1
)

#predict

## ACC

In [20]:
acc_train = []
acc_test = []
fail_count = 0

for percent in [0.25, 0.5, 0.75, 0.9, 1]:
  acc = 0
  acc_t = 0
  #////////////  train ////////////////////
  for seq, label in zip(x_train, label_train):
    try:
      sc_acc = score_acc(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
      
    acc += sc_acc
  acc_train.append(acc/len(x_train))
    #/////////////// test /////////////////////////
  for seq, label in zip(X_test, label_test):
    try:
      sc_acc = score_acc(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
      
    acc_t += sc_acc
  acc_test.append(acc_t/len(X_test))


print("=== acc")
print(f'Train score')
print(f'\t25% trip traverse : {acc_train[0]}')
print(f'\t50% trip traverse : {acc_train[1]}')
print(f'\t75% trip traverse : {acc_train[2]}')
print(f'\t90% trip traverse : {acc_train[3]}')
print(f'\t100%trip traverse : {acc_train[4]}') 
print(f'Test score')
print(f'\t25% trip traverse : {acc_test[0]}')
print(f'\t50% trip traverse : {acc_test[1]}')
print(f'\t75% trip traverse : {acc_test[2]}')
print(f'\t90% trip traverse : {acc_test[3]}')
print(f'\t100%trip traverse : {acc_test[4]}')

=== acc
Train score
	25% trip traverse : 0.1453649850456057
	50% trip traverse : 0.3192551363834759
	75% trip traverse : 0.4910745130650407
	90% trip traverse : 0.5998192021331049
	100%trip traverse : 0.7391933152943477
Test score
	25% trip traverse : 0.030528786238683655
	50% trip traverse : 0.06519001002013626
	75% trip traverse : 0.09965288456722046
	90% trip traverse : 0.12405470237521751
	100%trip traverse : 0.14095223148902628


##Hit@K

In [21]:
hit_train = []
hit_test = []
fail_count = 0

for percent in [0.25, 0.5, 0.75, 0.9, 1]:
  hit = 0
  hit_t = 0
  top = 0
  top_t = 0
  #////////////  train ////////////////////
  for seq, label in zip(x_train, label_train):
    try:
      sc = top_k_score(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
    if sc > 0:
      hit += 1
    # hit += sc
  hit_train.append(hit/len(x_train))
  
    #/////////////// test /////////////////////////
  for seq, label in zip(X_test, label_test):
    try:
      sc = top_k_score(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
      
    if sc > 0:
      hit_t += 1
    # hit_t += sc
  hit_test.append(hit_t/len(X_test))
  

print("=== hit")
print(f'Train score')
print(f'\t25% trip traverse : {hit_train[0]}')
print(f'\t50% trip traverse : {hit_train[1]}')
print(f'\t75% trip traverse : {hit_train[2]}')
print(f'\t90% trip traverse : {hit_train[3]}')
print(f'\t100%trip traverse : {hit_train[4]}') 
print(f'Test score')
print(f'\t25% trip traverse : {hit_test[0]}')
print(f'\t50% trip traverse : {hit_test[1]}')
print(f'\t75% trip traverse : {hit_test[2]}')
print(f'\t90% trip traverse : {hit_test[3]}')
print(f'\t100%trip traverse : {hit_test[4]}')

=== hit
Train score
	25% trip traverse : 0.716255442670537
	50% trip traverse : 0.8047895500725689
	75% trip traverse : 0.8577648766328012
	90% trip traverse : 0.8969521044992743
	100%trip traverse : 0.965166908563135
Test score
	25% trip traverse : 0.1994840928632846
	50% trip traverse : 0.2820292347377472
	75% trip traverse : 0.33619948409286327
	90% trip traverse : 0.38005159071367156
	100%trip traverse : 0.4445399828030954


## AUC

In [22]:
auc_train = []
auc_test = []
fail_count = 0

for percent in [0.25, 0.5, 0.75, 0.9, 1]:
  auc = 0
  auc_t = 0
  #////////////  train ////////////////////
  for seq, label in zip(x_train, label_train):
    try:
      sc_auc = score_auc(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
      
    auc += sc_auc
  auc_train.append(auc/len(x_train))
    #/////////////// test /////////////////////////
  for seq, label in zip(X_test, label_test):
    try:
      sc_auc = score_auc(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
      
    auc_t += sc_auc
  auc_test.append(auc_t/len(X_test))


print("=== auc")
print(f'Train score')
print(f'\t25% trip traverse : {auc_train[0]}')
print(f'\t50% trip traverse : {auc_train[1]}')
print(f'\t75% trip traverse : {auc_train[2]}')
print(f'\t90% trip traverse : {auc_train[3]}')
print(f'\t100%trip traverse : {auc_train[4]}') 
print(f'Test score')
print(f'\t25% trip traverse : {auc_test[0]}')
print(f'\t50% trip traverse : {auc_test[1]}')
print(f'\t75% trip traverse : {auc_test[2]}')
print(f'\t90% trip traverse : {auc_test[3]}')
print(f'\t100%trip traverse : {auc_test[4]}')

=== auc
Train score
	25% trip traverse : 0.6259084626913671
	50% trip traverse : 0.6530404847108472
	75% trip traverse : 0.6522391827278796
	90% trip traverse : 0.6347372042488978
	100%trip traverse : 0.6174907834703016
Test score
	25% trip traverse : 0.12730555492865234
	50% trip traverse : 0.15007861775990455
	75% trip traverse : 0.16627605023489564
	90% trip traverse : 0.1720869617913756
	100%trip traverse : 0.17053449467005427


##Top K

In [23]:
top_train = []
top_test = []
fail_count = 0

for percent in [0.25, 0.5, 0.75, 0.9, 1]:
  top = 0
  top_t = 0
  #////////////  train ////////////////////
  for seq, label in zip(x_train, label_train):
    try:
      sc_top = top_k_score(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
    if sc_top > 0:
      top += sc_top
    # top += sc_top
  top_train.append(top/len(x_train))
    #/////////////// test /////////////////////////
  for seq, label in zip(X_test, label_test):
    try:
      sc_top = top_k_score(model, np.array(seq[:int(len(seq)*percent)]), label[1:])
    except:
      print("Fail!!", end="\t")
      fail_count = fail_count + 1
      continue
      
    if sc_top > 0:
      top_t += sc_top
    # top_t += sc_top
  top_test.append(top_t/len(X_test))

print("=== top")
print(f'Train score')
print(f'\t25% trip traverse : {top_train[0]}')
print(f'\t50% trip traverse : {top_train[1]}')
print(f'\t75% trip traverse : {top_train[2]}')
print(f'\t90% trip traverse : {top_train[3]}')
print(f'\t100%trip traverse : {top_train[4]}') 
print(f'Test score')
print(f'\t25% trip traverse : {top_test[0]}')
print(f'\t50% trip traverse : {top_test[1]}')
print(f'\t75% trip traverse : {top_test[2]}')
print(f'\t90% trip traverse : {top_test[3]}')
print(f'\t100%trip traverse : {top_test[4]}')

=== top
Train score
	25% trip traverse : 0.333831799671618
	50% trip traverse : 0.5101653497903192
	75% trip traverse : 0.5837617422626697
	90% trip traverse : 0.6218231439699345
	100%trip traverse : 0.6780284845558575
Test score
	25% trip traverse : 0.06777099550224204
	50% trip traverse : 0.10498339839786357
	75% trip traverse : 0.1207217502140577
	90% trip traverse : 0.13070394208371994
	100%trip traverse : 0.14044091477728846


#plot

In [34]:
!pip install plotly_express

Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [47]:
import plotly_express as px
px.set_mapbox_access_token("pk.eyJ1Ijoic2hha2Fzb20iLCJhIjoiY2plMWg1NGFpMXZ5NjJxbjhlM2ttN3AwbiJ9.RtGYHmreKiyBfHuElgYq_w")

df_acc = pd.DataFrame()
df_acc['trips'] = [0.25, 0.5, 0.75, 0.9, 1]
df_acc['acc'] = acc_test
fig = px.bar(df_acc, x="trips", y="acc",
             height=400)
fig.show()

In [48]:
df_hit = pd.DataFrame()
df_hit['%trips'] = ["25%", "50%", "75%", "90%", "100"]
df_hit['acc'] = hit_test
fig = px.bar(df_hit, x="trips", y="acc",
             height=400)
fig.show()

In [50]:
import plotly.graph_objects as go
trips=["25%", "50%", "75%", "90%", "100%"]

fig = go.Figure(data=[
    go.Bar(name='acc', x=trips, y=acc_test),
    go.Bar(name='hit@k', x=trips, y=hit_test),
    go.Bar(name='topK', x=trips, y=top_test),
    go.Bar(name='auc', x=trips, y=auc_test)
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()