In [None]:
#import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import time

In [None]:
# Minimum number of candidate trips cluster() requires before it narrows the
# candidates further using an earlier segment.
MIN_RECORD = 10
# Segment-count limit; presumably the look-back bound for cluster() - TODO confirm.
MAX_SEGMENT = 5

In [None]:
# Variance-based recursive clustering (V-clustering)
def v_clustering(L, threshold):
    """Recursively cut a sequence where the cut reduces variance the most.

    Every cut position is scored by the variance of the whole sequence minus
    the size-weighted variance of the two parts. The best-scoring cut is
    applied only when its score is at least `threshold`; both parts are then
    processed the same way.

    Cuts are positional, so values are grouped in the order given (cluster()
    passes them sorted).

    Args:
        L: sliceable sequence of numbers.
        threshold: minimum variance reduction required to accept a cut.

    Returns:
        A list of consecutive slices of L. A sequence with fewer than two
        items, or with no acceptable cut, comes back as ``[L]``.
    """
    total = len(L)

    def gain_at(cut):
        # Variance of the whole minus the size-weighted variance of both parts.
        head, tail = L[:cut], L[cut:]
        weighted = (len(head) / total) * np.var(head) + (len(tail) / total) * np.var(tail)
        return np.var(L) - weighted

    best_cut, best_gain = None, -1
    for cut in range(1, total):
        gain = gain_at(cut)
        # Strict comparison: the earliest position wins ties.
        if gain > best_gain:
            best_cut, best_gain = cut, gain

    if best_cut is None:
        return [L]
    if gain_at(best_cut) < threshold:
        return [L]
    return v_clustering(L[:best_cut], threshold) + v_clustering(L[best_cut:], threshold)

In [None]:
def Label_cluster(current_segment, clusters, sort_term):
  """Attach 1-based cluster labels to a frame sorted by `sort_term`.

  The frame is sorted ascending by `sort_term`, and the i-th sorted row gets
  the label of the i-th value when the clusters are laid end to end. This
  lines up only if the clusters were built from the same values in sorted
  order.

  Args:
    current_segment: DataFrame containing the column `sort_term`.
    clusters: list of clusters (each a sequence of values), in ascending order.
    sort_term: name of the column the clusters were computed from.

  Returns:
    A sorted copy of `current_segment`. The 'Cluster label' column is added
    only when the number of clustered values equals the number of rows;
    otherwise the sorted frame is returned without it.
  """
  # One label per clustered value, built in a single pass instead of growing a
  # DataFrame row by row.
  labels = [label for label, members in enumerate(clusters, start=1) for _ in members]
  current_segment = current_segment.sort_values(by=[sort_term])
  if len(current_segment) == len(labels):
    # A single assignment also covers the empty frame, which then still gets
    # the (empty) label column.
    current_segment['Cluster label'] = labels
  return current_segment

In [None]:
def _nearest_cluster_label(clusters, target):
  """Return the 1-based label of the cluster whose mean lies closest to target."""
  centers = np.array([np.mean(members) for members in clusters])
  return np.absolute(centers - target).argmin() + 1


def cluster(SEGMENT_ID, DAY_OF_WEEK, ARRIVAL_TIME, DURATION, time_added, TRIP_ID):
  """Predict the duration of one segment from similar historical trips.

  Records for the segment and weekday are clustered by start time, and the
  cluster nearest to ARRIVAL_TIME + time_added supplies the candidate trips.
  The candidates are then narrowed going backwards one segment at a time: for
  each earlier segment they are clustered by duration and only the trips in
  the cluster nearest to the duration stored in DURATION are kept. Narrowing
  stops when fewer than MIN_RECORD candidates remain, segment 1 has been
  passed, the look-back reaches MAX_SEGMENT, no candidate has a record for the
  earlier segment, or the matched cluster holds at most one trip.

  Args:
    SEGMENT_ID: segment to predict. Values <= 15 read the Kandy-to-Digana
      file, larger values the Digana-to-Kandy file.
    DAY_OF_WEEK: value matched against the 'day_of_week' column.
    ARRIVAL_TIME: time of day as an 'HH:MM:SS' string.
    DURATION: sequence where DURATION[k - 1] is the duration of segment k.
    time_added: seconds added to ARRIVAL_TIME before matching.
    TRIP_ID: trip under evaluation; its own record is excluded from the
      average and supplies the actual duration.

  Returns:
    [real_duration, predicted_duration]; the prediction is the mean
    'duration' of the remaining candidates on this segment (NaN if none).
  """
  #Read data
  if SEGMENT_ID <= 15:
    data_name = "../dataset/1_Kandy_to_Digana.csv"
  else:
    data_name = "../dataset/2_Digana_to_Kandy.csv"
  df = pd.read_csv(data_name)

  #Filter by SEGMENT_ID and Day_of_week
  data = df[df['segment'] == SEGMENT_ID]
  data = data[data['day_of_week'] == DAY_OF_WEEK]
  data = data.reset_index(drop=True)
  current_segment = data.copy()

  #Drop unnecessary columns
  data = data.drop(columns=["trip_id", "segment", "duration", "date", "end_time", "without_dwell_time", "avg_speed", "day_of_week"])

  #Convert 'HH:MM:SS' start times to seconds since midnight
  for i in range(len(data)):
    stamp = data.loc[i, 'start_time']
    data.loc[i, 'start_time'] = float(stamp[0:2]) * 3600 + float(stamp[3:5]) * 60 + float(stamp[6:8])

  #First remaining column as floats
  #NOTE(review): assumes 'start_time' is the first column left after the drop - confirm against the CSV
  data_array = data.values
  data_array[:, 0:1] = data_array[:, 0:1].astype(float)
  cluster_list = [row[0] for row in data_array.tolist()]

  # Clustering for start_time
  actual_times = sorted(cluster_list)
  threshold = 100000
  clusters = v_clustering(actual_times, threshold)

  #Label clusters for original dataset
  current_segment = Label_cluster(current_segment, clusters, 'start_time')

  #Input processing: arrival time of day in seconds, shifted by time_added
  ARRIVAL_TIME = datetime.strptime(ARRIVAL_TIME, '%H:%M:%S').time()
  seconds = (ARRIVAL_TIME.hour * 60 + ARRIVAL_TIME.minute) * 60 + ARRIVAL_TIME.second
  seconds += time_added

  #Determine belonged cluster
  index = _nearest_cluster_label(clusters, seconds)

  #Extract trips
  trip_list = current_segment[current_segment['Cluster label'] == index]
  trip_list = trip_list['trip_id'].tolist()

  CUR_SEG = SEGMENT_ID - 1
  while (len(trip_list) >= MIN_RECORD and CUR_SEG > 0 and SEGMENT_ID - CUR_SEG < MAX_SEGMENT):
    #Filter by segment, Day_of_week and current candidate trips
    data = df[df['segment'] == CUR_SEG]
    data = data[data['day_of_week'] == DAY_OF_WEEK]
    data = data[data['trip_id'].isin(trip_list)]
    data = data.reset_index(drop=True)

    #No candidate has a record for this segment: keep the current candidates
    if data.empty:
      break

    #Drop unnecessary columns
    current_segment = data.copy()
    data = data.drop(columns=["trip_id", "segment", "start_time", "date", "end_time", "without_dwell_time", "avg_speed", "day_of_week"])

    #First remaining column as floats
    #NOTE(review): assumes 'duration' is the first column left after the drop - confirm against the CSV
    data_array = data.values
    data_array[:, 0:1] = data_array[:, 0:1].astype(float)
    cluster_list = [row[0] for row in data_array.tolist()]

    # Clustering for duration; the split threshold is half the sample variance
    actual_times = sorted(cluster_list)
    threshold = np.var(actual_times)*0.5
    clusters = v_clustering(actual_times, threshold)

    #Label clusters for original dataset
    current_segment = Label_cluster(current_segment, clusters, 'duration')

    #Passed Segment Duration
    duration = DURATION[CUR_SEG - 1]

    #Determine belonged cluster
    index = _nearest_cluster_label(clusters, duration)

    #Extract trips; stop refining when the matched cluster has at most one trip
    matched = current_segment[current_segment['Cluster label'] == index]
    if len(matched) > 1:
      trip_list = matched['trip_id'].tolist()
    else:
      break
    CUR_SEG = CUR_SEG - 1

  #Average the remaining candidates on the target segment, excluding TRIP_ID itself
  data = df[df['segment'] == SEGMENT_ID]
  data = data[data['day_of_week'] == DAY_OF_WEEK]
  output = data[data['trip_id'].isin(trip_list)]
  temp_df = df[df['trip_id'] == TRIP_ID]
  real_duration = temp_df[temp_df['segment'] == SEGMENT_ID].reset_index().loc[0, 'duration']
  output = output[~output['trip_id'].isin([TRIP_ID])]
  return [real_duration, np.mean(output['duration'])]

In [None]:
def trip_prediction(TRIP_ID, SEGMENT_ID):
  """Predict the remaining segment durations of one Kandy-to-Digana trip.

  Starting at SEGMENT_ID, each segment up to 15 is predicted with cluster().
  Segments before SEGMENT_ID use the trip's recorded durations; every
  prediction is appended to the same list so later segments are conditioned
  on it, and its value is added to the elapsed time.

  Args:
    TRIP_ID: trip to evaluate (looked up in the Kandy-to-Digana file).
    SEGMENT_ID: first segment to predict (1-based).

  Returns:
    [RMSE, total_difference, error, DURATION] where `error` holds the
    absolute error per predicted segment, `DURATION` the recorded durations
    followed by the predictions, and `total_difference` the absolute gap
    between the summed DURATION and the trip's real total. A summary is also
    printed.
  """
  #Define trip
  dataset = pd.read_csv("../dataset/1_Kandy_to_Digana.csv")
  trip = dataset[dataset['trip_id'] == TRIP_ID].reset_index()

  #Input data
  #NOTE(review): positional lookups assume the trip's rows are ordered by segment - confirm
  DAY_OF_WEEK = trip.loc[0, 'day_of_week']
  ARRIVAL_TIME = trip.loc[SEGMENT_ID - 1, 'start_time']
  #Recorded durations of the segments before SEGMENT_ID
  DURATION = [trip.loc[i - 1, 'duration'] for i in range(1, SEGMENT_ID)]

  #Prediction
  error = []
  time_added = 0
  while SEGMENT_ID <= 15:
    real, predicted = cluster(SEGMENT_ID, DAY_OF_WEEK, ARRIVAL_TIME, DURATION, time_added, TRIP_ID)
    DURATION.append(np.round(predicted, 2))
    error.append(np.round(np.absolute(predicted - real), 2))
    SEGMENT_ID = SEGMENT_ID + 1
    time_added += predicted

  #Evaluate result (each figure computed once, then reused for print and return)
  RMSE = np.round(np.sqrt(sum(e ** 2 for e in error) / len(error)), 2)
  real_total = np.sum(trip['duration'])
  predicted_total = np.sum(DURATION)
  total_diff = np.round(np.abs(predicted_total - real_total), 2)

  #Result
  print("ERROR: ", error)
  print("DURATION: ", DURATION)
  print("Trip id: ", TRIP_ID)
  print("RMSE: ", RMSE)
  print("Real total duration: ", real_total)
  print("Predicted total duration: ", np.round(predicted_total, 2))
  print("Different: ", total_diff)
  return [RMSE, total_diff, error, DURATION]

In [None]:
# Evaluate the predictor on the first 100 distinct trips, starting at segment 10.
df = pd.read_csv("../dataset/1_Kandy_to_Digana.csv")
RMSE_list, Diff_list = [], []
count = 0
for count, i in enumerate(df['trip_id'].unique()[:100], start=1):
  result = trip_prediction(i, 10)
  RMSE_list.append(result[0])
  Diff_list.append(result[1])

In [None]:
# First 100 distinct trip ids, used as the x axis of the result plots.
x = list(df['trip_id'].unique()[:100])
count = len(x)

In [None]:
# RMSE per evaluated trip
fig, ax = plt.subplots()
ax.plot(x, RMSE_list, color='red', linewidth=2)
ax.set_xlabel('TRIP_ID')
ax.set_ylabel('RMSE')
ax.set_title('RMSE Result')
ax.grid(True)

plt.show()
print("Average RMSE:", np.round(np.mean(RMSE_list),2))

In [None]:
# Absolute total-duration difference per evaluated trip
fig, ax = plt.subplots()
ax.plot(x, Diff_list, color='red', linewidth=2)
ax.set_xlabel('TRIP_ID')
ax.set_ylabel('Different')
ax.set_title('Different Result')
ax.grid(True)

plt.show()
print("Average Different:", np.round(np.mean(Diff_list),2))

In [None]:
# Save the per-trip metrics and download them (Colab only).
from google.colab import files
filename = "seg10_dir1_100trip.csv"
# Stored under its own name so the trip dataset in `df` is not overwritten and
# earlier cells that read df['trip_id'] can still be re-run.
summary_df = pd.DataFrame(list(zip(x, RMSE_list, Diff_list)),
               columns =['TRIP_ID', 'RMSE', 'DIFFERENT'])
summary_df.to_csv(filename, encoding = 'utf-8-sig')
files.download(filename)

In [None]:
# Full-route prediction (from segment 1) for trip 32
result = trip_prediction(32, 1)

In [None]:
# Show predictions and errors next to the recorded segments of the trip that
# `result` was computed for. The id must match the preceding
# trip_prediction(32, 1) call, otherwise the values are attached to another trip.
dataset = pd.read_csv("../dataset/1_Kandy_to_Digana.csv")
trip = dataset[dataset['trip_id'] == 32].reset_index()
trip['prediction'] = result[3]
trip['error'] = result[2]
trip

In [None]:
def _low_diff_trips(path, limit=10):
  """Return the TRIP_IDs in the summary CSV at `path` whose DIFFERENT is below `limit`."""
  summary = pd.read_csv(path)
  return summary.loc[summary['DIFFERENT'] < limit, 'TRIP_ID'].tolist()


# NOTE(review): seg1_dir1_100trip.csv is not written by any cell shown here;
# presumably it comes from an earlier run starting at segment 1 - confirm.
low_dif_trip = _low_diff_trips("seg1_dir1_100trip.csv")
# extend (not append) keeps this a flat list of trip ids
low_dif_trip.extend(_low_diff_trips("seg10_dir1_100trip.csv"))
print(low_dif_trip)

In [None]:
# Full-route prediction (from segment 1) for trip 312
result = trip_prediction(312, 1)

In [None]:
# Recorded vs. predicted duration per segment for trip 312
dataset = pd.read_csv("../dataset/1_Kandy_to_Digana.csv")
trip = dataset[dataset['trip_id'] == 312].reset_index()
trip = trip.assign(prediction=result[3], error=result[2])
trip = trip[['trip_id', 'segment', 'duration', 'prediction', 'error']]
trip

In [None]:
# Evaluate the predictor (from segment 10) on direction-1 trips longer than
# 70 minutes, longest first.
df = pd.read_csv("bus_trips_654.csv")
df = df[(df['direction'] == 1) & (df['duration_in_mins'] > 70)]
df = df.sort_values(by='duration_in_mins', ascending=False)
long_trip = df['trip_id'].tolist()
df = pd.read_csv("../dataset/1_Kandy_to_Digana.csv")
# Built once: the original recomputed unique() and scanned it for every trip.
known_trip_ids = set(df['trip_id'].unique())
RMSE_list = []
Diff_list = []
x = []
for i in long_trip:
    # Only trips that also exist in the segment-level dataset can be predicted
    if i in known_trip_ids:
      result = trip_prediction(i, 10)
      x.append(i)
      RMSE_list.append(result[0])
      Diff_list.append(result[1])

In [None]:
# Spot check: full-route prediction (from segment 1) for trip 31; the returned
# list is shown as the cell output.
trip_prediction(31, 1)