# Load the segmented trips

In [5]:
import matplotlib.pyplot as plt

In [6]:
import os
import json
import pandas as pd

# Per-day frames are collected here and concatenated once after the loop
road_df_list = []
second_segment_df_list = []


def _read_json_lines(path):
    """Read a JSON Lines file (one JSON object per line) into a DataFrame.

    Blank lines are skipped so that an empty or trailing line in the file
    does not raise ``json.JSONDecodeError``.
    """
    with open(path, 'r') as file:
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)


# dateID runs from 24 to 30; the file names carry dateID + 1 (01_25 ... 01_31)
for date_id in range(24, 31):
    road_file_name = f'Splitting/Segmented Trial Week/Segmented_Trips_01_{date_id + 1}.json'
    segment_file_name = f'Splitting/Segmented Trial Week/Merchants_Segments_01_{date_id + 1}.json'

    if not (os.path.exists(road_file_name) and os.path.exists(segment_file_name)):
        # Skip this day entirely. Falling through to the append below would
        # either raise NameError (first day) or silently re-append the frames
        # loaded for the previous day.
        # The per-day files are JSON Lines exports of the rows of road_df /
        # second_segment_df for that dateID; they have to be regenerated from
        # those source frames if they are missing.
        print(f"Files '{road_file_name}' and '{segment_file_name}' do not exist.")
        continue

    print(f"Loading segments from '{road_file_name}' and '{segment_file_name}'")

    # Road segments and merchant segments for this day
    road_df_filtered = _read_json_lines(road_file_name)
    second_segment_df_filtered = _read_json_lines(segment_file_name)

    road_df_list.append(road_df_filtered)
    second_segment_df_list.append(second_segment_df_filtered)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so report the likely cause (wrong working directory / missing exports) instead.
if not road_df_list:
    raise FileNotFoundError(
        "No segment files were found under 'Splitting/Segmented Trial Week' "
        f"(current working directory: {os.getcwd()})"
    )

# Concatenate all per-day dataframes
road_df = pd.concat(road_df_list, ignore_index=True)
second_segment_df = pd.concat(second_segment_df_list, ignore_index=True)

# Print columns for verification
# print(road_df.columns)
# print(second_segment_df.columns)


Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_25.json' and 'Splitting/Segmented Trial Week/Merchants_Segments_01_25.json'
Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_26.json' and 'Splitting/Segmented Trial Week/Merchants_Segments_01_26.json'
Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_27.json' and 'Splitting/Segmented Trial Week/Merchants_Segments_01_27.json'
Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_28.json' and 'Splitting/Segmented Trial Week/Merchants_Segments_01_28.json'
Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_29.json' and 'Splitting/Segmented Trial Week/Merchants_Segments_01_29.json'
Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_30.json' and 'Splitting/Segmented Trial Week/Merchants_Segments_01_30.json'
Loading segments from 'Splitting/Segmented Trial Week/Segmented_Trips_01_31.json' and 'Splitting/Segmented

# Remove the trips with distances greater than 20 km

In [7]:
# Count the trips whose dist value is 20 or more
int((road_df['dist'] >= 20).sum())

169

In [8]:
# Number of rows currently in road_df
road_df.shape[0]

64611

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of road_df's numeric columns
road_df.describe()

Unnamed: 0,dist,trip_time,weekID,timeID,dateID,time_offset,segmentID
count,64611.0,64611.0,64611.0,64611.0,64611.0,64611.0,64611.0
mean,4.245647,545.493941,2.995945,882.030065,26.88745,462.391729,2.081147
std,25.907593,437.792605,1.946319,280.35904,2.044701,537.200286,0.99671
min,0.0,0.0,0.0,3.0,24.0,0.0,1.0
25%,0.883395,208.5,1.0,686.0,25.0,0.0,1.0
50%,2.373664,432.0,3.0,904.2,27.0,344.0,3.0
75%,5.662286,780.0,5.0,1094.9,29.0,815.0,3.0
max,2872.321048,4980.0,6.0,1437.4,30.0,6442.0,3.0


In [10]:
# Keep only the entries whose dist is at most 20.
# The explicit .copy() makes road_df an independent frame, so the column
# assignments in later cells (road_df['...'] = ...) do not write to a slice of
# the unfiltered frame and trigger SettingWithCopyWarning.
road_df = road_df[road_df['dist'] <= 20].copy()

In [11]:
# Count the remaining trips whose dist is strictly below 20
int((road_df['dist'] < 20).sum())

64442

# Generate the two-hour segment

In [12]:
# Twelve consecutive (start, end) windows of 120 minutes each, covering 0-1440
two_hour_segments = [(start, start + 120) for start in range(0, 1440, 120)]
# One "HH:MM - HH:MM" label per window, in the same order as the windows
two_hour_labels = [f"{2 * i:02d}:00 - {2 * i + 2:02d}:00" for i in range(12)]


def get_two_hour_segment(timeID):
    """Return the label of the 2-hour window that contains ``timeID``.

    ``timeID`` is tested against the half-open ranges ``start <= timeID < end``
    in ``two_hour_segments``. ``None`` is returned when no range contains it
    (i.e. the value lies outside 0-1440).
    """
    return next(
        (
            label
            for (start, end), label in zip(two_hour_segments, two_hour_labels)
            if start <= timeID < end
        ),
        None,
    )

# Label every row with the 2-hour window its timeID falls into
road_df['two_hour_segment'] = road_df['timeID'].map(get_two_hour_segment)

# Number of rows (orders) per 2-hour label, as a two-column frame
orders_per_two_hour_segment = (
    road_df
    .groupby('two_hour_segment')
    .size()
    .rename('order_count')
    .reset_index()
)


In [13]:
# Show the current working directory; the relative paths used in this notebook resolve against it
os.getcwd()

'd:\\Senior Project - Main Workspace\\eta-rfr'

In [14]:
# Change directory to "Clustering_Experiments" so the relative output paths
# used afterwards are written there.
# Guarded so the cell is safe to re-run: once the kernel is already inside the
# folder, a second os.chdir("Clustering_Experiments") would raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != "Clustering_Experiments":
    os.chdir("Clustering_Experiments")

In [15]:
# Twenty-four consecutive (start, end) windows of 60 minutes each, covering 0-1440
hour_segments = [(start, start + 60) for start in range(0, 1440, 60)]
# One "HH:MM - HH:MM" label per window, in the same order as the windows
hour_labels = [f"{i:02d}:00 - {i + 1:02d}:00" for i in range(24)]


def get_hour_segment(timeID):
    """Return the label of the 1-hour window that contains ``timeID``.

    ``timeID`` is tested against the half-open ranges ``start <= timeID < end``
    in ``hour_segments``. ``None`` is returned when no range contains it
    (i.e. the value lies outside 0-1440).
    """
    return next(
        (
            label
            for (start, end), label in zip(hour_segments, hour_labels)
            if start <= timeID < end
        ),
        None,
    )

# Label every row with the hourly window its timeID falls into
road_df['hour'] = road_df['timeID'].map(get_hour_segment)

# Number of rows (orders) per hour label, as a two-column frame
orders_per_hour = (
    road_df
    .groupby('hour')
    .size()
    .rename('order_count')
    .reset_index()
)

# Save the hourly counts as CSV (path is relative to the current working directory)
output_file_path = 'orders_per_hour_segment_trial_week.csv'
orders_per_hour.to_csv(output_file_path, index=False)

print(f"Results have been saved to {output_file_path}")


Results have been saved to orders_per_hour_segment_trial_week.csv


# Clustering

## DBSCAN

### Determine the best values of the hyperparameters

In [16]:
from sklearn.neighbors import NearestNeighbors

In [17]:
import sklearn

In [18]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Flatten the per-trip coordinate lists into one array with a (lat, lng) row per point
all_lats = [lat for trip_lats in road_df['lats'] for lat in trip_lats]
all_lngs = [lng for trip_lngs in road_df['lngs'] for lng in trip_lngs]
coords = np.array([pair for pair in zip(all_lats, all_lngs)])

# Fit a nearest-neighbour model on the points and query it with the same points.
# k is chosen equal to the min_samples value used for DBSCAN.
k = 4
nearest_neighbors = NearestNeighbors(n_neighbors=k)
neighbors = nearest_neighbors.fit(coords)
distances, indices = neighbors.kneighbors(coords)

# Keep the last of the k returned distance columns for every point and sort it
# ascending; the resulting curve is the k-distance graph.
kth_column = distances[:, k - 1]
distances = np.sort(kth_column, axis=0)

fig, ax = plt.subplots()
ax.plot(distances)
ax.set_ylabel('k-NN Distance')
ax.set_xlabel('Points sorted by distance to k-th nearest neighbor')
ax.set_title('k-distance Graph for DBSCAN')
plt.show()


ImportError: 

IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!

Importing the numpy C-extensions failed. This error can happen for
many reasons, often due to issues with your setup or how NumPy was
installed.

We have compiled some common reasons and troubleshooting tips at:

    https://numpy.org/devdocs/user/troubleshooting-importerror.html

Please note and check the following:

  * The Python version is: Python3.11 from "C:\Users\nadab\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe"
  * The NumPy version is: "1.24.3"

and make sure that they are the versions you expect.
Please carefully study the documentation linked above for further help.

Original error was: DLL load failed while importing _multiarray_umath: The specified module could not be found.


ImportError: numpy._core.multiarray failed to import

### Actual Clustering Implementation

In [None]:
import folium
import numpy as np
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint, Polygon


# Step 1: Flatten the per-trip coordinate lists into one array with a (lat, lng) row per point
all_lats = [lat for sublist in road_df['lats'] for lat in sublist]
all_lngs = [lng for sublist in road_df['lngs'] for lng in sublist]
coords = np.array(list(zip(all_lats, all_lngs)))

# Step 2: Perform DBSCAN clustering
# eps is expressed in the same units as the raw values in coords
# (presumably decimal degrees -- TODO confirm); adjust `eps` and `min_samples`
# based on your data.
dbscan = DBSCAN(eps=0.01, min_samples=4)
cluster_labels = dbscan.fit_predict(coords)

# Map every (lat, lng) pair to the cluster label DBSCAN assigned to it
coord_to_cluster = dict(zip(map(tuple, coords), cluster_labels))


# Step 3: Assign cluster IDs to road_df for the start and end points
def get_cluster_id(lat, lon):
    """Return the cluster label stored for the point ``(lat, lon)``.

    The lookup is an exact match on the coordinate pair in
    ``coord_to_cluster``; ``None`` is returned when the pair is not present.
    """
    return coord_to_cluster.get((lat, lon), None)


# Walk the two coordinate columns once instead of running road_df.iterrows()
# twice: iterrows builds a Series per row, which is slow on a frame this size.
trip_endpoint_clusters = [
    (get_cluster_id(lats[0], lngs[0]), get_cluster_id(lats[-1], lngs[-1]))
    for lats, lngs in zip(road_df['lats'], road_df['lngs'])
]
road_df['start_cluster'] = [start for start, _ in trip_endpoint_clusters]
road_df['end_cluster'] = [end for _, end in trip_endpoint_clusters]


### Visualizing the clusters

## OPTICS