# Detect obstacles in roads from accelerometer data

In [1]:
#Import Libraries
import pandas as pd
import numpy as np 
import folium
import geopy
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

Workflow:
1. Data Wrangling & Cleaning
2. Map Creation
3. Feature Generation
4. Clustering

## Data Wrangling & Cleaning

In [2]:
df = pd.read_csv("Data/acceleration_data_in_bilbao_SP.csv", parse_dates=['time'], infer_datetime_format=True)

In [3]:
df.shape

(262443, 13)

In [4]:
df = df.sort_values("time")

In [5]:
df["Moving_Average_Speed"] = df.speed.rolling(50).mean()

In [6]:
df["Average_Acc"] = (df['acc_x']+df['acc_y']+df['acc_z'])/3

## Map Creation

In [7]:
#Create coordinations and speed lists for mapping
coords = df[["latitude","longitude"]].values.tolist()
speed = df["Moving_Average_Speed"].values.tolist()

In [8]:
coords_map = coords[0::250]
speed_map = speed[0::250]

In [9]:
# Create starting point for map initialization
starting_point = coords[0]
end_point = coords[-1]

In [10]:
# Create the map in Folium showing intervals of location and average speed

map_object = folium.Map(location=starting_point, zoom_start=12)

# Add markers for each set of coordinates
for coord in coords_map:
    folium.Marker(location=coord).add_to(map_object)

# Create a line connecting the markers
for i in range(len(coords_map) - 1):
    folium.PolyLine(coords_map, color='red', weight=speed_map[i]/20, opacity=1).add_to(map_object)
    
# Display the map
map_object.save('map.html')

## Feature Generation

In [11]:
# Create feature to store the change in heading as the vehicle travels
df["Change_Heading"] = (df['heading'] - df['heading'].shift(50)) / df['heading'].shift(50) * 100

# Fill the NaN values in the 'Change_Heading' column with 0
df['Change_Heading'].fillna(0, inplace=True)

#Cap the change to prevent anomalies in clustering analysis
df['Change_Heading'] = np.where(df['Change_Heading'] > 500, 500, df['Change_Heading'])

df['Change_Heading'] = abs(df['Change_Heading'])


In [12]:
# Create feature to store the change in speed as the vehicle travels
df["Change_Speed"] = (df['speed'] - df['speed'].shift(100)) / df['speed'].shift(100) * 100

# Fill the NaN values in the 'Change_Speed' column with 0
df['Change_Speed'].fillna(0, inplace=True)

#Cap the change to prevent anomalies in clustering analysis
df['Change_Speed'] = np.where(df['Change_Speed'] > 1000, 1000, df['Change_Heading'])

In [13]:
# Create feature to store the change in acceleration as the vehicle travels
df["Change_Acc"] = (df['Average_Acc'] - df['Average_Acc'].shift(100)) / df['Average_Acc'].shift(100) * 100

# Fill the NaN values in the 'Change_Acc' column with 0
df['Change_Acc'].fillna(0, inplace=True)


In [14]:
# Create feature to show how close the vehicle is to cross the speed limit
df['Percent_Speed_Limit'] = df['speed']/df['road_speed_limit']

## Clustering

In [15]:
# Create a dataframe with the required values
df_clustering = df[['time','speed', 'heading', 'road_speed_limit', 'acc_x', 'acc_y', 'acc_z', 'road_type', 'Change_Heading', 'Change_Speed', 'Change_Acc', 'Percent_Speed_Limit']]
df_clustering = df_clustering.set_index('time')

In [16]:
df_clustering.road_type.unique()

array(['LOCAL_ROAD_OF_MAJOR_IMPORTANCE', 'CONNECTING_ROAD', 'MAJOR_ROAD',
       'SECONDARY_ROAD', 'MOTORWAY', 'DESTINATION_ROAD',
       'MAJOR_ROAD_OF_HIGH_IMPORTANCE'], dtype=object)

In [17]:
# Map road types as algorithm used only accepts numeric values
map_dict = {'LOCAL_ROAD_OF_MAJOR_IMPORTANCE': 0, 'CONNECTING_ROAD': 1, 'MAJOR_ROAD': 2, 'SECONDARY_ROAD': 3,
           'MOTORWAY':4, 'DESTINATION_ROAD':5, 'MAJOR_ROAD_OF_HIGH_IMPORTANCE':6}

# Apply the map method to column 'road_type'
df_clustering['road_type'] = df_clustering['road_type'].map(map_dict)

In [18]:
# Resample the data into one second intervals to make analysis simpler
df_clustering = df_clustering.resample('1S').mean()

In [19]:
# Get rid of anomalies
df_clustering = df_clustering.replace([np.inf, -np.inf], np.nan)
df_clustering = df_clustering.dropna(axis=0)

In [20]:
df_clustering['road_type'] = round(df_clustering['road_type'],0)

In [21]:
df_clustering.head()

Unnamed: 0_level_0,speed,heading,road_speed_limit,acc_x,acc_y,acc_z,road_type,Change_Heading,Change_Speed,Change_Acc,Percent_Speed_Limit
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-09-29 03:46:06,0.0,318.0,40.0,-0.029667,-0.028833,1.010167,0.0,0.0,0.0,0.0,0.0
2022-09-29 03:46:07,0.2,318.0,40.0,-0.0296,-0.0283,1.0097,0.0,0.0,0.0,0.0,0.005
2022-09-29 03:46:08,0.0,318.0,40.0,-0.0306,-0.0279,1.0089,0.0,0.0,0.0,0.0,0.0
2022-09-29 03:46:09,0.0,318.0,40.0,-0.0305,-0.0271,1.0089,0.0,0.0,0.0,0.0,0.0
2022-09-29 03:46:10,0.0,318.0,40.0,-0.0318,-0.0274,1.0109,0.0,0.0,0.0,0.0,0.0


In [22]:
# Scaling the data as K means requires it
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_clustering)

In [23]:
# Fit KMeans model to data
kmeans = KMeans(n_clusters=3)
kmeans.fit(scaled_data)

# Predict the cluster for each data point
labels = kmeans.predict(scaled_data)

In [24]:
# Assign the labels to rows
df_clustering['Cluster'] = labels

## Clustering Result Analysis

In [25]:
# Distribution of samples between clustering
for cluster in [0,1,2]:
    print(f'Cluster {cluster} Sample Size ', len(df_clustering[df_clustering.Cluster==cluster]))

Cluster 0 Sample Size  22616
Cluster 1 Sample Size  586
Cluster 2 Sample Size  2923


In [30]:
# Average acceleration for each cluster
for cluster in [0,1,2]:
    print(f'Cluster {cluster} acc_x value ', df_clustering[df_clustering.Cluster==cluster]['acc_x'].mean())
    print(f'Cluster {cluster} acc_y value ', df_clustering[df_clustering.Cluster==cluster]['acc_y'].mean())
    print(f'Cluster {cluster} acc_z value ', df_clustering[df_clustering.Cluster==cluster]['acc_z'].mean())
    print('-'*50)

Cluster 0 acc_x value  -0.005597126539344595
Cluster 0 acc_y value  -0.0024376777909774425
Cluster 0 acc_z value  1.012339235695684
--------------------------------------------------
Cluster 1 acc_x value  -0.004052311113232607
Cluster 1 acc_y value  -0.000652290826352259
Cluster 1 acc_z value  1.012165244795023
--------------------------------------------------
Cluster 2 acc_x value  -0.005894265439202139
Cluster 2 acc_y value  0.001226734116607532
Cluster 2 acc_z value  1.0110554799687748
--------------------------------------------------


In [26]:
# Average speed, change in heading and change in acceleration for each cluster
for var in ['speed', 'Change_Heading', 'Change_Acc']:
    print('-'*50)
    print(f'Cluster 0 {var}', df_clustering[df_clustering.Cluster==0][var].mean())
    print(f'Cluster 1 {var}', df_clustering[df_clustering.Cluster==1][var].mean())
    print(f'Cluster 2 {var}', df_clustering[df_clustering.Cluster==2][var].mean())
    print('-'*50)

--------------------------------------------------
Cluster 0 speed 86.97958524938063
Cluster 1 speed 64.11860068259388
Cluster 2 speed 37.80283954840907
--------------------------------------------------
--------------------------------------------------
Cluster 0 Change_Heading 8.747487663885888
Cluster 1 Change_Heading 370.9365374452143
Cluster 2 Change_Heading 16.331549082081853
--------------------------------------------------
--------------------------------------------------
Cluster 0 Change_Acc 0.3228240039083801
Cluster 1 Change_Acc 1.0408788863851386
Cluster 2 Change_Acc 2.532738818808002
--------------------------------------------------


In [27]:
# Reverse mapping to make results readable
map_dict_reversed = {v: k for k, v in map_dict.items()}
df_clustering['road_type'] = df_clustering['road_type'].map(map_dict_reversed)

In [28]:
# Road type distribution for each cluster
for cluster in [0,1,2]:
    print(f'Cluster {cluster} Road Types \n', df_clustering[df_clustering.Cluster==cluster]['road_type'].value_counts(normalize=True))
    print('-'*50)

Cluster 0 Road Types 
 MOTORWAY                         0.802175
MAJOR_ROAD_OF_HIGH_IMPORTANCE    0.186019
SECONDARY_ROAD                   0.010081
DESTINATION_ROAD                 0.001282
CONNECTING_ROAD                  0.000442
Name: road_type, dtype: float64
--------------------------------------------------
Cluster 1 Road Types 
 MOTORWAY                          0.631399
SECONDARY_ROAD                    0.160410
MAJOR_ROAD                        0.080205
MAJOR_ROAD_OF_HIGH_IMPORTANCE     0.066553
LOCAL_ROAD_OF_MAJOR_IMPORTANCE    0.059727
CONNECTING_ROAD                   0.001706
Name: road_type, dtype: float64
--------------------------------------------------
Cluster 2 Road Types 
 SECONDARY_ROAD                    0.523435
MAJOR_ROAD                        0.153951
MOTORWAY                          0.126924
LOCAL_ROAD_OF_MAJOR_IMPORTANCE    0.097160
MAJOR_ROAD_OF_HIGH_IMPORTANCE     0.060212
CONNECTING_ROAD                   0.034896
DESTINATION_ROAD                  0.003