In [None]:
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.dates import DateFormatter
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

def plot_with_background(ax, alpha = .5, image_path='../data-raw/background/image3195.png', image_extent=(0, 51, -0.02, 14.214)):
	"""
	Draws a background image underneath whatever is plotted on `ax`.

	Parameters:
	- ax: The matplotlib axis object to draw on.
	- alpha: Opacity of the background image (0 = invisible, 1 = opaque).
	- image_path: Path of the image file to load.
	- image_extent: (x_min, x_max, y_min, y_max) in data coordinates used to scale and
	  position the image, or None to let matplotlib place it in pixel coordinates.

	Returns:
	- The same axis object that was passed in.
	"""
	# Load the background image
	img = mpimg.imread(image_path)

	# zorder=-1 keeps the image behind every other artist on the axis
	imshow_kwargs = {'aspect': 'auto', 'zorder': -1, 'alpha': alpha}
	if image_extent is not None:
		imshow_kwargs['extent'] = image_extent
	ax.imshow(img, **imshow_kwargs)

	return ax



# Load the JSON data
with open('../data-raw/background/config.json') as f:
	data = json.load(f)

# Navigate to the geometries of the bern-multi01 multisensor
geometries = data['multisensors']['bern-multi01']['geometries']

# Filter geometries for zones containing "entry" in their name (case-insensitive)
geometries = [g for g in geometries if g['type'] == 'ZONE' and 'entry' in g['name'].lower()]

# Shorten zone names by stripping the leading "act_entry_" prefix
ENTRY_PREFIX = 'act_entry_'
for geometry in geometries:
	if geometry['name'].startswith(ENTRY_PREFIX):
		geometry['name'] = geometry['name'][len(ENTRY_PREFIX):]

# Add additional seating areas (rectangles given by their four corners)
seating_area_coords = [[11.31, 2.6], [47.5, 2.6], [47.5, 6.5], [11.31, 6.5]]
seating_area_geometry = {
	'geometry': seating_area_coords,
	'type': 'ZONE',
	'name': 'seating area 1'
}
geometries.append(seating_area_geometry)

seating_area_2_coords = [[13.5, 0], [47.5, 0], [47.5, 1.75], [13.5, 1.75]]
seating_area_2_geometry = {
	'geometry': seating_area_2_coords,
	'type': 'ZONE',
	'name': 'seating area 2'
}
geometries.append(seating_area_2_geometry)

seating_area_3_coords = [[8-1.8-0.7, 8.8], [8-1.8, 8.8], [8-1.8, 8.8+4], [8-1.8-0.7, 8.8+4]]
seating_area_3_geometry = {
	'geometry': seating_area_3_coords,
	'type': 'ZONE',
	'name': 'seating area 3'
}
geometries.append(seating_area_3_geometry)

seating_area_4_coords = [[8.2-2.4, 6.3], [8.2, 6.3], [8.2, 6.3+.5], [8.2-2.4, 6.3+.5]]
seating_area_4_geometry = {
	'geometry': seating_area_4_coords,
	'type': 'ZONE',
	'name': 'seating area 4'
}
geometries.append(seating_area_4_geometry)

# Add presumptive TB area
# Coordinates get their own names so check_area / check_tb_area are only ever
# bound to the zone dictionaries, never to a bare coordinate list.
check_area_coords = [[8, .8], [11.3, .8], [11.3, 6.2], [8, 6.2]]
check_area = {
	'geometry': check_area_coords,
	'type': 'ZONE',
	'name': 'Check area'
}
geometries.append(check_area)
check_tb_area_coords = [[9.1, 2.6], [11.3, 2.6], [11.3, 3.7], [9.1, 3.7]]
check_tb_area = {
	'geometry': check_tb_area_coords,
	'type': 'ZONE',
	'name': 'TB area'
}
geometries.append(check_tb_area)



def plot_with_background_geom(ax, geometries):
	"""
	Draws the background image on `ax` and overlays every geometry as a
	semi-transparent, labelled patch.

	Parameters:
	- ax: The matplotlib axes object where the plot will be drawn.
	- geometries: A list of geometry dictionaries, each containing 'geometry'
	  (list of [x, y] vertices), 'type', and 'name' keys.

	Returns:
	- The axes object with the background, patches, annotations and legend added.
	"""
	# Background first, fully opaque
	ax = plot_with_background(ax, 1)

	# One viridis colour per geometry
	palette = plt.cm.viridis(np.linspace(0, 1, len(geometries)))

	for shade, shape in zip(palette, geometries):
		vertices = shape['geometry']

		# Only zones are closed; any other type (e.g. 'LINE') stays open
		is_zone = shape['type'] == 'ZONE'

		patch = Polygon(xy=vertices, closed=is_zone, color=shade, label=shape['name'], alpha=0.5)
		ax.add_patch(patch)

		# The name is written at the first vertex
		ax.annotate(shape['name'], xy=vertices[0], color='white', weight='bold')

	# Legend goes below the plot
	ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), shadow=True, ncol=2)

	return ax

# Figure 1: the background image on its own, fully opaque
fig, ax = plt.subplots(figsize=(16, 12))
ax = plot_with_background(ax, 1)
plt.show()

# Figure 2: the background image with all zones from `geometries` drawn on top
fig, ax = plt.subplots(figsize=(16, 12))
ax = plot_with_background_geom(ax, geometries)
plt.show()

## Data

### Test data 

In [None]:
# Unlinked (raw) tracks; 'time' is converted from epoch milliseconds to a UTC timestamp
test_unlinked = pd.read_csv('../data-clean/tracking/unlinked/2024-06-26.csv')
test_unlinked['timestamp'] = pd.to_datetime(test_unlinked['time'], unit='ms', origin='unix', utc=True)

# Linked tracks: attach the raw -> linked ID mapping and use the linked ID
# wherever one exists, otherwise keep the raw ID
mapping = pd.read_csv('../data-clean/tracking/linked/2024-06-26.csv')
test_linked = (
	test_unlinked
	.merge(mapping, left_on='track_id', right_on='raw_track_id', how='left')
	.assign(track_id=lambda merged: merged['track_id_y'].combine_first(merged['track_id_x']))
	.drop(columns=['track_id_x', 'track_id_y'])
)

# Summary of how many track IDs were merged away by linking
n_unlink = test_unlinked['track_id'].nunique()
n_link = test_linked['track_id'].nunique()
prop_links = (n_unlink - n_link) / n_unlink * 100
print(f'Number of unlinked tracks: {n_unlink}')
print(f'Number of linked tracks: {n_link}')
print(f'Proportion of tracks linked: {prop_links:.2f}%')

### Overall summary

In [None]:
# compute proportion of complete and lost tracks
def categorize_track(group):
	"""
	Labels one track from the flags on its first and last row.

	Parameters:
	- group: The rows of a single track, with 'in_tb_cs' and 'near_entry' columns.

	Returns:
	- "TB area staff" if 'in_tb_cs' is truthy on the first or last row; otherwise
	  "Complete", "Lost start", "Lost end" or "Lost both" depending on whether
	  'near_entry' is truthy on the first and on the last row.
	"""
	start_row = group.iloc[0]
	end_row = group.iloc[-1]

	# The 'in_tb_cs' flag takes precedence over the entry-based categories
	if start_row['in_tb_cs'] or end_row['in_tb_cs']:
		return "TB area staff"

	# Keyed by (first row near entry, last row near entry)
	labels = {
		(True, True): "Complete",
		(True, False): "Lost end",
		(False, True): "Lost start",
		(False, False): "Lost both",
	}
	return labels[(bool(start_row['near_entry']), bool(end_row['near_entry']))]
		
def count_category_tracks(df):
	"""
	Counts how many tracks fall into each category returned by categorize_track.

	Parameters:
	- df: A pandas DataFrame with a 'track_id' column plus the columns
	  categorize_track reads.

	Returns:
	- A pandas DataFrame indexed by 'Label' with columns 'Number of Tracks'
	  and 'Proportion' (percentage of all tracks).
	"""
	# One category label per track
	per_track = df.groupby('track_id', group_keys=False).apply(categorize_track, include_groups=False)
	labelled = per_track.reset_index(name='Category')

	# Tally the labels and express each tally as a percentage of all tracks
	tally = labelled['Category'].value_counts().reset_index()
	tally.columns = ['Label', 'Number of Tracks']
	tally['Proportion'] = (tally['Number of Tracks'] / len(labelled)) * 100
	return tally.set_index('Label')

In [None]:
unlinked_dir = "../data-clean/tracking/unlinked/"
linked_dir = "../data-clean/tracking/linked/"

# Sorted so the files are processed in a deterministic order
date_csv = sorted(f for f in os.listdir(unlinked_dir) if f.endswith('.csv'))

# Column order and dtypes of the per-day summary table
result_dtypes = {
    'date': 'str',
    'no_tracks_raw': 'int',
    'no_tracks_matched': 'int',
    'prop_links': 'float64',
    'mean_no_links': 'float64',
    'max_no_links': 'int',
    'complete_raw': 'float64',
    'complete_matched': 'float64'
}

# Collect one record per file and build the DataFrame once at the end,
# instead of concatenating onto an empty frame inside the loop.
daily_records = []
for f in date_csv:
  # load data
  print(f)
  date = f.replace('.csv', '')
  unlinked_data = pd.read_csv(os.path.join(unlinked_dir, f))
  mapping_data = pd.read_csv(os.path.join(linked_dir, f))
  linked_data = unlinked_data.merge(mapping_data, left_on='track_id', right_on='raw_track_id', how='left')
  linked_data['track_id'] = linked_data['track_id_y'].combine_first(linked_data['track_id_x'])
  linked_data = linked_data.drop(columns=['track_id_x', 'track_id_y'])

  # number of links
  n_ul = unlinked_data['track_id'].nunique()
  if n_ul == 0:
    # Nothing to summarise; every ratio below would divide by zero
    print(f'  no tracks in {f}, skipped')
    continue
  n_l = linked_data['track_id'].nunique()
  p_l = (n_ul - n_l) / n_ul

  # mean and max number of links per track_id
  n_link_per_track = linked_data.groupby('track_id', as_index=False)['raw_track_id'].nunique().rename(columns={'raw_track_id': 'links'})
  mean_n_l = n_link_per_track['links'].mean()
  max_n_l = n_link_per_track['links'].max()

  # data quality
  unlinked_qual = count_category_tracks(unlinked_data)
  linked_qual = count_category_tracks(linked_data)

  # A file may contain no "Complete" track at all; treat that as 0 %
  daily_records.append({
      'date': date,
      'no_tracks_raw': n_ul,
      'no_tracks_matched': n_l,
      'prop_links': p_l,
      'mean_no_links': mean_n_l,
      'max_no_links': max_n_l,
      'complete_raw': unlinked_qual['Proportion'].get('Complete', 0.0) / 100,
      'complete_matched': linked_qual['Proportion'].get('Complete', 0.0) / 100
  })

results_df = pd.DataFrame(daily_records, columns=list(result_dtypes)).astype(result_dtypes)

results_df.sort_values(by='date')

In [None]:
# Requires results_df from the previous cell.
# One boxplot per summary column, side by side: (column in results_df, panel title)
panels = [
    ('no_tracks_matched', 'No. track IDs'),
    ('prop_links', 'Prop. of links made'),
    ('complete_matched', 'Prop. of complete tracks'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel_ax, (column, panel_title) in zip(axes, panels):
    sns.boxplot(ax=panel_ax, data=results_df, y=column)
    panel_ax.set_title(panel_title)

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

## Checks

### Distance between unlinked tracks

## Duration distribution

In [None]:
def compute_track_duration(df):
	"""
	Computes how long each track lasts.

	Parameters:
	- df: A pandas DataFrame with 'track_id' and 'time' columns; 'time' is
	  treated as milliseconds.

	Returns:
	- A pandas DataFrame with one row per track: 'track_id' and 'duration',
	  the span between the latest and earliest 'time', in minutes.
	"""
	times_by_track = df.groupby('track_id')['time']

	# Span between the latest and the earliest observation of each track
	span_ms = times_by_track.max() - times_by_track.min()
	duration_df = span_ms.reset_index(name='duration')

	# Milliseconds -> seconds -> minutes
	duration_df['duration'] = duration_df['duration'] / 1000 / 60

	return duration_df

def plot_track_duration_histogram(duration_list):
	"""
	Plots one log-scaled histogram of track durations per dataset, on a grid.

	Parameters:
	- duration_list: A non-empty list of pandas DataFrames with a 'duration' column.

	Returns:
	- axs: The flattened array of all axes in the grid. Only the first
	  len(duration_list) hold a histogram; surplus cells are removed from the figure.

	Raises:
	- ValueError: If duration_list is empty.
	"""
	n = len(duration_list)
	if n == 0:
		# The grid computation below would otherwise fail with a ZeroDivisionError
		raise ValueError("duration_list must contain at least one DataFrame")

	# Grid size: start from floor(sqrt(n)) rows, add one row when n is not a
	# multiple of that, then use as many columns as needed to fit n panels
	nrows = int(n**0.5) + (1 if n % int(n**0.5) > 0 else 0)
	ncols = -(-n // nrows)  # ceil(n / nrows)

	# squeeze=False guarantees a 2-D array even for a single panel
	fig, axs = plt.subplots(nrows, ncols, figsize=(5*ncols, 4*nrows), squeeze=False)
	axs = axs.flatten()

	# Plot each histogram
	for i, duration_df in enumerate(duration_list):
		axs[i].hist(duration_df['duration'], bins=20, color='skyblue', edgecolor='black', log=True)
		axs[i].set_title(f'Histogram of Track Durations {i+1}')
		axs[i].set_xlabel('Duration (minutes)')
		axs[i].set_ylabel('Frequency')

	# Hide any unused subplots
	for unused_ax in axs[n:]:
		fig.delaxes(unused_ax)

	# Adjust layout to prevent overlap
	plt.tight_layout()

	return axs

# Duration (in minutes) of every track, after and before linking
test_linked_duration = compute_track_duration(test_linked)
test_unlinked_duration = compute_track_duration(test_unlinked)
# Panel 1 shows the unlinked data, panel 2 the linked data
ax = plot_track_duration_histogram([test_unlinked_duration, test_linked_duration])
plt.show()

## Distance distribution

In [None]:
def compute_track_distance(df):
	"""
	Computes the total path length of each track as the sum of Euclidean
	distances between consecutive rows of that track.

	The input DataFrame is left untouched: no helper columns are added and no
	rows are dropped.

	Parameters:
	- df: A pandas DataFrame with 'track_id', 'position_x', and 'position_y'
	  columns. Rows are taken in their existing order within each track.

	Returns:
	- A pandas DataFrame with 'track_id' and 'total_distance' for each track.
	  Tracks with a single row are included with a total distance of 0.
	"""
	# Per-track step in x and y; the first row of every track has no predecessor
	# and yields NaN. Work on plain arrays in a fresh frame so `df` is never modified.
	steps = pd.DataFrame({
		'track_id': df['track_id'].to_numpy(),
		'dx': df.groupby('track_id')['position_x'].diff().to_numpy(),
		'dy': df.groupby('track_id')['position_y'].diff().to_numpy(),
	})

	# Euclidean length of each step
	steps['distance'] = np.sqrt(steps['dx']**2 + steps['dy']**2)

	# sum() skips the NaN of each track's first row, so a one-row track sums to 0
	total_distance_df = steps.groupby('track_id')['distance'].sum().reset_index(name='total_distance')

	return total_distance_df

def plot_track_distance_histogram(distance_list):
	"""
	Plots one log-scaled histogram of total track distances per dataset, on a grid.

	Parameters:
	- distance_list: A non-empty list of pandas DataFrames with a 'total_distance' column.

	Returns:
	- axs: The flattened array of all axes in the grid. Only the first
	  len(distance_list) hold a histogram; surplus cells are removed from the figure.

	Raises:
	- ValueError: If distance_list is empty.
	"""
	n = len(distance_list)
	if n == 0:
		# The grid computation below would otherwise fail with a ZeroDivisionError
		raise ValueError("distance_list must contain at least one DataFrame")

	# Grid size: start from floor(sqrt(n)) rows, add one row when n is not a
	# multiple of that, then use as many columns as needed to fit n panels
	nrows = int(n**0.5) + (1 if n % int(n**0.5) > 0 else 0)
	ncols = -(-n // nrows)  # ceil(n / nrows)

	# squeeze=False guarantees a 2-D array even for a single panel
	fig, axs = plt.subplots(nrows, ncols, figsize=(5*ncols, 4*nrows), squeeze=False)
	axs = axs.flatten()

	# Plot each histogram
	for i, distance_df in enumerate(distance_list):
		axs[i].hist(distance_df['total_distance'], bins=20, color='skyblue', edgecolor='black', log=True)
		axs[i].set_title(f'Histogram of Track Total Distances {i+1} (Euclidean)')
		axs[i].set_xlabel('Total Distance (m)')
		axs[i].set_ylabel('Frequency')

	# Hide any unused subplots
	for unused_ax in axs[n:]:
		fig.delaxes(unused_ax)

	# Adjust layout to prevent overlap
	plt.tight_layout()

	return axs

# Total distance of every track, after and before linking
test_linked_distance = compute_track_distance(test_linked)
test_unlinked_distance = compute_track_distance(test_unlinked)
# Panel 1 shows the unlinked data, panel 2 the linked data
ax = plot_track_distance_histogram([test_unlinked_distance, test_linked_distance])
plt.show()

## Number of made links

In [None]:
# Number of distinct raw track IDs behind each linked track ID
links_per_track = test_linked.groupby('track_id')['raw_track_id'].nunique()
test_links = links_per_track.reset_index(name='links')

# Integer bin edges, one unit-wide bin per observed link count
min_links, max_links = int(test_links['links'].min()), int(test_links['links'].max())
bins = np.arange(min_links, max_links + 2)

# Histogram with a logarithmic frequency axis
plt.figure(figsize=(10, 6))
test_links['links'].hist(bins=bins, alpha=0.75, log=True)
plt.title('Histogram of Links per Track ID')
plt.xlabel('Number of Links per Track ID')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()

## Track IDs over time

In [None]:
def plot_tracks_time(df_list):
	"""
	Counts the number of unique track IDs per second, averages these counts per
	minute for each dataset, and plots one line graph per dataset on a grid with
	a shared y-axis range.

	Parameters:
	- df_list: A non-empty list of pandas DataFrames with columns 'time'
	  (converted here from milliseconds since the Unix epoch) and 'track_id'.
	  The input frames are not modified.

	Raises:
	- ValueError: If df_list is empty.
	"""
	n = len(df_list)
	if n == 0:
		# The grid computation below would otherwise fail with a ZeroDivisionError
		raise ValueError("df_list must contain at least one DataFrame")

	# Grid size: start from floor(sqrt(n)) rows, add one row when n is not a
	# multiple of that, then use as many columns as needed to fit n panels
	nrows = int(n**0.5) + (1 if n % int(n**0.5) > 0 else 0)
	ncols = -(-n // nrows)  # ceil(n / nrows)

	# Resample every dataset exactly once and keep the result for plotting
	per_minute_counts = []
	max_count = 0
	for df in df_list:
		timestamps = pd.to_datetime(df['time'], unit='ms', origin='unix', utc=True)
		ids_by_time = df.assign(time=timestamps).set_index('time')['track_id']

		# Unique track IDs in each 1-second interval, then the mean of those
		# counts per minute ('s' / 'min' replace the deprecated 'S' / 'T' aliases)
		track_counts_per_second = ids_by_time.resample('s').nunique()
		track_counts_per_minute = track_counts_per_second.resample('min').mean()

		per_minute_counts.append(track_counts_per_minute)
		max_count = max(max_count, track_counts_per_minute.max())

	# squeeze=False guarantees a 2-D array even for a single panel
	fig, axs = plt.subplots(nrows, ncols, figsize=(5*ncols, 4*nrows), squeeze=False)
	axs = axs.flatten()

	for i, track_counts_per_minute in enumerate(per_minute_counts):
		axs[i].plot(track_counts_per_minute.index, track_counts_per_minute, color='blue')
		axs[i].set_title(f'Average Number of Track IDs per Minute {i+1}')
		axs[i].set_xlabel('Time')
		axs[i].set_ylabel('Average Number of Track IDs')
		axs[i].grid(True)
		axs[i].xaxis.set_major_formatter(DateFormatter('%H:%M'))

		# Same y-axis range on every panel so the datasets are comparable
		axs[i].set_ylim(0, max_count)

	# Hide any unused subplots
	for unused_ax in axs[n:]:
		fig.delaxes(unused_ax)

	# Adjust layout to prevent overlap
	plt.tight_layout()

	plt.show()

# Panel 1 shows the unlinked data, panel 2 the linked data
plot_tracks_time([test_unlinked, test_linked])

## Number of tracks per unit space

In [None]:
def plot_tracks_spatial(df, square_length=0.5):
	"""
	Counts the number of unique tracks per grid cell and plots the counts as a
	heatmap over the background image set by the plot_with_background function.

	A track that has several observations inside the same cell is counted once
	for that cell.

	Parameters:
	- df: A pandas DataFrame with columns 'track_id', 'position_x', and 'position_y'.
	- square_length: The target side length of each grid cell (in meters). The
	  extent is split into a whole number of equal cells, so the actual cell size
	  can be slightly smaller.

	Raises:
	- ValueError: If square_length is not positive.
	"""
	if square_length <= 0:
		raise ValueError("square_length must be positive")

	# Define the extent of the space
	x_min, x_max, y_min, y_max = 0, 51, -0.02, 14.214

	# Calculate the number of bins along each axis
	x_bins = int(np.ceil((x_max - x_min) / square_length))
	y_bins = int(np.ceil((y_max - y_min) / square_length))

	# Keep complete observations that fall inside the extent (both edges included)
	points = df[['track_id', 'position_x', 'position_y']].dropna()
	in_range = points['position_x'].between(x_min, x_max) & points['position_y'].between(y_min, y_max)
	points = points[in_range]

	# Cell index of every observation; points exactly on the upper edge belong
	# to the last cell, hence the clipping
	x_idx = np.floor((points['position_x'].to_numpy() - x_min) / (x_max - x_min) * x_bins).astype(int)
	y_idx = np.floor((points['position_y'].to_numpy() - y_min) / (y_max - y_min) * y_bins).astype(int)
	x_idx = np.minimum(x_idx, x_bins - 1)
	y_idx = np.minimum(y_idx, y_bins - 1)

	# One row per (track, cell) pair so each track counts once per cell
	visited_cells = pd.DataFrame({
		'track_id': points['track_id'].to_numpy(),
		'x_idx': x_idx,
		'y_idx': y_idx,
	}).drop_duplicates()

	heatmap = np.zeros((x_bins, y_bins))
	np.add.at(heatmap, (visited_cells['x_idx'].to_numpy(), visited_cells['y_idx'].to_numpy()), 1)

	# Mask the 0 values so empty cells leave the background visible
	masked_heatmap = np.ma.masked_where(heatmap == 0, heatmap)

	# Create a plot
	fig, ax = plt.subplots(figsize=(10,8))

	# Call plot_with_background to set the background image
	plot_with_background(ax, 1)

	# Overlay the heatmap; transposed because the first array axis is x
	im = ax.imshow(masked_heatmap.T, extent=[x_min, x_max, y_min, y_max], origin='lower', cmap='hot', alpha=0.5)

	# Manually specify the position and size of the colorbar
	cbar_ax = fig.add_axes([0.15, 0.2, 0.7, 0.02])  # Adjust these values as needed

	# Add a colorbar to the heatmap
	fig.colorbar(im, cax=cbar_ax, ax=ax, label='Number of Tracks', orientation='horizontal')

	ax.set_xlabel('Position X (m)')
	ax.set_ylabel('Position Y (m)')
	ax.set_title('Heatmap of Tracks per Unit Space')

	plt.show()


# Heatmap for the linked tracks, using the default cell size
plot_tracks_spatial(test_linked)

## Spatial distribution of tracking starts and ends

In [None]:
# Scatter the first and last point of every track (the background image is added by the caller)
def plot_first_last_tracks(df):
	"""
	Plots the first and last recorded point of each track_id in a scatter plot.

	Parameters:
	- df: A pandas DataFrame with at least 'track_id', 'time', 'position_x', and
	  'position_y' columns. Rows without a track_id are ignored.

	Returns:
	- ax: A matplotlib axis object containing the scatter plot.
	"""
	# Order the rows of every track by time
	df_sorted = df.dropna(subset=['track_id']).sort_values(by=['track_id', 'time'])

	# Take the actual first and last ROW of each track. groupby().first()/.last()
	# would instead return the first/last non-null value of each column
	# separately, which can combine values from different rows.
	first_tracks = df_sorted.drop_duplicates(subset='track_id', keep='first')
	last_tracks = df_sorted.drop_duplicates(subset='track_id', keep='last')

	# Create a scatter plot
	fig, ax = plt.subplots(figsize=(16, 12))

	# Plot the first track points in green
	ax.scatter(first_tracks['position_x'], first_tracks['position_y'], color='green', label='First Track', s = 1)

	# Plot the last track points in red
	ax.scatter(last_tracks['position_x'], last_tracks['position_y'], color='red', label='Last Track', s = 1)

	# Adding legend to distinguish first and last tracks
	ax.legend()

	# Labeling the axes
	ax.set_xlabel('Position X')
	ax.set_ylabel('Position Y')
	ax.set_title('First and Last Tracks of Each Track ID')

	# Return the axis object for further manipulation or saving
	return ax


# Start/end points of the unlinked tracks, then the background image and zones on the same axis
ax = plot_first_last_tracks(test_unlinked)
ax = plot_with_background_geom(ax, geometries)
plt.show()

In [None]:
# Start/end points of the linked tracks, then the background image and zones on the same axis
ax = plot_first_last_tracks(test_linked)
ax = plot_with_background_geom(ax, geometries)
plt.show()

## Data quality

In [None]:
def compute_track_proportions(dfs: list) -> pd.DataFrame:
	"""
	Tabulates the number and percentage of tracks per category (as assigned by
	count_category_tracks) for one or two datasets.

	Parameters:
	- dfs: A list of pandas DataFrames, where the first DataFrame is the unlinked
	  dataset and the second (optional) is the linked dataset.

	Returns:
	- A pandas DataFrame with a 'Label' column and an 'Unlinked' column formatted
	  as "count (percent%)". When a second dataset is given, 'Linked' and
	  'Difference' (linked minus unlinked) columns are added in the same format.
	"""
	def _count_and_percent(counts: pd.Series, percents: pd.Series) -> pd.Series:
		# Formats two numeric columns as "count (percent%)", both rounded to integers
		return counts.round(0).astype(int).astype(str) + " (" + percents.round(0).astype(int).astype(str) + "%)"

	# Process the first dataset (unlinked)
	results = count_category_tracks(dfs[0]).copy()
	results.columns = ['Number of Tracks (Unlinked)', 'Proportion (%) (Unlinked)']

	# Format the column for the unlinked dataset
	final_columns = ['Unlinked']

	# If a second dataset (linked) is provided, process it and compute differences
	if len(dfs) > 1:
		linked_counts = count_category_tracks(dfs[1])
		linked_counts.columns = ['Number of Tracks (Linked)', 'Proportion (%) (Linked)']

		# Outer join keeps categories present in only one dataset; their missing side becomes 0
		results = results.join(linked_counts, how='outer').fillna(0)

		# Compute differences
		results['Difference in Number'] = results['Number of Tracks (Linked)'] - results['Number of Tracks (Unlinked)']
		results['Difference in Proportion (%)'] = results['Proportion (%) (Linked)'] - results['Proportion (%) (Unlinked)']

		results['Linked'] = _count_and_percent(results['Number of Tracks (Linked)'], results['Proportion (%) (Linked)'])
		results['Difference'] = _count_and_percent(results['Difference in Number'], results['Difference in Proportion (%)'])
		final_columns = ['Unlinked', 'Linked', 'Difference']

	results['Unlinked'] = _count_and_percent(results['Number of Tracks (Unlinked)'], results['Proportion (%) (Unlinked)'])

	return results[final_columns].reset_index()

# Category table for the test day: unlinked vs. linked tracks
result = compute_track_proportions([test_unlinked, test_linked])
print(result)

In [None]:
def plot_lost_tracks(df):
    """
    Plots, per minute, how many track starts and ends were lost.

    A track contributes its first 'time' when categorize_track labels it
    "Lost start" or "Lost both", and its last 'time' when it is labelled
    "Lost end" or "Lost both". First/last refer to the row order of `df`.

    Parameters:
    - df: A pandas DataFrame with 'track_id', 'time' (milliseconds since the
      Unix epoch) and the columns categorize_track reads.
    """
    # Categorize tracks (same groupby call as count_category_tracks)
    category = df.groupby('track_id', group_keys=False).apply(categorize_track, include_groups=False).reset_index(name='Category')

    # Keep only the rows of tracks that were lost at either end
    lost_tracks = pd.merge(df[['track_id', 'time']], category, on='track_id')
    lost_tracks = lost_tracks[lost_tracks['Category'].isin(['Lost start', 'Lost end', 'Lost both'])]

    # First row of tracks with a lost start, last row of tracks with a lost end
    lost_times_start = lost_tracks.drop_duplicates(subset='track_id', keep='first')
    lost_times_start = lost_times_start[lost_times_start['Category'].isin(['Lost start', 'Lost both'])]
    lost_times_end = lost_tracks.drop_duplicates(subset='track_id', keep='last')
    lost_times_end = lost_times_end[lost_times_end['Category'].isin(['Lost end', 'Lost both'])]
    lost_times = pd.concat([lost_times_start, lost_times_end], axis=0)

    # Convert to timestamps and index by time
    lost_times = lost_times.assign(
        time=pd.to_datetime(lost_times['time'], unit='ms', origin='unix', utc=True)
    ).set_index('time')

    # Count lost starts/ends per minute ('min' replaces the deprecated 'T' alias)
    lost_counts = lost_times.resample('min').size()

    # Plot the counts
    plt.figure(figsize=(12, 6))
    lost_counts.plot()
    plt.title('Number of lost tracks over time')
    plt.xlabel('Time')
    plt.ylabel('Number of lost tracks')
    plt.show()

# Lost track starts/ends per minute in the unlinked test data
plot_lost_tracks(test_unlinked)

## Bad tracks

In [None]:
def compute_dxy(df: pd.DataFrame) -> pd.DataFrame:
	"""
	Adds the step length between consecutive rows of the same track.

	The columns 'delta_x', 'delta_y' and 'dxy' are written onto the DataFrame
	that is passed in; the first row of every track gets NaN in all three.

	Parameters:
	- df (pd.DataFrame): The input DataFrame with columns 'track_id', 'position_x', and 'position_y'.

	Returns:
	- pd.DataFrame: The same DataFrame object, now carrying 'delta_x', 'delta_y' and 'dxy'.
	"""
	# Row-to-row change of each coordinate, computed separately within each track
	for axis in ('x', 'y'):
		df[f'delta_{axis}'] = df.groupby('track_id')[f'position_{axis}'].diff()

	# Euclidean length of the step
	squared_step = df['delta_x']**2 + df['delta_y']**2
	df['dxy'] = np.sqrt(squared_step)

	return df

# Add the per-step distance column 'dxy' (plus 'delta_x' / 'delta_y') to the unlinked test data
test_unlinked = compute_dxy(test_unlinked)

def plot_bad_tracks(df, min_dxy=4):
	"""
	Draws up to five randomly chosen tracks that contain at least one step
	longer than min_dxy, on top of the background image.

	Steps longer than min_dxy are dashed, all other steps are solid. The first
	point of a track is an open circle, the last point a cross.

	Parameters:
	- df: A pandas DataFrame with 'track_id', 'position_x', 'position_y' and 'dxy' columns.
	- min_dxy: Step length above which a step is drawn dashed.
	"""
	# Tracks with at least one step above the threshold
	jumpy_ids = df.loc[df['dxy'] > min_dxy, 'track_id'].unique()

	# Random sample of at most five of them, without replacement
	n_samples = min(5, len(jumpy_ids))
	sampled_track_ids = np.random.choice(jumpy_ids, size=n_samples, replace=False)

	# Figure with the background image
	fig, ax = plt.subplots(figsize=(10, 6))
	ax = plot_with_background(ax, 1)

	# Ten colours, reused cyclically
	palette = plt.cm.tab10(np.linspace(0, 1, 10))

	for idx, track_id in enumerate(sampled_track_ids):
		track_data = df[df['track_id'] == track_id].reset_index(drop=True)
		color = palette[idx % len(palette)]

		xs = track_data['position_x']
		ys = track_data['position_y']
		is_jump = track_data['dxy'] > min_dxy

		# One two-point line per step; the style depends on the step ending at row `end`
		for end in range(1, len(track_data)):
			line_style = '--' if is_jump.iloc[end] else '-'
			ax.plot(xs.iloc[end-1:end+1], ys.iloc[end-1:end+1], line_style, color=color)

		# Mark the first point (open circle) and the last point (cross)
		start_point = track_data.iloc[0]
		end_point = track_data.iloc[-1]
		ax.plot(start_point['position_x'], start_point['position_y'], 'o', color=color, markerfacecolor='none')
		ax.plot(end_point['position_x'], end_point['position_y'], 'x', color=color)

	plt.show()

# Random sample of unlinked tracks with a step above the default threshold
plot_bad_tracks(test_unlinked)

def count_bad_tracks(df, thresholds=(3, 5, 7, 10)):
	"""
	Counts the number of bad tracks for different dxy thresholds and returns a table with counts and percentages.

	A track is bad for a threshold when at least one of its 'dxy' values is
	strictly greater than that threshold.

	Parameters:
	- df (pd.DataFrame): The input DataFrame with columns 'track_id' and 'dxy'.
	- thresholds (iterable): The thresholds to classify bad tracks.

	Returns:
	- pd.DataFrame: A DataFrame with columns 'Threshold', 'Count', and 'Percent' of bad tracks,
	  one row per threshold. The columns are present even when `thresholds` is empty.
	  'Percent' is 0.0 when `df` contains no tracks.
	"""
	total_tracks = df['track_id'].nunique()
	results = []

	for threshold in thresholds:
		# Count track_ids with any dxy greater than the current threshold
		count = df.loc[df['dxy'] > threshold, 'track_id'].nunique()
		# Guard against dividing by zero when there are no tracks at all
		percent = (count / total_tracks) * 100 if total_tracks else 0.0
		results.append({'Threshold': threshold, 'Count': count, 'Percent': percent})

	# Explicit columns keep the schema stable for an empty result
	results_df = pd.DataFrame(results, columns=['Threshold', 'Count', 'Percent'])
	return results_df

# Share of unlinked tracks with at least one step above each default threshold
bad_tracks_unlinked = count_bad_tracks(test_unlinked)
print(bad_tracks_unlinked)

## Sample tracks

In [None]:
def _track_ids_with_segment_end_in_area(df, area):
    """Returns the track_ids that have a raw-segment end point flagged by column `area`.

    For a track made of several raw segments the end of its final segment is
    ignored; for a track with a single raw segment that only end point is used.
    """
    # Last row of every (track_id, raw_track_id) segment, in the row order of df
    segment_ends = df.groupby(['track_id', 'raw_track_id']).tail(1)
    ends_per_track = segment_ends.groupby('track_id')['track_id'].transform('size')
    is_final_end = ~segment_ends.duplicated(subset='track_id', keep='last')
    keep = ~(is_final_end & (ends_per_track > 1))
    flagged = segment_ends[area].astype(bool)
    return segment_ends.loc[keep & flagged, 'track_id'].unique()


def plot_sample_tracks(df, duration_df, distance_df, link_df, min_time=None, min_distance=None, min_links=1, grid_size=(1, 1), area=None):
    """
    Plots randomly sampled tracks, one per grid cell, coloured by raw segment.

    Each raw segment is drawn as a line with an open circle at its start, a
    cross at its end and small dots in between. A label per segment shows its
    position in the track, its raw_track_id and, from the second segment on,
    the time gap and distance to the end of the previous segment.

    Args:
    - df (pd.DataFrame): The main dataframe containing track data ('track_id',
      'raw_track_id', 'position_x', 'position_y', 'timestamp').
    - duration_df (pd.DataFrame): 'track_id' and 'duration' of each track.
    - distance_df (pd.DataFrame): 'track_id' and 'total_distance' of each track.
    - link_df (pd.DataFrame): 'track_id' and 'links' (number of raw_track_id per track_id).
    - min_time (Optional[float]): Minimum 'duration' (in the units of duration_df). Defaults to None.
    - min_distance (Optional[float]): Minimum 'total_distance'. Defaults to None.
    - min_links (int): Minimum number of raw_track_id linked into a track_id. Defaults to 1.
    - grid_size (tuple): (rows, columns) of the plot grid. If fewer tracks pass
      the filters than there are cells, all of them are plotted and the
      remaining cells are removed. Defaults to (1, 1).
    - area (Optional[str]): Name of a boolean column of df; only tracks with a
      raw-segment end flagged by that column are sampled.
    """
    track_info = pd.merge(duration_df, distance_df, on='track_id')
    track_info = pd.merge(track_info, link_df, on='track_id')

    if area is not None:
        in_area_ids = _track_ids_with_segment_end_in_area(df, area)
        track_info = track_info[track_info['track_id'].isin(in_area_ids)]

    if min_time is not None:
        track_info = track_info[track_info['duration'] >= min_time]
    if min_distance is not None:
        track_info = track_info[track_info['total_distance'] >= min_distance]
    if min_links > 1:
        track_info = track_info[track_info['links'] >= min_links]

    if track_info.empty:
        print("No tracks meet the filtering criteria.")
        return

    # Never ask for more samples than there are candidate tracks
    n_rows, n_cols = grid_size
    num_plots = min(n_rows * n_cols, len(track_info))
    sampled_track_ids = track_info.sample(n=num_plots)['track_id'].tolist()

    # squeeze=False always yields a 2-D array, so flatten() also works for a 1x1 grid
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(10 * n_cols, 6 * n_rows), squeeze=False)
    axs = axs.flatten()

    # Remove grid cells that will stay empty
    for unused_ax in axs[num_plots:]:
        fig.delaxes(unused_ax)

    for ax, track_id in zip(axs, sampled_track_ids):
        track_df = df[df['track_id'] == track_id]

        if track_df.empty:
            print(f"No data for track_id {track_id}.")
            continue

        # Segments in order of first appearance; dropna=False keeps rows whose
        # raw_track_id is missing as one segment instead of an empty selection
        segments = list(track_df.groupby('raw_track_id', sort=False, dropna=False))
        colors = plt.cm.jet(np.linspace(0, 1, len(segments)))

        previous_end = None

        for i, (raw_track_id, segment_df) in enumerate(segments):
            color = colors[i]
            segment_start = segment_df.iloc[0]
            segment_end = segment_df.iloc[-1]

            ax.plot(segment_df['position_x'], segment_df['position_y'], '-', color=color, linewidth=1)

            # Open circle at the start, cross at the end, small dots in between
            ax.plot(segment_start['position_x'], segment_start['position_y'], 'o', color=color, markerfacecolor='none', markersize=10)

            if len(segment_df) > 1:
                ax.plot(segment_end['position_x'], segment_end['position_y'], 'x', color=color, markersize=10)

            if len(segment_df) > 2:
                ax.plot(segment_df.iloc[1:-1]['position_x'], segment_df.iloc[1:-1]['position_y'], 'o', color=color, markersize=1)

            # Time gap and distance to the end of the previous segment
            if previous_end is not None:
                time_diff = (segment_start['timestamp'] - previous_end['timestamp']).total_seconds()
                distance_diff = np.sqrt((segment_start['position_x'] - previous_end['position_x'])**2 +
                                        (segment_start['position_y'] - previous_end['position_y'])**2)
                label_text = f"{i + 1}: {raw_track_id} ({time_diff:.2f}s, {distance_diff:.2f}m)"
            else:
                label_text = f"{i + 1}: {raw_track_id}"

            ax.text(1.05, 1 - (i * 0.05), label_text, transform=ax.transAxes, verticalalignment='top', horizontalalignment='left', fontsize=10, color=color, bbox=dict(facecolor='white', alpha=0.5))

            previous_end = segment_end

        plot_with_background(ax, 1)
        ax.set_xlabel('Position X')
        ax.set_ylabel('Position Y')
    plt.tight_layout()
    plt.show()


In [None]:
# Sample 16 linked tracks (one per row) with duration >= 5, total distance >= 10 and at least 2 links
plot_sample_tracks(test_linked, test_linked_duration, test_linked_distance, test_links, min_time=5, min_distance=10, min_links=2, grid_size=(16, 1))

In [None]:
# Same filters, 8 samples, restricted to tracks selected through the 'in_tb_pat' column
plot_sample_tracks(test_linked, test_linked_duration, test_linked_distance, test_links, min_time=5, min_distance=10, min_links=2, grid_size=(8, 1), area='in_tb_pat')

## Sample lost tracks

In [None]:
import warnings
# Silence pandas' SettingWithCopyWarning from this point on. The filter is installed
# globally, so it also hides the warning in every cell executed after this one.
# NOTE(review): confirm a blanket suppression is intended rather than fixing the
# assignments that trigger the warning.
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

def filter_potential_matches(potential_matches: pd.DataFrame, last_point: pd.Series, time: int, distance: float) -> pd.DataFrame:
	"""
	Keep only the candidate rows that lie close to a reference point in time and space.

	Parameters:
	- potential_matches: Candidate rows; must provide 'time', 'position_x' and
	  'position_y'. The caller's frame is never modified.
	- last_point: Reference row providing the same three fields.
	- time: Largest allowed absolute difference in 'time' (same unit as the column).
	- distance: Largest allowed Euclidean distance (same unit as the positions).

	Returns:
	- A new DataFrame holding the surviving rows plus two extra columns:
	  'time_diff' (candidate minus reference, signed) and 'distance'.
	"""
	# assign() always returns a fresh frame, so no column is ever written onto a
	# boolean-filtered slice -- the pattern that provokes SettingWithCopyWarning.
	with_time_diff = potential_matches.assign(
		time_diff=potential_matches['time'] - last_point['time']
	)
	close_in_time = with_time_diff[with_time_diff['time_diff'].abs() <= time]

	# Distance is only computed for the rows that survived the time filter.
	offset_x = close_in_time['position_x'] - last_point['position_x']
	offset_y = close_in_time['position_y'] - last_point['position_y']
	with_distance = close_in_time.assign(distance=np.sqrt(offset_x**2 + offset_y**2))

	return with_distance[with_distance['distance'] <= distance]

def plot_tracks(ax, df, track_id, color):
	"""
	Draw one track on `ax` as a line and mark both of its endpoints.

	Parameters:
	- ax: Object exposing a matplotlib-style `plot` method.
	- df: DataFrame with 'track_id', 'position_x' and 'position_y' columns.
	- track_id: Value of 'track_id' selecting the rows to draw.
	- color: Color used for the line and both endpoint markers.

	Returns:
	- The same `ax` that was passed in.
	"""
	rows = df.loc[df['track_id'] == track_id]
	xs = rows['position_x']
	ys = rows['position_y']

	# Full path, labelled with the track id
	ax.plot(xs, ys, color=color, label=f'Track {track_id}')

	# Start of the track: hollow circle
	ax.plot(xs.iloc[0], ys.iloc[0], 'o', color=color, markersize=5, markerfacecolor='none')

	# End of the track: cross
	ax.plot(xs.iloc[-1], ys.iloc[-1], 'x', color=color, markersize=5)

	return ax

def show_potential_matches(df, lost, max_time, max_distance, area = None, random_state = None):
	"""
	Sample one "lost" track and plot it together with the tracks that might continue it.

	A track is lost at its start (end) when its first (last) row has a falsy
	'near_entry' value. One such track is drawn in black; every other track
	whose opposite endpoint is close enough in time and space is drawn in
	color and listed in a table below the plot.

	Parameters:
	- df: DataFrame with 'track_id', 'time', 'position_x', 'position_y' and
	  'near_entry' columns (plus the `area` column when given). The first/last
	  row of each track is taken as its start/end, so rows are assumed to be in
	  chronological order within a track -- TODO confirm with the caller.
	- lost: 'start' to sample a track that begins away from an entry, 'end' to
	  sample one that ends away from an entry.
	- max_time: Largest absolute time gap passed to filter_potential_matches.
	  The table divides 'time_diff' by 1000 and labels it seconds, so this is
	  presumably in milliseconds -- verify against the data.
	- max_distance: Largest spatial gap passed to filter_potential_matches.
	- area: Optional column name; when given, only tracks whose lost endpoint
	  has a truthy value in that column are eligible for sampling.
	- random_state: Optional seed forwarded to pandas' `sample`, so a rerun can
	  pick the same track again.

	Returns:
	- None. The figure is shown with plt.show().

	Raises:
	- ValueError: if `lost` is neither 'start' nor 'end'.
	"""
	if lost not in ('start', 'end'):
		raise ValueError(f"lost must be 'start' or 'end', got {lost!r}")

	# One row per track: the endpoint at which the track may have been lost.
	endpoints = df.drop_duplicates(subset='track_id', keep='first' if lost == 'start' else 'last')
	is_lost = ~endpoints['near_entry'].astype(bool)
	if area is not None:
		is_lost &= endpoints[area].astype(bool)
	lost_track_ids = endpoints.loc[is_lost, 'track_id']

	if lost_track_ids.empty:
		where = '' if area is None else f" in area '{area}'"
		print(f"No tracks lost at their {lost}{where}; nothing to plot.")
		return

	# Sample over unique track ids so long tracks are not favoured over short ones.
	sampled_track_id = lost_track_ids.sample(1, random_state=random_state).iloc[0]
	selected_track = df[df['track_id'] == sampled_track_id]

	# Plot the selected track_id
	fig, ax = plt.subplots(figsize=(8, 6))
	ax = plot_with_background(ax, 1)
	ax = plot_tracks(ax, df, sampled_track_id, 'black')

	# Find potential matches
	# Slack on the ordering check below; presumably in the unit of 'time' -- TODO confirm.
	eps = 3000
	if lost == 'end':
		# A continuation must START near where (and roughly after) this track ended.
		anchor = selected_track.iloc[-1]
		candidates = df.drop_duplicates(subset='track_id', keep='first')
		in_window = candidates['time'] > (anchor['time'] - eps)
	else:
		# A predecessor must END near where (and roughly before) this track started.
		anchor = selected_track.iloc[0]
		candidates = df.drop_duplicates(subset='track_id', keep='last')
		in_window = candidates['time'] < (anchor['time'] + eps)
	candidates = candidates[
		(candidates['near_entry'] == False)
		& in_window
		# The sampled track must never be offered as its own match.
		& (candidates['track_id'] != sampled_track_id)
	]
	potential_matches = filter_potential_matches(candidates, anchor, max_time, max_distance)

	if potential_matches.empty:
		plt.show()
		return

	potential_matches_tracks = df[df['track_id'].isin(potential_matches['track_id'])]
	potential_matches_duration = compute_track_duration(potential_matches_tracks)
	potential_matches_distance = compute_track_distance(potential_matches_tracks)
	potential_matches = pd.merge(potential_matches, potential_matches_duration, on='track_id')
	potential_matches = pd.merge(potential_matches, potential_matches_distance, on='track_id')

	# The merges are inner joins, so they can still drop every row.
	if potential_matches.empty:
		plt.show()
		return

	# Plot each potential match in its own color and collect its table row
	colors = plt.get_cmap('tab10', len(potential_matches))
	color_map = {}
	table_data = []
	for i, (track_id, match) in enumerate(potential_matches.groupby('track_id')):
		color_map[track_id] = colors(i)
		ax = plot_tracks(ax, df, track_id, color_map[track_id])

		time_diff = round(match['time_diff'].iloc[0] / 1000)
		distance = round(match['distance'].iloc[0], 1)
		# 'duration' is scaled by 60 for a column labelled seconds; presumably minutes -- TODO confirm.
		match_duration = round(match['duration'].iloc[0] * 60)
		match_distance = round(match['total_distance'].iloc[0], 1)
		table_data.append([track_id, time_diff, distance, match_duration, match_distance])

	# Adjust layout to make space for the table
	fig.subplots_adjust(bottom=0.3)

	# Plot the table below the plot
	table = ax.table(cellText=table_data, colLabels=['Track ID', 'Time Difference (s)', 'Distance (m)', 'Duration (s)', 'Total distance (m)'], loc='bottom', bbox=[0, -0.3, 1, 0.2])
	table.auto_set_font_size(False)
	table.set_fontsize(8)

	# Set row text colors to match plot colors (row 0 is the header)
	for (row, _col), cell in table.get_celld().items():
		if row > 0:
			cell.set_text_props(color=color_map[table_data[row - 1][0]])

	plt.show()

# Thresholds handed to show_potential_matches as max_distance / max_time in the cells below.
md = 1.5  # presumably metres, matching the "Distance (m)" table column -- TODO confirm
mt = 10 * 60 * 1000  # presumably 10 minutes expressed in milliseconds -- TODO confirm

In [None]:
# Five random lost-at-start tracks, then five random lost-at-end tracks.
# The loop variable is deliberately not `_`, which IPython uses for the last output.
for lost_side in ('start', 'end'):
	for sample_no in range(5):
		show_potential_matches(test_linked, lost=lost_side, max_time=mt, max_distance=md)

In [None]:
# Two random samples per (lost side, area) combination, in the order
# start/tb, start/vitals, end/tb, end/vitals.
# The loop variable is deliberately not `_`, which IPython uses for the last output.
for lost_side in ('start', 'end'):
	for area_column in ('in_tb_pat', 'in_vitals_pat'):
		for sample_no in range(2):
			show_potential_matches(test_linked, lost=lost_side, max_time=mt, max_distance=md, area=area_column)