In [None]:
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
import numpy as np

from utils.plots import visualize_vehicle_trajectories
from utils.loader import load_data_from_database
from utils.transformer import categorize_ids

In [None]:
# Ask where the raw detections should come from: the database (slow) or the local CSV export.
print("Do you want to load data from the database? This may take a while.")
use_database = input("Type 'y' to proceed: ").lower() == 'y'

df = (
    load_data_from_database()
    if use_database
    else pd.read_csv("raw_traffic_data.csv", parse_dates=['date_time'])
)

# Whatever the source, rows are ordered by timestamp first and frame number second.
df = df.sort_values(['date_time', 'frame_id'])

In [None]:
# A row whose frame_id is lower than the previous row's frame_id is flagged;
# the running total of those flags becomes the session number of every row.
frame_id_dropped = df['frame_id'].diff().lt(0)
df['session_id'] = frame_id_dropped.cumsum()

In [None]:
# Per-session number of non-null values in every column (quick view of how large each session is).
df.groupby('session_id').count()

In [None]:
# 1. Pick one session and order its rows per vehicle, chronologically
session_id_to_analyze = 2
in_selected_session = df['session_id'].eq(session_id_to_analyze)
session_df = df.loc[in_selected_session].sort_values(['vehicle_id', 'date_time'])

# Keep only the vehicles that have at least 20 records in this session
vehicle_counts = session_df['vehicle_id'].value_counts()
valid_vehicles = vehicle_counts.loc[vehicle_counts.ge(20)].index
session_df = session_df.loc[session_df['vehicle_id'].isin(valid_vehicles)]


In [None]:
# Display the selected session after the minimum-record filter.
session_df

In [None]:
import pandas as pd
import numpy as np

def classify_tracks(metrics):
    """
    Assign a quality category to every track from its geometric and temporal metrics.

    Parameters
    ----------
    metrics : pandas.DataFrame
        One row per track. Must contain the columns ``path_completeness``,
        ``frames_count``, ``w_cv`` and ``h_cv``.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in (it is modified in place), with two
        added columns: ``movement_efficiency`` (``path_completeness`` divided by
        ``frames_count``) and ``category``.

    Notes
    -----
    The first matching rule wins, in this order: ``Ghost``, ``Static``,
    ``EntryExit``, ``Perfect``, ``Flickering``, ``Partial``. Tracks matching
    none of them are labelled ``RelayCandidate``.

    ``EntryExit`` is tested before ``Perfect`` on purpose: every non-static track
    with ``path_completeness > 0.85`` and ``w_cv < 0.30`` also satisfies the
    ``Perfect`` rule, so with ``Perfect`` first the ``EntryExit`` label could
    never be assigned.
    """
    # Movement efficiency = path covered per frame.
    # Low values identify objects that were stationary at some point in the ROI.
    metrics['movement_efficiency'] = metrics['path_completeness'] / metrics['frames_count']

    # --- Classification conditions ---

    # 1. GHOST: technical noise (very short tracks)
    is_ghost = (metrics['frames_count'] < 10)

    # 2. STATIC: too little movement per frame, or a long track that covered
    #    only a small part of the path (parked, waiting, traffic jam)
    is_static = (metrics['movement_efficiency'] < 0.0015) | \
                ((metrics['frames_count'] > 200) & (metrics['path_completeness'] < 0.3))

    # 3. ENTRY/EXIT: full passage with stable width but strongly varying height
    is_entry_exit = (
        (metrics['path_completeness'] > 0.85) &
        (metrics['w_cv'] < 0.30) &
        (metrics['h_cv'] > 0.35)
    )

    # 4. PERFECT: full passage, stable width, normal movement per frame
    is_perfect = (
        (metrics['path_completeness'] > 0.85) &
        (metrics['w_cv'] < 0.30) &
        (metrics['movement_efficiency'] >= 0.0015)
    )

    # 5. FLICKERING: unstable object (strong width jumps)
    is_flickering = (metrics['w_cv'] > 0.45)

    # 6. PARTIAL: stable fragments (vehicles that appeared/disappeared mid-frame)
    is_partial = (
        (metrics['path_completeness'].between(0.3, 0.85)) &
        (metrics['w_cv'] < 0.30)
    )

    # np.select takes the first condition that is True, so the list order is the priority.
    conditions = [
        is_ghost,
        is_static,
        is_entry_exit,
        is_perfect,
        is_flickering,
        is_partial
    ]

    choices = [
        'Ghost',
        'Static',
        'EntryExit',
        'Perfect',
        'Flickering',
        'Partial'
    ]

    # All others become candidates for merging (RelayCandidate)
    metrics['category'] = np.select(conditions, choices, default='RelayCandidate')

    return metrics

def categorize_ids(df, roi_height=290):
    """
    Aggregate raw detections into one metrics row per vehicle track and classify the tracks.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw detections. Must contain the columns ``session_id``, ``vehicle_id``,
        ``date_time``, ``frame_id``, ``x``, ``y``, ``width`` and ``heigth``
        (spelled this way in the data). The frame is not modified.
    roi_height : float, default 290
        Height of the region of interest in pixels, used to normalise the
        travelled distance (the default corresponds to 460 - 170).

    Returns
    -------
    pandas.DataFrame
        One row per (``session_id``, ``vehicle_id``) with the aggregated columns,
        ``path_completeness``, ``w_cv``, ``h_cv`` and the columns added by
        ``classify_tracks``.

    Raises
    ------
    ValueError
        If ``roi_height`` is not strictly positive.
    """
    # NOTE(review): the first notebook cell also imports a `categorize_ids` from
    # utils.transformer; this definition replaces that name from here on.
    if not roi_height > 0:
        raise ValueError(f"roi_height must be positive, got {roi_height!r}")

    # Convert the timestamps on a derived frame: writing the column back into `df`
    # would alter the caller's data (and warn when `df` is a filtered slice).
    # Sorting makes 'first'/'last' below mean "earliest"/"latest" detection.
    ordered = (
        df.assign(date_time=pd.to_datetime(df['date_time']))
          .sort_values(['session_id', 'date_time', 'frame_id'])
    )

    # One group per track: a vehicle id is only meaningful within its session
    grouped = ordered.groupby(['session_id', 'vehicle_id'])

    # Data aggregation
    metrics = grouped.agg(
        y_start=('y', 'first'),
        y_end=('y', 'last'),
        w_mean=('width', 'mean'),
        w_std=('width', 'std'),
        h_mean=('heigth', 'mean'),
        h_std=('heigth', 'std'),
        frames_count=('frame_id', 'count'),
        t_start=('date_time', 'min'),
        t_end=('date_time', 'max'),
        x_mean=('x', 'mean'),
        x_std=('x', 'std')
    ).reset_index()

    # Share of the ROI height covered between the first and the last detection
    metrics['path_completeness'] = (metrics['y_end'] - metrics['y_start']).abs() / roi_height

    # Size stability as coefficient of variation (std / mean).
    # std is NaN for single-frame tracks, hence fillna(0).
    metrics['w_cv'] = (metrics['w_std'] / metrics['w_mean']).fillna(0)
    metrics['h_cv'] = (metrics['h_std'] / metrics['h_mean']).fillna(0)

    # Run classification
    return classify_tracks(metrics)

# Build the per-track summary (metrics + category) for the selected session.
final_summary = categorize_ids(session_df)

In [None]:
# Number of tracks that ended up in each category
final_summary.groupby('category').size()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy.stats import gaussian_kde

# 1. Parameters
dims = ['path_completeness', 'frames_count', 'movement_efficiency', 'w_cv', 'h_cv']
categories = final_summary['category'].unique()
n = len(dims)

# 2. Create subplot grid
fig = make_subplots(
    rows=n, cols=n,
    shared_xaxes=False, shared_yaxes=False,
    horizontal_spacing=0.03, vertical_spacing=0.03,
    column_titles=dims, row_titles=dims
)

# Color palette. It is indexed modulo its length, so more categories than
# colors reuse a color instead of raising IndexError.
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

# Loop-invariant lookups, computed once instead of once per subplot
rows_by_category = {cat: final_summary[final_summary['category'] == cat] for cat in categories}
bounds = {col: (final_summary[col].min(), final_summary[col].max()) for col in dims}
kde_grid = {col: np.linspace(bounds[col][0], bounds[col][1], 100) for col in dims}

# Categories that already own a legend entry. The first trace drawn for a
# category carries it, so a failed KDE in the first panel does not hide the category.
legend_done = set()

# 3. Fill the matrix
for i, y_col in enumerate(dims):
    for j, x_col in enumerate(dims):
        for k, cat in enumerate(categories):
            df_sub = rows_by_category[cat]
            color = colors[k % len(colors)]
            show_in_legend = cat not in legend_done

            if i == j:  # DIAGONAL: smooth KDE curves
                # A density estimate needs at least two points; only the curve is skipped,
                # the scatter panels still show single-track categories.
                if len(df_sub) < 2:
                    continue
                try:
                    y_kde = gaussian_kde(df_sub[x_col])(kde_grid[x_col])
                except (np.linalg.LinAlgError, ValueError):
                    # Degenerate data (e.g. zero variance): no curve for this category here
                    continue

                trace = go.Scatter(
                    x=kde_grid[x_col], y=y_kde,
                    name=cat, line=dict(color=color, width=2),
                    fill='tozeroy', opacity=0.3,  # Fill under the curve
                    showlegend=show_in_legend,
                    legendgroup=cat
                )
            else:  # OFF-DIAGONAL: scatter plots
                trace = go.Scatter(
                    x=df_sub[x_col], y=df_sub[y_col],
                    mode='markers', name=cat, marker_color=color,
                    opacity=0.5, marker_size=4,
                    showlegend=show_in_legend, legendgroup=cat,
                    hovertext=df_sub['vehicle_id'].apply(lambda x: f"ID: {x}")
                )

            fig.add_trace(trace, row=i+1, col=j+1)
            legend_done.add(cat)

# 4. Configure fixed axes: every panel of a metric shares the same range (+5 % margin)
for i, col in enumerate(dims):
    low, high = bounds[col]
    margin = (high - low) * 0.05
    r = [low - margin, high + margin]

    for panel in range(1, n + 1):
        fig.update_xaxes(range=r, row=panel, col=i+1)
        if i != panel-1:  # Don't touch Y axis for diagonal, as it has density scale
            fig.update_yaxes(range=r, row=i+1, col=panel)

fig.update_layout(
    title_text="Interactive matrix with distribution curves (KDE) on diagonal",
    width=1200, height=1100,
    template="plotly_white"
)

fig.show()

In [None]:
# Tracks that matched none of the classification rules
is_relay_candidate = final_summary['category'].eq('RelayCandidate')
filtered_final_summary = final_summary.loc[is_relay_candidate]
filtered_final_summary

In [None]:
# Raw detections of the RelayCandidate tracks
ids_to_plot = filtered_final_summary['vehicle_id']
plot_data = session_df[session_df['vehicle_id'].isin(ids_to_plot)]

# session_df only holds rows of `session_id_to_analyze`, so that id is passed on
# (a hard-coded 0 refers to a session that is not present in plot_data).
visualize_vehicle_trajectories(
    plot_data,
    session_id=session_id_to_analyze,
    max_vehicles=25,
    min_records=20,
    category='RelayCandidate',
)

In [None]:
import pandas as pd
import numpy as np

def find_merging_pairs(summary_df, time_gap_limit=1.5, space_gap_limit=40, size_sim_limit=0.2,
                       x_gap_limit=20):
    """
    Find pairs of tracks where one track looks like the continuation of another.

    Only tracks whose category is 'RelayCandidate' or 'Static' are considered.
    Track B continues track A when both belong to the same session, B starts
    between 0 and ``time_gap_limit`` seconds after A ended, B starts close to
    where A ended, and both have a similar mean width.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        Result of the metrics calculation (final_summary). Uses the columns
        ``category``, ``session_id``, ``vehicle_id``, ``t_start``, ``t_end``,
        ``y_start``, ``y_end``, ``x_mean`` and ``w_mean``.
    time_gap_limit : float
        Max time gap between the end of A and the start of B (seconds).
    space_gap_limit : float
        Max distance along Y between the end of A and the start of B (pixels).
    size_sim_limit : float
        Max width difference relative to the width of A (0.2 = 20%).
    x_gap_limit : float
        Max difference between the mean X positions of A and B (pixels).

    Returns
    -------
    pandas.DataFrame
        One row per pair with the columns ``old_id``, ``new_id``, ``gap_sec``,
        ``y_dist`` and ``size_diff_pct``. The columns are present even when no
        pair was found.
    """
    result_columns = ['old_id', 'new_id', 'gap_sec', 'y_dist', 'size_diff_pct']

    candidates = summary_df[
        (summary_df['category'] == 'RelayCandidate') | (summary_df['category'] == 'Static')
    ].copy()

    # Sort by appearance time
    candidates = candidates.sort_values('t_start')

    merges = []
    used_ids = set()

    # Plain dictionaries iterate much faster than DataFrame rows
    records = candidates.to_dict('records')

    for i in range(len(records)):
        id_a = records[i]
        if id_a['vehicle_id'] in used_ids:
            continue

        for j in range(i + 1, len(records)):
            id_b = records[j]
            if id_b['vehicle_id'] in used_ids:
                continue

            # 1. Session check (must be in the same session)
            if id_a['session_id'] != id_b['session_id']:
                continue

            # 2. Time gap (A ended, B started soon after)
            gap_time = (id_b['t_start'] - id_a['t_end']).total_seconds()

            # Records are ordered by t_start, so the gap can only grow from here on:
            # once it exceeds the limit no later record can match A.
            if gap_time > time_gap_limit:
                break

            # B has to start AFTER A ended (a missing timestamp gives NaN and is skipped too)
            if not gap_time >= 0:
                continue

            # 3. Spatial proximity (end of A to start of B)
            # Y is the main axis of movement; X is compared through the track means
            dist_y = abs(id_b['y_start'] - id_a['y_end'])
            dist_x = abs(id_b['x_mean'] - id_a['x_mean'])

            # 4. Size similarity (width shouldn't jump), relative to the width of A.
            # A non-positive or missing reference width cannot be compared.
            ref_width = id_a['w_mean']
            if not ref_width > 0:
                continue
            size_diff = abs(ref_width - id_b['w_mean']) / ref_width

            if dist_y < space_gap_limit and dist_x < x_gap_limit and size_diff < size_sim_limit:
                merges.append({
                    'old_id': int(id_a['vehicle_id']),
                    'new_id': int(id_b['vehicle_id']),
                    'gap_sec': round(gap_time, 2),
                    'y_dist': round(dist_y, 1),
                    'size_diff_pct': round(size_diff * 100, 1)
                })
                # Only B is marked as used: it can neither be matched again nor
                # start a pair of its own, so chains of 3 IDs are not followed.
                used_ids.add(id_b['vehicle_id'])
                break

    return pd.DataFrame(merges, columns=result_columns)
# NOTE(review): the lines below are disabled code kept inside a string literal, so none of them runs.
'''
# Function call
merge_results = find_merging_pairs(final_summary)
print(f"Found pairs for merging: {len(merge_results)}")
print(merge_results.head())

final_summary.to_csv("final_summary.csv", index=False)
'''

In [None]:
# df['year'] = df['date_time'].dt.year
# df['month'] = df['date_time'].dt.month
# df['day'] = df['date_time'].dt.day
# df['hour'] = df['date_time'].dt.hour
# df['unique_vehicle_id'] = round(df['vehicle_id'] + df['year']/10000 + df['month']/1000000 + df['day']/100000000 + df['hour']/10000000000, 10)
# df['unique_frame_id'] = round(df['vehicle_id'] + df['year']/10000 + df['month']/1000000 + df['day']/100000000 + df['hour']/10000000000, 10)
# df.head()

In [None]:
# df['size'] = df['width'] * df['heigth']
# df.head()

In [None]:
# min_max_df = df.groupby('unique_vehicle_id')[['size', 'y']].agg(['max', 'min']).sort_index()

In [None]:
# min_max_df['way_size'] = min_max_df['y']['max'] - min_max_df['y']['min']
# min_max_df.columns = ['size_max', 'size_min', 'y_max', 'y_min', 'way_size']
# min_max_df.head()

In [None]:
# min_max_df['full_way'] = min_max_df['way_size'] > 240

In [None]:
# min_max_df['day'] = min_max_df.index.map(lambda x: True if 6 <= int((x* 10000000000)%100) < 18 else False)

In [None]:
# min_max_df

In [None]:
## Duration, Speed and Direction

from utils.new_columns_fiorenzo import add_speed_direction_to_summary

# Track lifetime in seconds: time between t_start and t_end of each track
track_lifetime = final_summary['t_end'] - final_summary['t_start']
final_summary['duration'] = track_lifetime.dt.total_seconds()

# The project helper receives the summary with `duration` already set; its result replaces the summary
final_summary = add_speed_direction_to_summary(final_summary)

print(final_summary.head())



In [None]:
## Create columns size_mean (mean width x mean height) and h_w_mean_ratio (mean height / mean width)
mean_width = final_summary['w_mean']
mean_height = final_summary['h_mean']

final_summary['size_mean'] = mean_width.mul(mean_height)
final_summary['h_w_mean_ratio'] = mean_height.div(mean_width)
print(final_summary.head())

In [None]:
## Mean height, width and size in meters
from utils.new_columns_fiorenzo import pixel_meters_ratio

# Lengths: pixels divided by the pixel-per-meter ratio give meters
final_summary['h_mean_meters'] = final_summary['h_mean'] / pixel_meters_ratio
final_summary['w_mean_meters'] = final_summary['w_mean'] / pixel_meters_ratio
# size_mean is an area (w_mean * h_mean, i.e. pixels squared), so the linear
# ratio has to be applied twice to obtain square meters.
final_summary['size_mean_meters'] = final_summary['size_mean'] / pixel_meters_ratio ** 2

print(final_summary)

In [None]:
## Day/Night difference
# The frame returned by the project helper replaces final_summary.
# NOTE(review): presumably the helper adds a day/night flag per track — confirm in utils.new_columns_fiorenzo.
from utils.new_columns_fiorenzo import add_day_night_to_summary
final_summary = add_day_night_to_summary(final_summary)
print(final_summary)

In [None]:
## Average, max and min of the mean width and mean height of the usable tracks
# Only tracks classified as 'Perfect' or 'Partial' enter the statistics.
viable_mask = final_summary['category'].isin(['Perfect', 'Partial'])
final_summary_viable = final_summary[viable_mask]

avg_w, max_w, min_w = final_summary_viable['w_mean'].agg(['mean', 'max', 'min'])
avg_h, max_h, min_h = final_summary_viable['h_mean'].agg(['mean', 'max', 'min'])

print(f"Width - Avg: {avg_w:.2f}, Max: {max_w:.2f}, Min: {min_w:.2f}")
print(f"Height - Avg: {avg_h:.2f}, Max: {max_h:.2f}, Min: {min_h:.2f}")

In [None]:
## Plot for classification
# Input is the 'Perfect'/'Partial' subset only.
# average_speed_over_time_plot is imported here but first called in a later cell.

from utils.plots_fiorenzo import interactive_dimension_plot_by_cat, average_speed_over_time_plot

interactive_dimension_plot_by_cat(final_summary_viable)

In [None]:
## Explore vehicle size distributions
# Uses the 'Perfect'/'Partial' subset; the plot itself is produced by the project helper.
from utils.plots_fiorenzo import plot_size_distribution

plot_size_distribution(final_summary_viable)


In [None]:
## Apply vehicle classification
from utils.new_columns_fiorenzo import classify_vehicle_types
# Run the project classifier on the usable tracks with its default thresholds
final_summary_with_class = classify_vehicle_types(final_summary_viable, verbose=True)

'''
# Optional: Use custom thresholds if you want to fine-tune the classification
# Uncomment and adjust values as needed:
# final_summary_with_class = classify_vehicle_types(
#     final_summary_viable,
#     size_threshold=7500,      # Threshold for area (w_mean * h_mean)
#     width_threshold=70,        # Threshold for width
#     height_threshold=140,      # Threshold for height
#     verbose=True
# )
'''
# Text report: a sample of classified tracks, then class counts per track category
separator = "=" * 70
sample_columns = ['vehicle_id', 'w_mean', 'h_mean', 'size_mean', 'aspect_ratio', 'Class']

print("\n" + separator)
print("SAMPLE CLASSIFIED VEHICLES")
print(separator)
print("\nFirst 10 vehicles:")
print(final_summary_with_class[sample_columns].head(10).to_string(index=False))

print("\n" + separator)
print("CLASS DISTRIBUTION BY TRACK CATEGORY")
print(separator)
crosstab = pd.crosstab(
    index=final_summary_with_class['category'],
    columns=final_summary_with_class['Class'],
    margins=True,
    margins_name='Total',
)
print(crosstab)
print("\n" + separator)



In [None]:
## Visualize vehicle classification
# Input is the classified 'Perfect'/'Partial' subset produced by classify_vehicle_types.

from utils.plots_fiorenzo import visualize_classification

visualize_classification(final_summary_with_class)


In [None]:
## Interactive plot after classification
# Same classified frame as above; this import is also what the day/night cell at the end relies on.
from utils.plots_fiorenzo import interactive_dimension_plot_by_class

interactive_dimension_plot_by_class(final_summary_with_class)

In [None]:
## Speed distribution
# NOTE(review): km_h=True presumably switches the speed unit to km/h — confirm in utils.plots_fiorenzo.
from utils.plots_fiorenzo import speed_distribution_over_time_plot

speed_distribution_over_time_plot(final_summary_with_class, km_h=True)

In [None]:
## Distribution plot
# Histogram helper applied to the classified tracks.
from utils.plots_fiorenzo import vehicle_count_over_time_histogram
vehicle_count_over_time_histogram(final_summary_with_class)

In [None]:
# Same plot twice: once for all classified tracks together, once with by_class=True.
# average_speed_over_time_plot comes from the import in the "Plot for classification" cell.
print("Total")
average_speed_over_time_plot(final_summary_with_class, by_class=False)
print("By Class")
average_speed_over_time_plot(final_summary_with_class, by_class=True)

In [None]:
# Dimension plot by class, first with day_only=True and then with night_only=True.
# NOTE(review): the helper presumably relies on the day/night column added earlier to final_summary — confirm.
print("Day")
interactive_dimension_plot_by_class(final_summary_with_class, day_only=True)
print("Night")
interactive_dimension_plot_by_class(final_summary_with_class, night_only=True)
