In [None]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
from scipy.stats import mode

import warnings

# Silence every warning category for the rest of the session so cell output stays readable.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
# Root directory that is scanned for per-train export folders.
main_folder_path = '../dataset/'

# Keep only directories whose name carries both the date stamp and the table tag.
subfolders = [
    entry.path
    for entry in os.scandir(main_folder_path)
    if entry.is_dir() and all(tag in entry.name for tag in ("20231207", "tbl_AR41"))
]
subfolders

In [None]:
# Train numbers, in ascending order.
# NOTE(review): assumes the folder names embed the train number (e.g. "..._train136_...")
# so that the sorted folder list below lines up with this list - confirm.
trains_new_dataset = [107, 131, 136, 172, 181, 194]

date_string = '20231207'

# os.scandir yields entries in arbitrary order; sort so that the positional
# pairing between folders and trains_new_dataset is deterministic.
subfolders = sorted(
    f.path
    for f in os.scandir(main_folder_path)
    if f.is_dir() and date_string in f.name and "tbl_AR41" in f.name
)

# 2 rows x 3 columns of histograms, one panel per train
n_rows, n_cols = 2, 3
fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 6))

colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']

# Optional overlay of the most frequent interval value(s); disabled by default.
calc_mode = False

for jj, subfolder in enumerate(subfolders):
    files = [f.path for f in os.scandir(subfolder) if f.is_file()]
    file_path = files[0]
    raw_data = pd.read_csv(file_path, sep=';')

    # Gap between consecutive samples, in seconds (NaN on the first row).
    interval_seconds = pd.to_datetime(raw_data['timestamps_UTC']).diff().dt.total_seconds()

    # Keep only gaps shorter than 150 s; NaN fails the comparison and is dropped too.
    interval_seconds = interval_seconds[interval_seconds < 150]

    # Panels are filled column by column: (0,0), (1,0), (0,1), ...
    ax = axs[jj % n_rows, jj // n_rows]

    ax.hist(interval_seconds, bins=60, color=colors[jj])

    # Same fixed axis ranges on every panel so the trains can be compared visually.
    ax.set_xlim(0, 100)  # Adjust the range as needed
    ax.set_ylim(0, 120000)  # Adjust the range as needed

    ax.set(xlabel='Time Interval (seconds)', ylabel='Frequency')

    if calc_mode:
        # Series.mode() returns every most-frequent value, so ties are all drawn.
        modes = interval_seconds.mode()

        for mode_val in modes:
            ax.axvline(mode_val, color='red', linestyle='dashed', linewidth=2)

        # Annotate if there is more than one mode
        if len(modes) > 1:
            ax.text(0.5, 0.95, 'Multiple Modes', color='red', transform=ax.transAxes, ha='center', va='center', alpha=0.2)

    # The title identifies the train, so no legend is drawn.
    ax.set_title(f'Train {trains_new_dataset[jj]}')
    ax.grid(alpha=0.2)

# Adjust layout
plt.tight_layout()

# Make sure the output directory exists before saving the figure.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/distribution_time_intervals.png')

plt.show()


# Règles pour découper les données de chaque train en segments continus
Le but de cette section est de découper, pour chaque train, les données en segments en fonction de l'intervalle de temps entre deux mesures consécutives.

Plus tard, il serait intéressant de découper également en fonction de la distance parcourue.

In [None]:
# Load the export of train 136 (semicolon-separated CSV).
df = pd.read_csv('../dataset/tbl_AR41_train136_20231207/tbl_AR41 - Train 136 - 2023-12-07.csv',sep=';')

df['timestamp'] = pd.to_datetime(df['timestamps_UTC'])

# Gap between each sample and the previous one (NaT on the first row).
df['time_diff'] = df['timestamp'].diff()

# A gap longer than this starts a new segment.
time_threshold = pd.Timedelta('0 days 00:15:00')  # We consider >15min is a different segment

# Flag the rows that follow a gap longer than the threshold.
df['movement'] = np.where(df['time_diff'] > time_threshold, 'New Segment', '')

# Running count of flagged rows gives every row the id of its segment.
df['segment_id'] = (df['movement'] == 'New Segment').cumsum()

# Keep only the segments that contain at least this many rows.
min_segment_points = 1000
large_segments = df.groupby('segment_id').filter(lambda x: len(x) >= min_segment_points)

# Average, over calendar days, of the number of distinct large segments seen that day.
average_segments_per_day_train1 = large_segments.groupby(large_segments['timestamp'].dt.date).agg({'segment_id': 'nunique'}).mean()

# Drop temporary columns
df = df.drop(columns=['time_diff', 'movement'])

# Preview the segmented dataframe
df.head()

In [None]:
# Train numbers, in ascending order.
# NOTE(review): assumes the folder names embed the train number (e.g. "..._train136_...")
# so that the sorted folder list below lines up with this list - confirm.
trains_new_dataset = [107, 131, 136, 172, 181, 194]

date_string = '20231207'

# os.scandir yields entries in arbitrary order; sort so that list_of_dataframes[i]
# deterministically corresponds to trains_new_dataset[i].
subfolders = sorted(
    f.path
    for f in os.scandir(main_folder_path)
    if f.is_dir() and date_string in f.name and "tbl_AR41" in f.name
)

list_of_dataframes = []

for jj, subfolder in enumerate(subfolders):
    files = [f.path for f in os.scandir(subfolder) if f.is_file()]
    file_path = files[0]
    raw_data = pd.read_csv(file_path, sep=';')
    # The following cells read a parsed 'timestamp' column, so create it at load time.
    raw_data['timestamp'] = pd.to_datetime(raw_data['timestamps_UTC'])
    list_of_dataframes.append(raw_data)

In [None]:


# A gap longer than this between two consecutive samples starts a new segment.
time_threshold = pd.Timedelta('0 days 00:15:00')  # Adjust according to your data

# Only segments with at least this many rows are plotted.
min_segment_points = 1000

fig, ax = plt.subplots(figsize=(12, 8))

for i, df_train in enumerate(list_of_dataframes):
    # Reuse a parsed 'timestamp' column when present; otherwise build it from the raw strings.
    if 'timestamp' not in df_train.columns:
        df_train['timestamp'] = pd.to_datetime(df_train['timestamps_UTC'])

    # Gap between each sample and the previous one (NaT on the first row).
    df_train['time_diff'] = df_train['timestamp'].diff()

    # Flag the rows that follow a gap longer than the threshold.
    df_train['movement'] = np.where(df_train['time_diff'] > time_threshold, 'New Segment', '')

    # Running count of flagged rows gives every row the id of its segment.
    df_train['segment_id'] = (df_train['movement'] == 'New Segment').cumsum()

    large_segments = df_train.groupby('segment_id').filter(lambda x: len(x) >= min_segment_points)

    # One horizontal band per train: y is the 1-based position of the train in the list.
    ax.scatter(large_segments['timestamp'], [i + 1] * len(large_segments), marker='o', label=f'Train {trains_new_dataset[i]}', s=4)

# Show the train numbers on the y axis instead of the band positions 1..N.
n_trains = len(list_of_dataframes)
ax.set_yticks(range(1, n_trains + 1))
ax.set_yticklabels(trains_new_dataset[:n_trains])

ax.legend()
ax.set_xlabel('Timestamp')
ax.set_ylabel('Train ID')
ax.set_title('Train Segments Over Time')
plt.show()


In [None]:

colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']

# One row of bars per train, stacked vertically and sharing the y axis.
# squeeze=False keeps `axes` indexable even when there is a single train.
n_trains = len(list_of_dataframes)
fig, axes = plt.subplots(nrows=n_trains, ncols=1, figsize=(10, 20), sharey=True, squeeze=False)
axes = axes[:, 0]

for i, df_train in enumerate(list_of_dataframes):
    # Reuse a parsed 'timestamp' column when present; otherwise build it from the raw strings.
    if 'timestamp' not in df_train.columns:
        df_train['timestamp'] = pd.to_datetime(df_train['timestamps_UTC'])

    # Calendar day of every sample
    df_train['date'] = df_train['timestamp'].dt.date

    # Count the number of samples per day for each train
    samples_per_day = df_train.groupby('date').size().reset_index(name='sample_count')

    # Cycle through the palette so more than len(colors) trains do not raise.
    axes[i].bar(samples_per_day['date'], samples_per_day['sample_count'], label=f'Train {trains_new_dataset[i]}', color=colors[i % len(colors)])
    axes[i].set_title(f'Train {trains_new_dataset[i]}')
    axes[i].set_xlabel('Date')
    axes[i].grid(alpha=0.3)

# The y axis is shared, so it is labelled once on the top panel.
axes[0].set_ylabel('Sample Count')

# Adjust layout and show the plot
plt.tight_layout()

# Make sure the output directory exists before saving the figure.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/comparison_samples_per_day.png')

plt.show()
