### Install necessary packages

In [1]:
# %pip install gpxpy geopy folium matplotlib pandas numpy plotly fit2gpx

### Import libraries and define helper functions

In [2]:
import os
import gpxpy
import pandas as pd
import numpy as np
import folium
from geopy.distance import geodesic
from datetime import datetime, timedelta
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import plotly.express as px
import zipfile
import gzip
import plotly.graph_objs as go
from fit2gpx import Converter

### Functions for File Processing and Conversion

In [3]:
# Function to convert FIT files to GPX
def convert_fit_to_gpx(file_path, output_path):
    """Convert the FIT file at ``file_path`` into a GPX file at ``output_path``.

    Returns ``output_path`` when the conversion call completes. If the
    conversion raises, the error is printed and ``None`` is returned.
    """
    try:
        Converter().fit_to_gpx(file_path, output_path)
    except Exception as e:
        print(f"Error converting {file_path} to GPX: {e}")
        return None
    return output_path
    
# Function to check for duplicates in tracks (used at the end of read_gpx_files to account for duplicate processing after converting .fit files)
def remove_duplicate_tracks(tracks):
    """Return the tracks with repeated fingerprints removed.

    A track's fingerprint is (first timestamp, last timestamp, summed
    point-to-point distance, number of points). The first track carrying a
    given fingerprint is kept; later ones are dropped. Tracks with fewer than
    two points are discarded altogether.
    """
    fingerprints = set()
    kept = []

    for candidate in tracks:
        point_count = len(candidate)
        if point_count < 2:
            continue

        first_time = candidate[0][3]
        last_time = candidate[-1][3]
        path_length = sum(
            calculate_distance(here, there)
            for here, there in zip(candidate, candidate[1:])
        )
        fingerprint = (first_time, last_time, path_length, point_count)

        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        kept.append(candidate)

    return kept

# Function to read GPX files and extract the tracks
def _parse_gpx_segments(gpx_stream):
    """Parse an open GPX text stream into one point list per track segment.

    Every point is a ``(latitude, longitude, elevation, time)`` tuple.
    """
    gpx = gpxpy.parse(gpx_stream)
    return [
        [(point.latitude, point.longitude, point.elevation, point.time) for point in segment.points]
        for track in gpx.tracks
        for segment in track.segments
    ]


def _swap_suffix(path, old_suffix, new_suffix):
    """Replace the trailing ``old_suffix`` of ``path`` with ``new_suffix``.

    Only the end of the path is touched, so a directory name that happens to
    contain the suffix (e.g. ``my.fit.files/run.fit``) is left alone.
    """
    return path[:-len(old_suffix)] + new_suffix


def _load_activity_file(path, processed_files):
    """Load the track segments stored in a single activity file.

    Supported suffixes are ``.gpx``, ``.gpx.gz``, ``.fit`` and ``.fit.gz``.
    FIT input is converted with ``convert_fit_to_gpx``; the decompressed
    ``.fit`` and the converted ``.gpx`` are written next to the source file and
    recorded in ``processed_files`` so they are not read a second time.

    Returns a list of segments (empty when the FIT conversion failed), or
    ``None`` when the suffix is not supported. Parsing and I/O errors propagate
    to the caller.
    """
    # 'utf-8-sig' reads UTF-8 with or without a BOM instead of relying on the
    # platform's default text encoding.
    if path.endswith('.gpx'):
        with open(path, 'r', encoding='utf-8-sig') as gpx_file:
            return _parse_gpx_segments(gpx_file)

    if path.endswith('.gpx.gz'):
        with gzip.open(path, 'rt', encoding='utf-8-sig') as gpx_file:
            return _parse_gpx_segments(gpx_file)

    if path.endswith('.fit'):
        fit_path = path
    elif path.endswith('.fit.gz'):
        fit_path = _swap_suffix(path, '.fit.gz', '.fit')
        with gzip.open(path, 'rb') as packed, open(fit_path, 'wb') as unpacked:
            unpacked.write(packed.read())
        processed_files.add(fit_path)
    else:
        return None

    gpx_path = convert_fit_to_gpx(fit_path, _swap_suffix(fit_path, '.fit', '.gpx'))
    if not gpx_path:
        # convert_fit_to_gpx already printed the reason.
        return []
    processed_files.add(gpx_path)
    with open(gpx_path, 'r', encoding='utf-8-sig') as gpx_file:
        return _parse_gpx_segments(gpx_file)


def _iter_activity_paths(folder_path, entries):
    """Yield each top-level entry path, followed by the direct children of any sub-folder."""
    for item in entries:
        item_path = os.path.join(folder_path, item)
        yield item_path
        if os.path.isdir(item_path):
            for inner_item in sorted(os.listdir(item_path)):
                yield os.path.join(item_path, inner_item)


# Function to read GPX files and extract the tracks
def read_gpx_files(folder_path):
    """Read every supported activity file in ``folder_path`` and its direct sub-folders.

    Parameters:
        folder_path: directory containing ``.gpx``, ``.gpx.gz``, ``.fit`` and/or
            ``.fit.gz`` files (sub-folders are scanned one level deep).

    Returns:
        A list of tracks with duplicates removed by ``remove_duplicate_tracks``.
        Each track is a list of ``(latitude, longitude, elevation, time)`` tuples.

    A file that fails to load is reported with a printed message and skipped.
    Progress and summary counts are printed as well.
    """
    tracks = []
    processed_files = set()  # Paths already read, including files generated by conversion
    # Sorted for a deterministic order. A converted 'x.gpx' sorts after the
    # 'x.fit' / 'x.fit.gz' it came from, so it is already in processed_files
    # by the time the listing reaches it.
    entries = sorted(os.listdir(folder_path))
    total_items = len(entries)
    processed_items = 0

    for path in _iter_activity_paths(folder_path, entries):
        print(f"Processing {path}")  # Debugging print to verify the path

        if path in processed_files:
            print(f"Skipping duplicate file {path}")
            continue
        if not os.path.isfile(path):
            continue

        try:
            segments = _load_activity_file(path, processed_files)
        except Exception as e:
            print(f"Error processing file {path}: {e}")
            continue

        if segments is None:
            # Unsupported file type: ignored without counting it.
            continue

        tracks.extend(segments)
        processed_files.add(path)
        processed_items += 1

    print(f"Total items in folder: {total_items}")
    print(f"Processed items: {processed_items}")
    print(f"Tracks found: {len(tracks)}")

    tracks = remove_duplicate_tracks(tracks)
    print(f"Unique Tracks: {len(tracks)}")

    return tracks

# Function to count the frequency of points in tracks
def count_point_frequencies(tracks):
    """Count how many times each (latitude, longitude) pair occurs across all tracks.

    Returns a dict mapping the coordinate pair to its number of occurrences,
    keyed in order of first appearance.
    """
    frequencies = {}
    coordinate_pairs = ((point[0], point[1]) for track in tracks for point in track)
    for coords in coordinate_pairs:
        frequencies[coords] = frequencies.get(coords, 0) + 1
    return frequencies

# Function to calculate the distance between two points
def calculate_distance(point1, point2):
    """Return the geodesic distance in meters between two track points.

    Only the first two items of each point (latitude, longitude) are used.
    """
    start = (point1[0], point1[1])
    end = (point2[0], point2[1])
    return geodesic(start, end).meters

### Define functions to create maps

In [4]:
# Function to create maps from tracks and point frequencies
def create_maps(tracks, point_counter, local_center=(55.6761, 12.5683), frequency_threshold=10,
                global_map_file='global_map.html', local_map_file='local_map.html'):
    """Build, save and display a global track map and a local hot-spot map.

    Parameters:
        tracks: list of tracks; each track is a list of points whose first two
            items are latitude and longitude.
        point_counter: dict mapping ``(latitude, longitude)`` to an occurrence
            count, as produced by ``count_point_frequencies``.
        local_center: ``(latitude, longitude)`` the local map is centred on.
            Defaults to the Copenhagen coordinates used originally.
        frequency_threshold: a point is marked on the local map only when its
            count is strictly greater than this value.
        global_map_file / local_map_file: paths the two maps are saved to.

    Side effects: writes both HTML files and displays them in the notebook.
    """
    # Create a global map
    m_global = folium.Map(location=[0, 0], zoom_start=2)

    for track in tracks:
        points = [(point[0], point[1]) for point in track if point[0] is not None and point[1] is not None]
        if points:  # Check if points list is not empty
            folium.PolyLine(points, color='red', weight=2.5, opacity=1).add_to(m_global)

    # Create a local map around the requested centre
    m_local = folium.Map(location=list(local_center), zoom_start=12)

    for point, count in point_counter.items():
        # Points without coordinates are skipped here just as they are for the
        # polylines above; a marker cannot be placed at a None location.
        if count > frequency_threshold and None not in point:
            folium.CircleMarker(location=point, radius=5, color='blue', fill=True, fill_color='blue').add_to(m_local)

    # Save the maps
    m_global.save(global_map_file)
    m_local.save(local_map_file)

    # Display the maps (loaded from the files that were just written)
    display(HTML(filename=global_map_file))
    display(HTML(filename=local_map_file))

### Define functions to save run list and statistics HTML

In [5]:
# Optimized function to save runs list HTML with sortable columns
def save_runs_list_html(tracks, output_file='runs_list.html'):
    """Write an HTML table of all runs with click-to-sort column headers.

    Parameters:
        tracks: list of tracks; each track is a list of
            ``(latitude, longitude, elevation, time)`` tuples. The list is
            sorted IN PLACE by start time.
        output_file: path of the HTML file to write (UTF-8).

    Tracks with fewer than two points are left out of the table. Run numbers
    are the 1-based positions in the sorted list, and rows are written most
    recent first.
    """
    # Sort tracks by start time to assign run numbers in ascending order.
    # The sort is deliberately in place: save_distance_vs_pace_html numbers
    # runs by enumerating the same list, so both outputs agree on run numbers.
    tracks.sort(key=lambda x: x[0][3])

    runs_info = []
    for idx, track in enumerate(tracks):
        if len(track) < 2:
            continue
        distances = np.array([calculate_distance(track[i], track[i + 1]) for i in range(len(track) - 1)])
        total_distance_km = distances.sum() / 1000
        times = np.array([(track[i + 1][3] - track[i][3]).total_seconds() for i in range(len(track) - 1)])
        total_time_seconds = times.sum()
        # A zero-length track has no meaningful pace; it is reported as 0:00.
        avg_pace_seconds_per_km = total_time_seconds / total_distance_km if total_distance_km > 0 else 0
        avg_pace_minutes = int(avg_pace_seconds_per_km // 60)
        avg_pace_seconds = int(avg_pace_seconds_per_km % 60)
        start_time = track[0][3]
        end_time = track[-1][3]
        run_date = start_time.strftime("%Y-%m-%d")
        run_time = f"{start_time.strftime('%H:%M:%S')} - {end_time.strftime('%H:%M:%S')} (Time: {str(timedelta(seconds=int(total_time_seconds)))})"

        runs_info.append({
            'Run Number': idx + 1,
            'Date': run_date,
            'Time': run_time,
            'Distance (km)': f"{total_distance_km:.3f}",
            'Average Pace': f"{avg_pace_minutes}:{avg_pace_seconds:02d} min/km",
        })

    # Reverse the list to have the most recent run first
    runs_info.reverse()

    # The sort direction is stored per header (data-asc) and the arrow classes
    # of the other headers are cleared on every click, so each column toggles
    # independently and only the active column shows an arrow.
    html_head = """
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            body { font-family: Arial, sans-serif; color: #333; }
            table { width: 100%; border-collapse: collapse; margin: 20px 0; }
            th, td { padding: 12px; border: 1px solid #ddd; text-align: left; }
            th { background-color: #f4f4f4; cursor: pointer; }
            th.sort-asc::after { content: " \\2191"; }
            th.sort-desc::after { content: " \\2193"; }
            tr:nth-child(even) { background-color: #f9f9f9; }
        </style>
        <script>
            document.addEventListener('DOMContentLoaded', () => {
                const getCellValue = (tr, idx) => tr.children[idx].innerText || tr.children[idx].textContent;
                const comparer = (idx, asc, type) => (a, b) => {
                    let v1 = getCellValue(asc ? a : b, idx);
                    let v2 = getCellValue(asc ? b : a, idx);
                    if (type === 'date') {
                        v1 = new Date(v1);
                        v2 = new Date(v2);
                    } else if (type === 'pace') {
                        const [min1, sec1] = v1.split(':');
                        const [min2, sec2] = v2.split(':');
                        v1 = parseInt(min1) * 60 + parseInt(sec1);
                        v2 = parseInt(min2) * 60 + parseInt(sec2);
                    } else if (!isNaN(v1) && !isNaN(v2)) {
                        v1 = parseFloat(v1);
                        v2 = parseFloat(v2);
                    }
                    return v1 > v2 ? 1 : v1 < v2 ? -1 : 0;
                };

                document.querySelectorAll('th').forEach(th => th.addEventListener('click', () => {
                    const table = th.closest('table');
                    const type = th.getAttribute('data-type');
                    const asc = th.dataset.asc !== 'true';
                    table.querySelectorAll('th').forEach(other => {
                        other.classList.remove('sort-asc', 'sort-desc');
                        if (other !== th) {
                            delete other.dataset.asc;
                        }
                    });
                    th.dataset.asc = String(asc);
                    Array.from(table.querySelectorAll('tr:nth-child(n+2)'))
                        .sort(comparer(Array.from(th.parentNode.children).indexOf(th), asc, type))
                        .forEach(tr => table.appendChild(tr));
                    th.classList.add(asc ? 'sort-asc' : 'sort-desc');
                }));
            });
        </script>
    </head>
    <body>
        <table>
            <tr>
                <th data-type="number">Run Number</th>
                <th data-type="date">Date</th>
                <th data-type="text">Time</th>
                <th data-type="number">Distance (km)</th>
                <th data-type="pace">Average Pace (min/km)</th>
            </tr>
    """
    html_rows = [
        f"""
            <tr>
                <td>{run_info['Run Number']}</td>
                <td>{run_info['Date']}</td>
                <td>{run_info['Time']}</td>
                <td>{run_info['Distance (km)']}</td>
                <td>{run_info['Average Pace']}</td>
            </tr>
        """
        for run_info in runs_info
    ]
    html_tail = """
        </table>
    </body>
    </html>
    """
    # Explicit UTF-8, matching the other HTML writers in this notebook.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_head + ''.join(html_rows) + html_tail)

# Optimized function to save statistics HTML with additional yearly statistics
def save_statistics_html(tracks, output_file='statistics.html'):
    """Write an HTML summary of overall and current-year running statistics.

    Parameters:
        tracks: list of tracks; each track is a list of
            ``(latitude, longitude, elevation, time)`` tuples.
        output_file: path of the HTML file to write (UTF-8).

    The page reports run count, total distance and average distance per run,
    both overall and for tracks whose first point falls in the current year,
    plus the date of the latest track end. With no tracks, all numbers are 0
    and the last-run date is shown as "N/A".
    """
    current_year = datetime.now().year

    # Measure each track once and reuse the result for both the overall and
    # the current-year totals.
    track_distances_m = [
        sum(calculate_distance(track[i], track[i + 1]) for i in range(len(track) - 1))
        for track in tracks
    ]

    total_runs = len(tracks)
    total_distance_km = sum(track_distances_m) / 1000
    avg_distance_km = total_distance_km / total_runs if total_runs else 0

    distances_this_year_m = [
        distance
        for track, distance in zip(tracks, track_distances_m)
        if track[0][3].year == current_year
    ]
    total_runs_this_year = len(distances_this_year_m)
    total_distance_km_this_year = sum(distances_this_year_m) / 1000
    avg_distance_km_this_year = total_distance_km_this_year / total_runs_this_year if total_runs_this_year else 0

    # max() raises on an empty sequence, so the no-tracks case is handled explicitly.
    if tracks:
        last_run_date = max(track[-1][3] for track in tracks).strftime("%d.%m.%Y")
    else:
        last_run_date = "N/A"

    html_content = f"""
    <html>
    <head>
        <style>
            body {{ font-family: Arial, sans-serif; color: #333; }}
        </style>
    </head>
    <body>
        <p><strong>Total Runs:</strong> {total_runs}</p>
        <p><strong>Total Distance (km):</strong> {total_distance_km:.3f}</p>
        <p><strong>Average Distance per Run (km):</strong> {avg_distance_km:.3f}</p>
        <br>
        <p><strong>Total Runs, This Year:</strong> {total_runs_this_year}</p>
        <p><strong>Total Distance, This Year (km):</strong> {total_distance_km_this_year:.3f}</p>
        <p><strong>Average Distance per Run, This Year (km):</strong> {avg_distance_km_this_year:.3f}</p>
        <br>
        <p><strong>Date of Last Run:</strong> {last_run_date}</p>
    </body>
    </html>
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

# Sample usage (replace this with actual tracks data)
# Each track is a list of tuples where each tuple contains (latitude, longitude, altitude, timestamp)
# Example: [(lat1, lon1, alt1, timestamp1), (lat2, lon2, alt2, timestamp2), ...]

# Example usage
# Placeholder: an empty track list standing in for real track data.
tracks = list()

### Define functions to save runs over years HTML

In [6]:
# Function to save runs over years HTML
def save_runs_over_years_html(tracks, output_file='runs_over_years.html'):
    """Write an HTML page with a bar chart of the number of runs per year.

    The year of a run is taken from the timestamp of the track's first point.
    The chart is embedded as an HTML fragment inside a minimal page written
    to ``output_file`` as UTF-8.
    """
    start_years = []
    for track in tracks:
        start_years.append(track[0][3].year)
    yearly_counts = pd.Series(start_years).value_counts().sort_index()

    axis_labels = {'index': 'Year', 'y': 'Number of Runs'}
    fig = px.bar(
        yearly_counts,
        x=yearly_counts.index,
        y=yearly_counts.values,
        labels=axis_labels,
        title='Number of Runs Over the Years',
    )
    # Transparent backgrounds so the chart blends into the host page.
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#333',
    )

    chart_fragment = fig.to_html(full_html=False)

    page = f"""
    <html>
    <head>
        <style>
            body {{ font-family: Arial, sans-serif; color: #333; }}
        </style>
    </head>
    <body>
        {chart_fragment}
    </body>
    </html>
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(page)


In [7]:
# Helper function to format pace in min:sec/km
def format_pace(seconds_per_km):
    """Format a pace given in seconds per kilometre as ``"M:SS min/km"``.

    A pace of exactly 0 is reported as ``"N/A"``. Fractional seconds are
    truncated, not rounded.
    """
    if seconds_per_km == 0:
        return "N/A"
    whole_minutes, leftover_seconds = divmod(seconds_per_km, 60)
    return f"{int(whole_minutes)}:{int(leftover_seconds):02d} min/km"

# Function to save scatter plot HTML for distance vs. pace with correct pace data and clickable year legend
def save_distance_vs_pace_html(tracks, output_file='distance_vs_pace.html'):
    """Write an HTML scatter plot of run distance against average pace.

    Parameters:
        tracks: list of tracks; each track is a list of
            ``(latitude, longitude, elevation, time)`` tuples. Run numbers are
            the 1-based positions in this list, so they match the runs list
            only when the list is already ordered by start time.
        output_file: path of the HTML file to write (UTF-8).

    Each year gets its own trace, so the legend can toggle years on and off.
    Tracks with fewer than two points are left out. With no usable tracks an
    empty chart is written.
    """
    run_columns = [
        'Run Number',
        'Date',
        'Distance (km)',
        'Total Time',
        'Average Pace (seconds/km)',
        'Average Pace (min/km)',
        'Year',
    ]

    run_data = []
    for idx, track in enumerate(tracks):
        if len(track) < 2:
            continue
        distances = np.array([calculate_distance(track[i], track[i + 1]) for i in range(len(track) - 1)])
        total_distance_km = distances.sum() / 1000
        times = np.array([(track[i + 1][3] - track[i][3]).total_seconds() for i in range(len(track) - 1)])
        total_time_seconds = times.sum()
        avg_pace_seconds_per_km = total_time_seconds / total_distance_km if total_distance_km > 0 else 0
        run_date = track[0][3].strftime("%d.%m.%Y")
        total_time = str(timedelta(seconds=int(total_time_seconds)))

        run_data.append({
            'Run Number': idx + 1,
            'Date': run_date,
            'Distance (km)': total_distance_km,
            'Total Time': total_time,
            'Average Pace (seconds/km)': avg_pace_seconds_per_km,
            'Average Pace (min/km)': format_pace(avg_pace_seconds_per_km),
            'Year': track[0][3].year
        })

    # Naming the columns keeps them present even when run_data is empty, so
    # the 'Year' lookup below cannot fail with a KeyError.
    run_df = pd.DataFrame(run_data, columns=run_columns)

    fig = go.Figure()
    years = run_df['Year'].unique()

    for year in years:
        year_data = run_df[run_df['Year'] == year]
        fig.add_trace(go.Scatter(
            x=year_data['Distance (km)'],
            y=year_data['Average Pace (seconds/km)'],
            mode='markers',
            name=str(year),
            text=[f"Run Number: {rn}<br>Date: {date}<br>Distance: {distance:.3f} km<br>Total Time: {time}<br>Pace: {pace}" for rn, date, distance, time, pace in zip(year_data['Run Number'], year_data['Date'], year_data['Distance (km)'], year_data['Total Time'], year_data['Average Pace (min/km)'])],
            hoverinfo='text'
        ))

    fig.update_layout(
        title='Distance vs. Pace',
        xaxis_title='Distance (km)',
        yaxis_title='Average Pace (min/km)',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#333',
        yaxis=dict(
            autorange='reversed',  # Reversed axis: smaller values (faster paces) are drawn toward the top
            tickvals=[i * 60 for i in range(15)],  # One tick per whole minute, 0 to 14 min/km
            ticktext=[format_pace(i * 60) for i in range(15)]
        ),
        legend_title_text='Year'
    )

    fig.update_traces(marker=dict(size=12))  # Adjust marker size if needed

    fig_html = fig.to_html(full_html=False)

    html_content = f"""
    <html>
    <head>
        <style>
            body {{ font-family: Arial, sans-serif; color: #333; }}
        </style>
    </head>
    <body>
        {fig_html}
    </body>
    </html>
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

# Sample execution (replace this with actual tracks data)
# Each track is a list of tuples where each tuple contains (latitude, longitude, altitude, timestamp)
# Example: [(lat1, lon1, alt1, timestamp1), (lat2, lon2, alt2, timestamp2), ...]

# Example usage
# Placeholder: an empty track list standing in for real track data.
tracks = list()

In [8]:
# Main execution code

# This is the extracted folder provided by strava
folder_path = '../export_133931837/activities'

# Load every activity, then tally how often each coordinate pair occurs.
tracks = read_gpx_files(folder_path)
point_counter = count_point_frequencies(tracks)

# Global and local maps are saved and displayed by create_maps itself.
create_maps(tracks, point_counter)

# Each of these reports is written to its default file name and then shown inline,
# in this order: runs list, statistics, runs over years.
for save_report, report_file in (
    (save_runs_list_html, 'runs_list.html'),
    (save_statistics_html, 'statistics.html'),
    (save_runs_over_years_html, 'runs_over_years.html'),
):
    save_report(tracks)
    display(HTML(report_file))

# The distance vs. pace scatter plot is only saved, not displayed.
save_distance_vs_pace_html(tracks, output_file='distance_vs_pace.html')

Processing ../export_133931837/activities\11504250003.gpx
Processing ../export_133931837/activities\11510835206.gpx
Processing ../export_133931837/activities\11534585640.gpx
Processing ../export_133931837/activities\11559022112.gpx
Processing ../export_133931837/activities\11573608149.gpx
Processing ../export_133931837/activities\11583303831.gpx
Processing ../export_133931837/activities\11613936059.gpx
Processing ../export_133931837/activities\11638474422.gpx
Processing ../export_133931837/activities\11651910629.gpx
Processing ../export_133931837/activities\11667437600.gpx
Processing ../export_133931837/activities\11684359214.gpx
Processing ../export_133931837/activities\11706555919.gpx
Processing ../export_133931837/activities\11714495319.gpx
Processing ../export_133931837/activities\11746337793.gpx
Processing ../export_133931837/activities\11761940871.gpx
Processing ../export_133931837/activities\11777845016.gpx
Processing ../export_133931837/activities\11800518665.gpx
Processing ../

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_points[field].fillna(df_points[f'enhanced_{field}'], inplace=True)


Processing ../export_133931837/activities\12707253196.fit.gz
Processing ../export_133931837/activities\12707253196.gpx
Skipping duplicate file ../export_133931837/activities\12707253196.gpx
Total items in folder: 96
Processed items: 95
Tracks found: 95
Unique Tracks: 94


Run Number,Date,Time,Distance (km),Average Pace (min/km)
94,2024-07-18,09:07:16 - 09:26:43 (Time: 0:19:27),4.164,4:40 min/km
93,2024-07-13,17:51:06 - 19:37:51 (Time: 1:46:45),21.173,5:02 min/km
92,2024-07-11,19:41:16 - 20:03:53 (Time: 0:22:37),5.016,4:30 min/km
91,2024-07-07,08:50:30 - 10:26:09 (Time: 1:35:39),13.045,7:19 min/km
90,2024-07-06,13:25:33 - 13:56:33 (Time: 0:31:00),6.4,4:50 min/km
89,2024-07-03,17:11:45 - 17:59:45 (Time: 0:48:00),10.034,4:47 min/km
88,2024-06-30,18:50:24 - 19:38:57 (Time: 0:48:33),10.034,4:50 min/km
87,2024-06-28,19:10:29 - 19:32:53 (Time: 0:22:24),5.016,4:27 min/km
86,2024-06-26,17:44:37 - 18:19:16 (Time: 0:34:39),7.061,4:54 min/km
85,2024-06-22,18:06:11 - 18:28:03 (Time: 0:21:52),5.019,4:21 min/km
