# Exploratory Data Analysis (EDA)

## Table of Contents
1. [Dataset Overview](#dataset-overview)
2. [Format Conversion](#conversion)
3. [Handling Missing Values](#handling-missing-values)
4. [Feature Distributions](#feature-distributions)
5. [Possible Biases](#possible-biases)
6. [Correlations](#correlations)


. [Correlations](#correlations)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Upload local dataset
from google.colab import files
# Keep whatever files.upload() returns for later inspection
# (presumably a mapping of uploaded file name to its content — confirm in the Colab docs).
uploaded = files.upload()

In [None]:
# Unpack the dataset archive with the shell's unzip (IPython "!" escape).
# Later cells read from /content/Sport, so the archive is expected to extract
# a top-level Sport/ folder under /content — TODO confirm.
!unzip /content/Sport1.zip

## Dataset Overview

[Provide a high-level overview of the dataset. This should include the source of the dataset, the number of samples, the number of features, and an example showing the structure of the dataset.]

In [None]:
#!pip install gpxpy

import pandas as pd
import gpxpy

# Read the sample GPX recording of the first rider.
gpx_path = '/content/Sport/Rider1/f1.gpx'
with open(gpx_path) as f:
    gpx = gpxpy.parse(f)

# Flatten the first track: one record per track point, across all segments.
points = [
    {
        'time': p.time,
        'latitude': p.latitude,
        'longitude': p.longitude,
        'elevation': p.elevation,
    }
    for segment in gpx.tracks[0].segments
    for p in segment.points
]
df = pd.DataFrame.from_records(points)

# Rows are samples, columns are features.
num_samples, num_features = df.shape

# Report the dataset dimensions.
print(f"Number of samples: {num_samples}")
print(f"Number of features: {num_features}")

# Preview the leading rows to illustrate the structure.
print("Example data:")
print(df.head())



In [None]:
import os

# Root folder that holds one sub-folder per rider
main_path = '/content/Sport'

# Report, for every rider folder, how many recordings exist per file format
for rider_folder in sorted(os.listdir(main_path)):
    folder_path = os.path.join(main_path, rider_folder)
    if not os.path.isdir(folder_path):
        # Plain files next to the rider folders are not counted.
        continue

    entries = os.listdir(folder_path)
    gpx_files = [name for name in entries if name.endswith('.gpx')]
    tcx_files = [name for name in entries if name.endswith('.tcx')]
    print(f"Folder '{rider_folder}' contains {len(gpx_files)} .gpx files and {len(tcx_files)} .tcx files.")


## Conversion
Convert GPX and TCX files to XLSX format. To analyse all the data correctly, its format must first be unified.

In [None]:
# GPX to XLSX
import gpxpy
import gpxpy.gpx
import pandas as pd
import os

def convert_gpx_to_excel(gpx_file_path, output_file_path):
    """
    Converts a GPX file to an Excel file with one row per track point.

    Every point of every segment of every track is exported with its
    latitude, longitude, elevation and time. The timezone information of a
    timestamp is dropped (the clock time itself is kept unchanged), because
    pandas' Excel writer rejects timezone-aware datetimes.

    Parameters:
    - gpx_file_path: str, path to the input GPX file
    - output_file_path: str, path to save the output Excel file
    """
    columns = ['Latitude', 'Longitude', 'Elevation', 'Time']

    # GPX is XML: read it as UTF-8 explicitly instead of relying on the
    # platform's default text encoding.
    with open(gpx_file_path, 'r', encoding='utf-8') as gpx_file:
        gpx = gpxpy.parse(gpx_file)

    # Flatten tracks -> segments -> points into one record per point.
    all_data = [
        {
            'Latitude': point.latitude,
            'Longitude': point.longitude,
            'Elevation': point.elevation,
            # Points without a timestamp keep an empty Time cell.
            'Time': point.time.replace(tzinfo=None) if point.time else None,
        }
        for track in gpx.tracks
        for segment in track.segments
        for point in segment.points
    ]

    # Passing the column list keeps the header row (and the column order)
    # even when the file contains no track points at all.
    df = pd.DataFrame(all_data, columns=columns)

    # Write the DataFrame to an Excel file
    df.to_excel(output_file_path, index=False)


In [None]:
# TCX to XLSX
import pandas as pd
from tcxreader.tcxreader import TCXReader, TCXExercise

def convert_tcx_to_excel(tcx_file_path, output_file_path):
    """
    Converts a TCX file to an Excel file with trackpoint data.

    One row is written per trackpoint with its time, position, elevation,
    distance, heart rate, cadence and speed. Speed is read from the
    trackpoint's extension values and is left empty for trackpoints that do
    not carry it.

    Parameters:
    - tcx_file_path: str, path to the input TCX file
    - output_file_path: str, path to save the output Excel file
    """
    columns = ['Time', 'Latitude', 'Longitude', 'Elevation',
               'Distance', 'Heartrate', 'Cadence', 'Speed']

    # Read the TCX file
    tcx_reader = TCXReader()
    data: TCXExercise = tcx_reader.read(tcx_file_path)

    # Collect one record per trackpoint
    trackpoint_data = []
    for trackpoint in data.trackpoints:
        # Not every trackpoint has a Speed extension; a plain ['Speed']
        # lookup would abort the whole conversion with a KeyError.
        extensions = trackpoint.tpx_ext or {}
        trackpoint_data.append({
            'Time': trackpoint.time,
            'Latitude': trackpoint.latitude,
            'Longitude': trackpoint.longitude,
            'Elevation': trackpoint.elevation,
            'Distance': trackpoint.distance,
            'Heartrate': trackpoint.hr_value,
            'Cadence': trackpoint.cadence,
            'Speed': extensions.get('Speed'),
        })

    # Passing the column list keeps the header row even when the file
    # contains no trackpoints.
    df = pd.DataFrame(trackpoint_data, columns=columns)

    # Save the DataFrame to an Excel file
    df.to_excel(output_file_path, index=False, engine='openpyxl')


In [None]:
import os
import shutil

# Define the directories: raw recordings are read from sport_dir, converted
# spreadsheets are written below sport_xlsx_dir.
sport_dir = "/content/Sport"
sport_xlsx_dir = "/content/Sport_xlsx"

# Create the Sport_xlsx directory. exist_ok makes this a no-op when the
# directory is already there, so no separate existence check is needed.
os.makedirs(sport_xlsx_dir, exist_ok=True)

# Function to convert files in a folder
def convert_files_in_folder(rider_folder):
    """
    Converts every .gpx and .tcx file of one rider folder to .xlsx.

    Files are read from sport_dir/<rider_folder> and written to
    sport_xlsx_dir/<rider_folder>, which is created when missing. Each output
    file keeps the base name of its input and gets the .xlsx extension.
    Files with any other extension are ignored.

    Parameters:
    - rider_folder: str, name of the rider's folder inside sport_dir
    """
    rider_folder_path = os.path.join(sport_dir, rider_folder)
    rider_xlsx_folder = os.path.join(sport_xlsx_dir, rider_folder)

    # Create the rider folder in the Sport_xlsx directory if it doesn't exist
    os.makedirs(rider_xlsx_folder, exist_ok=True)

    # Converter to use for each supported file extension
    converters = {
        '.gpx': convert_gpx_to_excel,
        '.tcx': convert_tcx_to_excel,
    }

    # Sorted so the processing order is deterministic; for equal base names
    # the .tcx file is still converted after the .gpx file.
    for file_name in sorted(os.listdir(rider_folder_path)):
        # Split off only the real extension; replacing '.gpx'/'.tcx' inside
        # the whole name would also rewrite occurrences in the middle of it.
        base_name, extension = os.path.splitext(file_name)
        convert = converters.get(extension)
        if convert is None:
            continue

        convert(os.path.join(rider_folder_path, file_name),
                os.path.join(rider_xlsx_folder, base_name + '.xlsx'))


In [None]:
# Rider1 .. Rider9: the rider folders whose recordings get converted
Riders = [f'Rider{number}' for number in range(1, 10)]
for rider in Riders:
    convert_files_in_folder(rider)

## Handling Missing Values

[Identify any missing values in the dataset, and describe your approach to handle them if there are any. If there are no missing values simply indicate that there are none.]


In [None]:
# Count the missing entries in each column of df
missing_values = df.isna().sum()
missing_values


In [None]:
# Handling missing values
# Example: Replacing NaN values with the mean value of the column
# df.fillna(df.mean(), inplace=True)

# Your code for handling missing values goes here


## Feature Distributions

[Plot the distribution of various features and target variables. Comment on the skewness, outliers, or any other observations.]


In [None]:
# Example: draw one histogram per numerical feature on a 12 x 12 figure
figure_size = (12, 12)
df.hist(figsize=figure_size)
plt.show()


## Possible Biases

[Investigate the dataset for any biases that could affect the model’s performance and fairness (e.g., class imbalance, historical biases).]


In [None]:
# Example: Checking for class imbalance in a classification problem
# sns.countplot(x='target_variable', data=df)

# Your code to investigate possible biases goes here


## Correlations

[Explore correlations between features and the target variable, as well as among features themselves.]


In [None]:
# Example: Plotting a heatmap to show feature correlations
# Restrict the correlation to the numeric columns: df also holds a time
# column, and recent pandas versions no longer drop non-numeric columns
# silently in DataFrame.corr() and can raise instead.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()
