In [1]:
import pandas as pd
from gpxcsv import gpxtolist
from pathlib import Path
from config import config

Defining a couple of functions to glob the gpx files and read them to a pandas dataframe

In [4]:
def glob_files(folder: str) -> list:
    '''
    Returns a list of all the gpx files in the input folder
    '''
    
    gpx_folder = Path(folder)

    return [x for x in gpx_folder.glob('*.gpx')]

def gpxs_to_df(filelist: list) -> pd.DataFrame:
    '''
    iterates through the filelist and appends all the data to a dataframe
    '''
    
    df = pd.DataFrame()
    
    for filepath in filelist:
        temp_df = pd.DataFrame(gpxtolist(str(filepath)))
        df = pd.concat([df, temp_df], ignore_index=True)
        
    return df

Running the functions and checking the resulting dataframe

In [5]:
filelist = glob_files(config["gpx_folder"])

df = gpxs_to_df(filelist)

df

Unnamed: 0,lat,lon,speed,course,time
0,50.721499,-1.997310,2.810000,234.140625,2022-06-27T19:04:08.368Z
1,50.721435,-1.997435,2.840000,227.109375,2022-06-27T19:04:12.456Z
2,50.721368,-1.997555,2.720000,230.976562,2022-06-27T19:04:16.503Z
3,50.721306,-1.997672,2.760000,234.843750,2022-06-27T19:04:20.147Z
4,50.721252,-1.997816,2.880000,240.117188,2022-06-27T19:04:24.174Z
...,...,...,...,...,...
107137,50.766310,-1.188248,4.895547,107.131947,2022-09-27T16:06:33.046Z
107138,50.766292,-1.188096,5.339956,103.700630,2022-09-27T16:06:35.063Z
107139,50.766281,-1.187933,5.709530,96.992155,2022-09-27T16:06:37.120Z
107140,50.707670,-1.969139,1.920390,27.523740,2022-04-02T14:03:08.131Z


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107142 entries, 0 to 107141
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   lat     107142 non-null  float64
 1   lon     107142 non-null  float64
 2   speed   107142 non-null  float64
 3   course  107142 non-null  float64
 4   time    107142 non-null  object 
dtypes: float64(4), object(1)
memory usage: 4.1+ MB


Having a look at the first 4 colums, looking for outliers

In [7]:
df.lat.describe()

count    107142.000000
mean         50.635248
std           0.275340
min          49.581481
25%          50.656247
50%          50.733327
75%          50.771276
max          50.902602
Name: lat, dtype: float64

In [8]:
df.lon.describe()

count    107142.000000
mean         -1.257214
std           0.462826
min          -2.037418
25%          -1.619437
50%          -1.237366
75%          -0.953824
max           0.060685
Name: lon, dtype: float64

In [9]:
df.speed.describe()

count    107142.000000
mean          3.216125
std           1.716835
min          -1.000000
25%           2.419019
50%           3.113813
75%           3.640235
max          18.450001
Name: speed, dtype: float64

In [10]:
df.course.describe()

count    107142.000000
mean        170.489783
std          91.491364
min          -1.000000
25%         101.611816
50%         160.982765
75%         242.933410
max         359.999939
Name: course, dtype: float64

There doesn't seem to be any outlier in latitude and longitude

-1 values in speed and course are suspicious, shouldn't their values be always greater than 0?

All entries with speed <0 have value -1, for all of them (329 datapoints) the course also has value -1

In [11]:
df[(df["course"]<0)].speed.describe()

count    590.000000
mean      -0.283467
std        1.051606
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        0.000000
max        4.641315
Name: speed, dtype: float64

All entries with course <0 have value -1, for some of them (590 datapoints) speed has a positive value

I will consider all points with speed or course <0 to be outliers and remove them from the dataframe

In [12]:
df = df[~((df["speed"]<0) | (df["course"]<0))]

len(df)

106552

Convert time column datatype to datetime

In [13]:
df.time = pd.to_datetime(df.time)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106552 entries, 0 to 107141
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype              
---  ------  --------------   -----              
 0   lat     106552 non-null  float64            
 1   lon     106552 non-null  float64            
 2   speed   106552 non-null  float64            
 3   course  106552 non-null  float64            
 4   time    106552 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(4)
memory usage: 4.9 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.time = pd.to_datetime(df.time)


Find what percentage of tracks exceeded 10 knots

I'm assuming the speed in the gpx files is recorded in m/s, if so then greater than 10 knots means >= 5.15 m/s

In [18]:
100 * len(df[df.speed>=5.15]) / len(df)

5.501539154591185

if the speed in the gpx files is recorded in knots

In [19]:
100 * len(df[df.speed>10]) / len(df)

1.434041594714318