In [1]:
# As we use our own external modules, the project root (the parent folder of
# this notebook) must be importable. We do not expect the reader to add it to
# the PYTHONPATH env variable, therefore we append it to sys.path temporarily
# in each notebook.
import os
import sys  # required for sys.path below; without it this cell raises NameError

module_path = os.path.abspath(os.path.join('..'))
# Only append once so that re-running the cell does not duplicate the entry.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:

import pandas as pd
from timeit import default_timer as timer
import json
import folium
from utils.config import PATH_DIR_TRIPS_RAW, PATH_TRIPS

In [3]:
start = timer()

# Read every raw CSV file found anywhere below PATH_DIR_TRIPS_RAW into its own
# dataframe and merge them afterwards.
dfs = []
for root, subdirs, files in os.walk(PATH_DIR_TRIPS_RAW):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # the sub-directories in place (which steers the traversal) and the file
    # names makes the row order of the merged dataframe reproducible.
    subdirs.sort()
    for file in sorted(files):
        # Skip anything that is not a CSV file (hidden files, checkpoints, ...)
        # instead of handing it to read_csv.
        if not file.lower().endswith('.csv'):
            continue
        path_to_csv = os.path.join(root, file)
        df = pd.read_csv(path_to_csv)
        dfs.append(df)

# Fail with a message that names the folder instead of pandas' generic
# "No objects to concatenate" when the folder is missing or empty.
if not dfs:
    raise ValueError(f"No csv files found in {PATH_DIR_TRIPS_RAW}")

# Each file keeps its own row index, so index labels repeat across files.
trips_raw = pd.concat(dfs)

end = timer()
print(f"Succesfully merged csv data into one dataframe in {(end - start):.2f} seconds")

/home/moritz/data/Uni/Bachelorseminar/Trips
Succesfully merged csv data into one dataframe in 3.27 seconds


In [4]:
# Print the column overview (dtypes and non-null counts) of the merged raw data.
trips_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1055091 entries, 0 to 5079
Data columns (total 24 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   id                   1055091 non-null  object 
 1   provider             1055091 non-null  object 
 2   city                 1055091 non-null  object 
 3   vehicleType          1055091 non-null  object 
 4   model                616634 non-null   object 
 5   datetime_start       1055091 non-null  object 
 6   date_start           1055091 non-null  int64  
 7   time_start           1055091 non-null  int64  
 8   datetime_end         1055091 non-null  object 
 9   date_end             1055091 non-null  int64  
 10  time_end             1055091 non-null  int64  
 11  longitude_start      1055091 non-null  float64
 12  latitude_start       1055091 non-null  float64
 13  longitude_end        1055091 non-null  float64
 14  latitude_end         1055091 non-null  float64
 15  f

The relevant columns (id, provider, vehicleType, datetime_start, datetime_end, longitude_start, longitude_end, latitude_start, latitude_end, distance) do not contain any null values. Therefore, we do not need to apply any deletion or imputation strategies.

In [5]:
# Pool the start and end coordinates of all trips into one series per axis.
lons = pd.concat([trips_raw[f'longitude_{endpoint}'] for endpoint in ('start', 'end')])
lats = pd.concat([trips_raw[f'latitude_{endpoint}'] for endpoint in ('start', 'end')])

In [6]:
# The extremes of all observed coordinates span the bounding box of the data.
lat_min, lat_max = lats.min(), lats.max()
lon_min, lon_max = lons.min(), lons.max()

# Outline of the bounding box as (lat, lon) pairs. The first corner is repeated
# at the end so that the polyline returns to its starting point.
points = [
    (lat_max, lon_min),
    (lat_min, lon_min),
    (lat_min, lon_max),
    (lat_max, lon_max),
    (lat_max, lon_min),
]

In [7]:
# Initial centre of the map view as (lat, lon).
# NOTE(review): hard-coded coordinates, presumably the centre of the studied city - confirm.
map_center = (50.9253, 6.9495)
fmap = folium.Map(location=map_center, zoom_start=11, control_scale=True, max_zoom=20)

# Draw the outline given by `points` on top of the map and render it.
bounding_box = folium.PolyLine(locations=points)
bounding_box.add_to(fmap)
display(fmap)

All trip start and end locations fall within the blue rectangle. This seems plausible, so there are no outliers in the geospatial data.

In [8]:
# Parse both timestamp columns from strings into pandas datetimes.
for column in ('datetime_start', 'datetime_end'):
    trips_raw[column] = pd.to_datetime(trips_raw[column], format='%Y%m%d-%H%M%S')

In [9]:
# Combine latitude and longitude into one (lat, lon) tuple column per trip endpoint.
for endpoint in ("start", "end"):
    latitudes = trips_raw[f"latitude_{endpoint}"]
    longitudes = trips_raw[f"longitude_{endpoint}"]
    trips_raw[endpoint] = list(zip(latitudes, longitudes))

In [10]:
# Replace the vehicle type label that contains a space with an underscore variant.
vehicle_type_renames = {'kick scooter': 'kick_scooter'}
trips_raw["vehicleType"] = trips_raw["vehicleType"].replace(vehicle_type_renames)

In [11]:
start = timer()

# Make sure the target folder exists before writing. os.path.dirname returns
# an empty string for a bare file name, and os.makedirs('') raises
# FileNotFoundError, so the folder is only created when there is one to create.
target_dir = os.path.dirname(PATH_TRIPS)
if target_dir:
    os.makedirs(target_dir, exist_ok=True)

# Persist the preprocessed dataframe as a pickle file at PATH_TRIPS.
trips_raw.to_pickle(PATH_TRIPS)

end = timer()
print(f"Succesfully saved dataframe to pickle in {(end - start):.2f} seconds")

Succesfully saved dataframe to pickle in 0.86 seconds


In [12]:
# Print the column overview again to check the dtypes after preprocessing.
trips_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1055091 entries, 0 to 5079
Data columns (total 26 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   id                   1055091 non-null  object        
 1   provider             1055091 non-null  object        
 2   city                 1055091 non-null  object        
 3   vehicleType          1055091 non-null  object        
 4   model                616634 non-null   object        
 5   datetime_start       1055091 non-null  datetime64[ns]
 6   date_start           1055091 non-null  int64         
 7   time_start           1055091 non-null  int64         
 8   datetime_end         1055091 non-null  datetime64[ns]
 9   date_end             1055091 non-null  int64         
 10  time_end             1055091 non-null  int64         
 11  longitude_start      1055091 non-null  float64       
 12  latitude_start       1055091 non-null  float64       
 13  