In [2]:
import tarfile
import os

import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely

In [3]:
import sys, os
sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *

### Read & Merge CSV Files

In [4]:
tarfile_path = TRIPS_TARFILE_PATH
unpacked_path = UNPACKED_TRIPS_DIR_PATH


In [5]:
# Unpack
with tarfile.open(tarfile_path, 'r') as tar:
	tar.extractall(DATA_DIR_PATH)

In [6]:
# find all csv files in the unpacked directory
csv_files = []

def get_csv_files_recursively(path):
	for file in os.listdir(path):
		if os.path.isdir(os.path.join(path, file)):
			get_csv_files_recursively(os.path.join(path, file))
		elif file.endswith(".csv"):
			csv_files.append(os.path.join(path, file))

get_csv_files_recursively(unpacked_path)

csv_files[:5]

['/Users/hannesvogelsang/Programming/AAA_Project/aaa-2022-magma/00_data/leipzig/2019/03/positions_nextbike_leipzig_2019_03_20.csv',
 '/Users/hannesvogelsang/Programming/AAA_Project/aaa-2022-magma/00_data/leipzig/2019/03/positions_nextbike_leipzig_2019_03_08.csv',
 '/Users/hannesvogelsang/Programming/AAA_Project/aaa-2022-magma/00_data/leipzig/2019/03/positions_nextbike_leipzig_2019_03_09.csv',
 '/Users/hannesvogelsang/Programming/AAA_Project/aaa-2022-magma/00_data/leipzig/2019/03/positions_nextbike_leipzig_2019_03_21.csv',
 '/Users/hannesvogelsang/Programming/AAA_Project/aaa-2022-magma/00_data/leipzig/2019/03/positions_nextbike_leipzig_2019_03_23.csv']

In [7]:
data_df = pd.concat(map(pd.read_csv, csv_files))

In [8]:
data_df.head(2)

Unnamed: 0,p_spot,b_lock_types,p_maintenance,p_bike_racks,p_terminal_type,p_place_type,p_number,p_rack_locks,p_uid,b_state,...,city,p_booked_bikes,p_name,b_bike_type,p_lat,b_pedelec_battery,p_bike_types,b_battery_pack,p_special_racks,p_free_special_racks
0,False,frame_lock,False,0,,12,0.0,False,13518747,ok,...,leipzig,0,BIKE 23510,71,51.338882,0.0,"{""71"": 1}",,,
1,False,frame_lock,False,0,,12,0.0,False,13518747,ok,...,leipzig,0,BIKE 23510,71,51.338882,0.0,"{""71"": 1}",,,


### Remove Insignificant Columns

In [9]:
# find columns with only one value
columns_with_one_unique_value = data_df.columns[data_df.nunique() == 1]
columns_with_one_unique_value

Index(['p_maintenance', 'p_bike_racks', 'p_rack_locks', 'b_state',
       'p_free_racks', 'b_active', 'city', 'p_special_racks',
       'p_free_special_racks'],
      dtype='object')

In [10]:

data_df = data_df.drop(columns=columns_with_one_unique_value)


In [11]:
rename_dict = {
	"p_lat": "lat",
	"p_lng": "lng",
}

data_df = data_df.rename(columns=rename_dict)

### Remove Trips Outside of Flexzone

In [12]:
flexzone = gpd.read_file(FLEXZONE_GEOJSON_PATH)

In [13]:
flexzone.geometry = flexzone.geometry.map(lambda polygon: shapely.ops.transform(lambda x, y: (y, x), polygon))
flexzone

Unnamed: 0,color,fill,name,domain,cityId,category,serviceCases,geometry
0,#00CBFF,#00CBFF,LE_Blau_West,le,1,free_return,[ ],"POLYGON ((51.30176 12.32817, 51.30065 12.32812..."
1,#ff00e7,#ff00e7,LE_Pink_Ost,le,1,chargeable_return,"[ ""21"" ]","POLYGON ((51.33200 12.35440, 51.33206 12.35397..."
2,#00CBFF,#00CBFF,LE_Blau_Ost,le,1,free_return,[ ],"POLYGON ((51.32113 12.36297, 51.32118 12.36301..."
3,#ff00e7,#ff00e7,LE_Pink_West,le,1,chargeable_return,[ ],"POLYGON ((51.31175 12.32583, 51.31167 12.32581..."
4,#f2f3f5,#f2f3f5,FlexZone Leipzig Brünnerstr.,le,1,free_return,[ ],"POLYGON ((51.32514 12.30716, 51.32525 12.30754..."
5,#00CBFF,#00CBFF,LE_Blau_Schoenefeld,le,1,free_return,[ ],"POLYGON ((51.36009 12.40877, 51.35998 12.40915..."
6,#ff00e7,#ff00e7,LE_Pink_Schoenefeld,le,1,chargeable_return,[ ],"POLYGON ((51.35528 12.41517, 51.35523 12.41525..."


In [14]:
lat_mean, lng_mean = data_df.lat.mean(), data_df.lng.mean()

In [15]:
flexzone_swapped = flexzone.copy()
flexzone_swapped.geometry = flexzone.geometry.map(lambda polygon: shapely.ops.transform(lambda x, y: (y, x), polygon))
flexzone_swapped.head(2)

Unnamed: 0,color,fill,name,domain,cityId,category,serviceCases,geometry
0,#00CBFF,#00CBFF,LE_Blau_West,le,1,free_return,[ ],"POLYGON ((12.32817 51.30176, 12.32812 51.30065..."
1,#ff00e7,#ff00e7,LE_Pink_Ost,le,1,chargeable_return,"[ ""21"" ]","POLYGON ((12.35440 51.33200, 12.35397 51.33206..."


In [16]:
import folium

fmap = folium.Map(location=[lat_mean, lng_mean], zoom_start=12)
style_function = lambda x: {
    "fillColor": x['properties']['color']
}
folium.GeoJson(flexzone_swapped.to_json(), style_function=style_function).add_to(fmap)
fmap

In [17]:
data_df = data_df.reset_index(drop=True)
data_geodf = gpd.GeoDataFrame(data_df, geometry=gpd.points_from_xy(data_df.lat, data_df.lng))

In [18]:
import folium

import folium

fmap = folium.Map(location=[lat_mean, lng_mean], zoom_start=12)
style_function = lambda x: {
    "fillColor": x['properties']['color']
}
folium.GeoJson(flexzone_swapped.to_json(), style_function=style_function).add_to(fmap)

for index, row in data_geodf[:1000].iterrows():
	folium.CircleMarker(
		location=[row.lat, row.lng],
		radius=1,
		color="red",
		fill=True,
		fill_color="red",
		fill_opacity=0.5,
	).add_to(fmap)
	

fmap

In [19]:
data_geodf = data_geodf.set_crs(epsg=4326)

In [20]:
flexzone_free = flexzone[flexzone['category'] == 'free_return']
flexzone_charged = flexzone[flexzone['category'] == 'chargeable_return']

In [21]:
points_within_free = gpd.sjoin(data_geodf, flexzone_free, predicate='within')
points_within_charged = gpd.sjoin(data_geodf, flexzone_charged, predicate='within')
points_within_free.head(2)

Unnamed: 0,p_spot,b_lock_types,p_terminal_type,p_place_type,p_number,p_uid,b_number,b_boardcomputer,datetime,lng,...,b_battery_pack,geometry,index_right,color,fill,name,domain,cityId,category,serviceCases
2,False,frame_lock,,12,0.0,13547389,23510,7551005154,2019-03-20 15:12:00,12.380121,...,,POINT (51.33925 12.38012),2,#00CBFF,#00CBFF,LE_Blau_Ost,le,1,free_return,[ ]
3,False,frame_lock,,12,0.0,13547389,23510,7551005154,2019-03-20 23:59:00,12.38003,...,,POINT (51.33932 12.38003),2,#00CBFF,#00CBFF,LE_Blau_Ost,le,1,free_return,[ ]


In [22]:
data_df['in_free_flexzone'] = data_df.index.isin(points_within_free.index)
data_df['in_charged_flexzone'] = data_df.index.isin(points_within_charged.index)
data_df.in_free_flexzone.value_counts()

False    1370051
True      763595
Name: in_free_flexzone, dtype: int64

In [23]:

data_df.in_charged_flexzone.value_counts()

True     1524756
False     608890
Name: in_charged_flexzone, dtype: int64

In [24]:
flexzone_loose = flexzone.dissolve()
flexzone_loose = flexzone_loose.to_crs(epsg=3763)
flexzone_loose = flexzone_loose.buffer(10000)
flexzone_loose = flexzone_loose.to_crs(epsg=4326)

In [25]:
import folium

fmap = folium.Map(location=[data_df.lat.mean(), data_df.lng.mean()], zoom_start=12)
folium.features.Choropleth(
	geo_data=flexzone_loose.geometry.map(lambda polygon: shapely.ops.transform(lambda x, y: (y, x), polygon)).to_json(),
).add_to(fmap)

for index, row in data_geodf[:1000].iterrows():
	folium.CircleMarker(
		location=[row.lat, row.lng],
		radius=1,
		color="red",
		fill=True,
		fill_color="red",
		fill_opacity=0.5,
	).add_to(fmap)
	

fmap

In [26]:
flexzone_loose = flexzone_loose.to_crs(epsg=4326)
flexzone_loose = flexzone_loose.to_frame('geometry')

In [27]:

points_within = gpd.sjoin(data_geodf, flexzone_loose, predicate='within')
points_within.head(2)

Unnamed: 0,p_spot,b_lock_types,p_terminal_type,p_place_type,p_number,p_uid,b_number,b_boardcomputer,datetime,lng,...,trip,p_booked_bikes,p_name,b_bike_type,lat,b_pedelec_battery,p_bike_types,b_battery_pack,geometry,index_right
0,False,frame_lock,,12,0.0,13518747,23510,7551005154,2019-03-20 00:00:00,12.394574,...,first,0,BIKE 23510,71,51.338882,0.0,"{""71"": 1}",,POINT (51.33888 12.39457),0
1,False,frame_lock,,12,0.0,13518747,23510,7551005154,2019-03-20 15:03:00,12.394574,...,start,0,BIKE 23510,71,51.338882,0.0,"{""71"": 1}",,POINT (51.33888 12.39457),0


In [28]:
data_df['in_flexzone_loose'] = data_df.index.isin(points_within.index)

In [29]:
data_df.in_flexzone_loose.value_counts()

True     2130172
False       3474
Name: in_flexzone_loose, dtype: int64

In [30]:
data_df = data_df[data_df.in_flexzone_loose]
data_df = data_df.drop(columns=['in_flexzone_loose'])

In [31]:
data_df.datetime = pd.to_datetime(data_df.datetime)

### Remove Trips Outside Of Observed Time Interval

In [32]:
data_df = data_df[(data_df.datetime >= "2019-01-01") & (data_df.datetime <= "2019-12-31")]

In [33]:
data_df.to_pickle(ORIGINAL_DATA_MERGED_PATH)

# Trip Data

### Merge Location Data Into Trip Data

In [34]:
data_df = data_df.sort_values('datetime')

In [35]:
data_df = data_df.reset_index(drop=True)

In [36]:
unique_bike_numbers = data_df.b_number.unique()
len(unique_bike_numbers)

1070

In [37]:
data_df.trip.isna().sum()

0

In [38]:
def create_movements(bike_number):
    bike_locations = data_df.loc[
        data_df.b_number == bike_number
    ]
    bike_locations_a = bike_locations
    bike_locations_b = bike_locations.shift(-1)

    bike_movements = bike_locations_a.merge(
        bike_locations_b,
        left_index=True,
        right_index=True,
        suffixes=('_start', '_end'),
    )
    # due to shift & merge we create columns where end is null
    bike_movements = bike_movements.dropna(subset=['trip_start', 'trip_end'])
    return bike_movements


In [39]:
results = []
for bike_number in unique_bike_numbers:
	results.append(create_movements(bike_number))

In [40]:
movements = pd.concat(results)

In [41]:
to_drop = movements[((movements['lat_end'] == movements['lat_start']) & (movements['lng_end'] == movements['lng_start']))]
movements = movements.drop(to_drop.index)

In [42]:
import warnings
warnings.filterwarnings("ignore")

trips = movements.loc[(movements.trip_start == 'start') & (movements.trip_end == 'end')]


# relocations are movements - trips 
relocations = movements.loc[(movements.trip_start != 'start') | (movements.trip_end != 'end')]

trips['type'] = 'trip'
relocations['type'] = relocations['trip_start'] + '_' + relocations['trip_end']

warnings.filterwarnings("default")

### Remove Redundant Columns

In [43]:
unchanging_columns = [
	'b_lock_types',
	'b_bike_type',
	'b_electric_lock',
	'b_number',
	'b_boardcomputer'
]


In [44]:
unchanging_end = [col + "_end" for col in unchanging_columns]
unchaning_rename_dict = {col + "_start": col for col in unchanging_columns}

trips = trips.drop(columns=unchanging_end)
trips = trips.rename(columns=unchaning_rename_dict)

relocations = relocations.drop(columns=unchanging_end)
relocations = relocations.rename(columns=unchaning_rename_dict)

movements = movements.drop(columns=unchanging_end)
movements = movements.rename(columns=unchaning_rename_dict)

### Cleaning According To Distance

In [45]:
def haversine(lng1, lat1, lng2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    
    author: derricw (https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas/29546836#29546836)
    """
    lng1, lat1, lng2, lat2 = map(np.radians, [lng1, lat1, lng2, lat2])

    dlng = lng2 - lng1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [46]:
trips["min_distance"] = haversine(
    trips.lng_start, trips.lat_start, trips.lng_end, trips.lat_end
)
relocations["min_distance"] = haversine(
    relocations.lng_start,
    relocations.lat_start,
    relocations.lng_end,
    relocations.lat_end,
)


In [47]:
# drop relocations under 100 meters
relocations = relocations.loc[relocations['min_distance'] > 0.1]

In [48]:
trips["duration"] = (trips.datetime_end - trips.datetime_start).dt.total_seconds() / 60
relocations["duration"] = (relocations.datetime_end - relocations.datetime_start).dt.total_seconds() / 60

In [49]:
print(f"minimum duration is {trips.duration.min():.2f} minutes")
print(f"maximum duration is {(trips.duration.max()/60):.2f} hours")

minimum duration is 2.00 minutes
maximum duration is 456.27 hours


We'll keep this, explain later

In [50]:
trips["min_avg_speed"] = trips.min_distance / (trips.duration / 60) 
relocations["min_avg_speed"] = relocations.min_distance / (relocations.duration / 60)

We will omit all trips that exceed the speed of 25kmh, which is the limit for e-bikes in germany [source](https://www.giant-bicycles.com/de/campaigns/wie-schnell-fahrt-ein-e-bike/21531). This seems plausible as trips that exceed this limit are very likely to be faulty because they would need to cycle faster than the maximum speed of e-bikes without any stops during the trip. Also, our distance column is calculated as the distance between the start and end station, which is a lower bound on the actual distance traveled. Therefore the actual distance traveled is most likely longer and the actual speed is most likely lower.



In [51]:
speed_limit = 25
trips_above_speed_limit = trips[trips.min_avg_speed > speed_limit]
print(
    f"{len(trips_above_speed_limit)} trips above speed limit, "
    + f"that is {len(trips_above_speed_limit) / len(trips) * 100:.4f}%"
)

trips = trips[trips.min_avg_speed < speed_limit]

186 trips above speed limit, that is 0.0323%


In [52]:
os.makedirs(PROCESSED_DATA_DIR_PATH, exist_ok=True)

trips = trips.drop(columns=['geometry_end', 'geometry_start'])
relocations = relocations.drop(columns=['geometry_end', 'geometry_start'])
movements = movements.drop(columns=['geometry_end', 'geometry_start'])

trips.to_parquet(TRIPS_PATH.replace(".pkl", ".parquet"))
relocations.to_parquet(RELOCATIONS_PATH.replace(".pkl", ".parquet"))
movements.to_parquet(MOVEMENTS_PATH.replace(".pkl", ".parquet"))