In [64]:
import tarfile
import os

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

### Read & Merge CSV Files

In [65]:
data_dir_path = "../00_data"
tarfile_path = os.path.join(data_dir_path, "leipzig.tar")
unpacked_path = os.path.join(data_dir_path, "leipzig")


In [66]:
# Unpack
with tarfile.open(tarfile_path, 'r') as tar:
	tar.extractall(data_dir_path)

In [67]:
# find all csv files in the unpacked directory
csv_files = []

def get_csv_files_recursively(path):
	for file in os.listdir(path):
		if os.path.isdir(os.path.join(path, file)):
			get_csv_files_recursively(os.path.join(path, file))
		elif file.endswith(".csv"):
			csv_files.append(os.path.join(path, file))

get_csv_files_recursively(unpacked_path)

csv_files[:5]

['../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_03.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_13.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_19.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_06.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_01.csv']

In [68]:
data_df = pd.concat(map(pd.read_csv, csv_files))

In [69]:
data_df.head(2)

Unnamed: 0,p_spot,p_free_racks,b_battery_pack,b_active,b_pedelec_battery,p_address,p_bike_racks,p_lat,b_lock_types,p_free_special_racks,...,p_booked_bikes,b_electric_lock,p_lng,p_rack_locks,b_number,p_bike_types,p_maintenance,p_special_racks,b_boardcomputer,city
0,True,0,,True,,,0,51.34397,frame_lock,0.0,...,0,True,12.383174,False,23138,"{""71"": 1}",False,0.0,7551005157,leipzig
1,True,0,,True,,,0,51.34397,frame_lock,0.0,...,0,True,12.383174,False,23138,"{""71"": 3}",False,0.0,7551005157,leipzig


### Remove Insignificant Columns

In [70]:
# find columns with only one value
columns_with_unique_values = data_df.columns[data_df.nunique() == 1]
columns_with_unique_values

Index(['p_free_racks', 'b_active', 'p_bike_racks', 'p_free_special_racks',
       'b_state', 'p_rack_locks', 'p_maintenance', 'p_special_racks', 'city'],
      dtype='object')

In [71]:

data_df = data_df.drop(columns=columns_with_unique_values)


### Column Renaming

In [72]:
rename_dict = {
	"p_lat": "lat",
	"p_lng": "lng",
}

data_df = data_df.rename(columns=rename_dict)

### Merge Location Data Into Trip Data

In [73]:
data_df.datetime = pd.to_datetime(data_df.datetime)

In [74]:
data_df = data_df.sort_values('datetime')

In [75]:
data_df = data_df.reset_index(drop=True)

In [76]:
trips = pd.DataFrame()

In [77]:
location_data_trips = data_df[(data_df.trip == 'start') | (data_df.trip == 'end')]

In [78]:
def create_trips(bike_number):
    bike_locations = location_data_trips[location_data_trips.b_number == bike_number]
    bike_start_locations = bike_locations
    bike_end_locations = bike_locations.shift(-1)

    bike_trips = bike_start_locations.merge(
        bike_end_locations,
        left_index=True,
        right_index=True,
        suffixes=("_start", "_end"),
    )

    bike_trips = bike_trips.loc[
        (bike_trips.trip_start == "start") & (bike_trips.trip_end == "end")
    ]

    return bike_trips


In [79]:
unique_bike_numbers = location_data_trips.b_number.unique()
len(unique_bike_numbers)

1054

In [80]:
for bike_number in tqdm(unique_bike_numbers):
	trips = pd.concat([trips, create_trips(bike_number)])

  0%|          | 0/1054 [00:00<?, ?it/s]

In [81]:
trips.head(2)

Unnamed: 0,p_spot_start,b_battery_pack_start,b_pedelec_battery_start,p_address_start,lat_start,b_lock_types_start,p_name_start,p_number_start,p_bikes_start,p_uid_start,...,p_bike_end,trip_end,p_terminal_type_end,b_bike_type_end,p_booked_bikes_end,b_electric_lock_end,lng_end,b_number_end,p_bike_types_end,b_boardcomputer_end
396,False,,0.0,,51.321272,frame_lock,Karl-Tauchnitz-Straße & Wundtstraße,0.0,1,12058006,...,True,end,,71.0,0.0,True,12.341924,23255.0,"{""71"": 1}",7551007000.0
2934,False,,0.0,,51.327567,frame_lock,BIKE 23255,0.0,1,12126492,...,True,end,,71.0,0.0,True,12.368997,23255.0,"{""71"": 1}",7551007000.0


### Remove Redundant Columns

In [82]:
unchanging_columns = [
	'b_lock_types',
	'trip', # actually changing but implicitly given
	'b_bike_type',
	'b_electric_lock',
	'b_number',
	'b_boardcomputer'
]


In [83]:
unchanging_end = [col + "_end" for col in unchanging_columns]
unchaning_rename_dict = {col + "_start": col for col in unchanging_columns}

trips = trips.drop(columns=unchanging_end)
trips = trips.rename(columns=unchaning_rename_dict)

### Cleaning

In [84]:
def haversine(lng1, lat1, lng2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    
    author: derricw (https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas/29546836#29546836)
    """
    lng1, lat1, lng2, lat2 = map(np.radians, [lng1, lat1, lng2, lat2])

    dlng = lng2 - lng1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [85]:
trips["min_distance"] = haversine(
    trips.lng_start, trips.lat_start, trips.lng_end, trips.lat_end
)

In [86]:
trips["duration"] = (trips.datetime_end - trips.datetime_start).dt.total_seconds() / 60

In [87]:
print(f"minimum duration is {trips.duration.min():.2f} minutes")
print(f"maximum duration is {(trips.duration.max()/60):.2f} hours")

minimum duration is 2.00 minutes
maximum duration is 23.78 hours


We'll keep this, explain later

In [88]:
trips["min_avg_speed"] = trips.min_distance / (trips.duration / 60) 

We will omit all trips that exceed the speed of 25kmh, which is the limit for e-bikes in germany [source](https://www.giant-bicycles.com/de/campaigns/wie-schnell-fahrt-ein-e-bike/21531). This seems plausible as trips that exceed this limit are very likely to be faulty because they would need to cycle faster than the maximum speed of e-bikes without any stops during the trip. Also, our distance column is calculated as the distance between the start and end station, which is a lower bound on the actual distance traveled. Therefore the actual distance traveled is most likely longer and the actual speed is most likely lower.



In [89]:
speed_limit = 25
trips_above_speed_limit = trips[trips.min_avg_speed > speed_limit]
print(
    f"{len(trips_above_speed_limit)} trips above speed limit, "
    + f"that is {len(trips_above_speed_limit) / len(trips) * 100:.4f}%"
)

trips = trips[trips.min_avg_speed < speed_limit]

267 trips above speed limit, that is 0.0325%


In [90]:
location_data_first_last = data_df.loc[
    (data_df["trip"] == "last") | (data_df["trip"] == "first")
]


def create_last_first_tuple(bike_number):
    bike_locations = location_data_first_last[
        location_data_first_last.b_number == bike_number
    ]

    bike_first_locations = bike_locations
    bike_last_locations = bike_locations.shift(-1)

    bike_first_last_tuple = bike_first_locations.merge(
        bike_last_locations,
        left_index=True,
        right_index=True,
        suffixes=("_first", "_last"),
    )

    bike_first_last_tuple = bike_first_last_tuple.loc[
        (bike_first_last_tuple.trip_first == "first")
        & (bike_first_last_tuple.trip_last == "last")
    ]

    return bike_first_last_tuple


In [91]:
last_first_tuples = pd.DataFrame()

for bike_number in tqdm(unique_bike_numbers):
    last_first_tuples = pd.concat([last_first_tuples, create_last_first_tuple(bike_number)])


  0%|          | 0/1054 [00:00<?, ?it/s]

In [None]:
last_first_tuples.head(2)

In [92]:
last_first_tuples["distance"] = haversine(
    last_first_tuples.lng_first,
    last_first_tuples.lat_first,
    last_first_tuples.lng_last,
    last_first_tuples.lat_last,
)


In [97]:
def describe(distances):
	minimum, maximum, mean, median = distances.min(), distances.max(), distances.mean(), distances.median()
	print(f"minimum distance is {minimum:.2f} meters")
	print(f"maximum distance is {maximum:.2f} meters")
	print(f"mean distance is {mean:.2f} meters")
	print(f"median distance is {median:.2f} meters")

In [98]:
describe(last_first_tuples.distance)

minimum distance is 0.00 meters
maximum distance is 783.35 meters
mean distance is 1.77 meters
median distance is 1.21 meters


In [93]:
data_df['date'] = data_df.datetime.dt.date

In [94]:
def create_end_last_tuple(bike_number):
    last_trips_location = (
        data_df.loc[(data_df.b_number == bike_number) & data_df["trip"] == "end"]
        .groupby("date")
        .last()
    )
    last_location = data_df.loc[
        (data_df.b_number == bike_number) & data_df["trip"] == "last"
    ]

    return last_trips_location.merge(
        last_location,
        on="date",
        suffixes=("_last", "_end")
    )


In [95]:
end_last_tuples = pd.DataFrame()

for bike_number in tqdm(unique_bike_numbers):
	end_last_tuples = pd.concat([end_last_tuples, create_end_last_tuple(bike_number)])

  0%|          | 0/1054 [00:00<?, ?it/s]

In [None]:
end_last_tuples.head(2)

In [101]:
end_last_tuples["distance"] = haversine(
    end_last_tuples.lng_last,
    end_last_tuples.lat_last,
    end_last_tuples.lng_end,
    end_last_tuples.lat_end,
)

In [102]:
describe(end_last_tuples.distance)

minimum distance is nan meters
maximum distance is nan meters
mean distance is nan meters
median distance is nan meters
