In [None]:
!pip install -e ..

In [None]:
from nextbike import preprocessing
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import contextily as ctx
import numpy as np
%matplotlib inline

In [None]:
# Load the bookings via the project's preprocessing module and drop exact
# duplicate rows (the dataset contains many duplicates)
df = preprocessing.load_df().drop_duplicates()

In [None]:
# Preview the first 15 rows
df.head(15)

In [None]:
# Column dtypes and non-null counts. `show_counts` replaces the `null_counts`
# argument, which was deprecated in pandas 1.2 and removed in pandas 2.0.
df.info(show_counts=True)

In [None]:
# Show every row that contains at least one missing value
df[df.isna().any(axis=1)]

In [None]:
# p_number is 0 for free-floating or unregistered stations; the NaN values are likely a mistake.
# Note: this replaces NaN with 0 in every column, not only in p_number.
df = df.fillna(0)

In [None]:
# Print the value distribution of each categorical-looking column, separated by a rule
columns_to_inspect = ('p_spot', 'p_number', 'p_place_type', 'trip', 'b_bike_type', 'p_bike')
for column_name in columns_to_inspect:
    print(df[column_name].value_counts(), '---------------------------------', sep='\n')

In [None]:
# Hypothesis: p_bike is always True if p_place_type is 12.
# Print the number of rows with p_place_type 12 next to the number of those rows
# that also have p_bike True (equal counts support the hypothesis), then show the rows.
is_place_type_12 = df['p_place_type'] == 12
is_place_type_12_with_bike = is_place_type_12 & (df['p_bike'] == True)
print(is_place_type_12.sum(), is_place_type_12_with_bike.sum())
df[is_place_type_12_with_bike]

In [None]:
# Hypothesis: p_bike is always True if p_spot is False
# -> show the rows where p_spot is False and p_bike is True
no_spot_mask = df['p_spot'] == False
df.loc[no_spot_mask & (df['p_bike'] == True)]

In [None]:
# Counter-check: rows where both p_spot and p_bike are False also exist
# --> What does p_bike mean in this case?
no_spot_mask = df['p_spot'] == False
df.loc[no_spot_mask & (df['p_bike'] == False)]

In [None]:
# Print the number of distinct values of p_number, p_uid and p_name (in this order):
# neither p_number nor p_uid is a unique identifier for p_name
for identifier_column in ('p_number', 'p_uid', 'p_name'):
    print(len(df[identifier_column].drop_duplicates()))

* **The data set shows columns with prefixes p and b. What do you think do they represent? Also try to find good assumptions for the meanings of the columns**
    * `p_`: Place related information
    * `b_`: Bike related information
    * `p_spot`: True if the place is a known station, False if free-floating
    * `p_place_type`: 0 if the place is a known station, 12 if free-floating
    * `datetime`: Date of the booking
    * `b_number`: Unique identifier for a bike
    * `trip`:
        * 'first': Indicates the first booking of a day for a bike
        * 'last': Indicates the last booking of a day for a bike
        * 'start': Indicates the start of a trip
        * 'end': Indicates the end of a trip
    * `p_uid`: ID of the location
    * `p_bikes`: Available bikes at the place
    * `p_lat`: Latitude of the location
    * `b_bike_type`: Type of the bike (the meaning is not clear, probably different bike versions)
    * `p_name`: Name of the location
    * `p_number`: ???
    * `p_lng`: Longitude of the location
    * `p_bike`: True if free-floating, False if known station

* **The trip column in your data set shows different values. Explain why there are not only two. Are examples with certain values for trip more informative for the analysis of mobility patterns than others?**
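    * There are more than two values because, besides 'start' and 'end' (the beginning and the end of a trip), the column also contains 'first' and 'last', which are assumed above to mark the first and last booking of a day for a bike
    * The start/end trips are more informative in order to calculate the duration of a trip and to obtain the target data format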

In [None]:
# The official nextbike stations in Mannheim
# -> https://mannheim.opendatasoft.com/explore/dataset/free_bike_status/information/
STATIONS_CSV_URL = (
    'https://mannheim.opendatasoft.com/explore/dataset/free_bike_status/download/'
    '?format=csv&timezone=Europe/Berlin&lang=de&use_labels_for_header=true&csv_separator=%3B'
)
# The export is requested with ';' as separator (csv_separator=%3B in the URL)
stations = pd.read_csv(STATIONS_CSV_URL, sep=';')

In [None]:
# Collect the distinct bike numbers that are currently listed at the Mannheim stations.
# Problem: There might be old bikes in the bookings which are not used anymore.
# Only string entries of 'Fahrradnummern' are split on ','; all other entries are skipped.
# NOTE(review): the collected numbers are strings -- confirm that they match the dtype of df['b_number'].
bike_numbers = {
    bike_number
    for number_list in stations['Fahrradnummern']
    if isinstance(number_list, str)
    for bike_number in number_list.split(',')
}

In [None]:
# Keep the bookings that happened at a known Mannheim station or with a known Mannheim bike.
# Filtering like this might not be 100% consistent because free-floating bookings with bikes
# that are not in bike_numbers are not captured.
at_known_station = df['p_uid'].isin(stations['uid'])
with_known_bike = df['b_number'].isin(bike_numbers)
mannheim_df = df[at_known_station | with_known_bike].reset_index()
mannheim_df

In [None]:
# Count the rows per trip type.
# Same problem as before: the number of start trips does not match the number of end trips
mannheim_df['trip'].value_counts()

**Filtering with the bike_numbers and stations does not work properly. It might be better to filter via the GeoJSON boundary.**

In [None]:
# Load the boundary of Mannheim as GeoJSON shape and plot the real stations vs the filtered trip locations.
# `crs` is not a parameter of `gpd.read_file` (it would only be forwarded to the I/O engine);
# the CRS is taken from the file itself.
mannheim_boundary_gdf = gpd.read_file('../data/input/mannheim_boundary.geojson')
stations_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(stations['lng'], stations['lat']), crs='EPSG:4326')
mannheim_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(mannheim_df['p_lng'], mannheim_df['p_lat']), crs='EPSG:4326')

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 16), dpi=300)

# Draw order: boundary first, then the filtered trip locations, then the stations on top
mannheim_boundary_gdf.plot(ax=ax, alpha=.6, edgecolor='blue')
mannheim_gdf.plot(ax=ax, label='Filtered trip locations', marker='x', c='red')
stations_gdf.plot(ax=ax, label='Real stations', c='midnightblue')
ctx.add_basemap(ax=ax, crs='EPSG:4326')

ax.set_title('Mannheim: Real stations vs. filtered trip locations')
ax.legend()
plt.show()

In [None]:
# All trips have to be within the polygon of Mannheim:
# take the geometry of the boundary row with index label 0
mannheim_boundary_polygon = mannheim_boundary_gdf.loc[0, 'geometry']
mannheim_boundary_polygon

In [None]:
# Turn the original df into a GeoDataFrame with one point (p_lng, p_lat) per row
booking_points = gpd.points_from_xy(df['p_lng'], df['p_lat'])
gdf = gpd.GeoDataFrame(df, geometry=booking_points, crs='EPSG:4326')
# Keep only the rows whose point lies within the Mannheim polygon
is_within_mannheim = gdf.within(mannheim_boundary_polygon)
trips_within_mannheim_gdf = gdf[is_within_mannheim]
trips_within_mannheim_gdf

In [None]:
# Plot the polygon-filtered trips on top of the city boundary.
# Now all trips are really within Mannheim.
# But: the mismatch between start and end trips is still present
#      because this data set might contain trips which cross the border of Mannheim
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)

mannheim_boundary_gdf.plot(ax=ax, edgecolor='blue', alpha=0.6)
trips_within_mannheim_gdf.plot(ax=ax, marker='x', c='red', label='Trips in Mannheim')
ctx.add_basemap(ax=ax, crs='EPSG:4326')

ax.set_title('Mannheim: Trips within the city')
ax.legend()
plt.show()

In [None]:
# Count the rows per trip type.
# As expected, there is still a mismatch between start and end trips
trips_within_mannheim_gdf['trip'].value_counts()

**Clean the data set**

In [None]:
# Rows of type 'first' or 'last' are not relevant, so remove them and renumber the index from 0
is_first_or_last = trips_within_mannheim_gdf['trip'].isin(['first', 'last'])
trips_within_mannheim_gdf = trips_within_mannheim_gdf[~is_first_or_last].reset_index(drop=True)
trips_within_mannheim_gdf

In [None]:
# Define a sliding window O(N) algorithm which cleans the data set by the following scheme:
# if two consecutive rows have the same trip type:
#   if the trip type is 'start':
#     delete the first row of the two rows (because the end trip for the observed start trip is missing)
#   else:
#     delete the second row (because the start trip for the observed end trip is missing)
def fix(df):
    """Drop rows whose 'trip' value repeats the value of a neighbouring row.

    Consecutive rows are compared pairwise in their current order. Whenever two
    neighbouring rows carry the same value in the 'trip' column, one is dropped:

    * if the repeated value is 'start', the first row of the pair is dropped
      (the end trip for that start trip is missing);
    * otherwise the second row of the pair is dropped
      (the start trip for that end trip is missing).

    Rows are selected by position, so the result does not depend on the index
    labels of ``df``: no prior ``reset_index`` is needed and duplicate or
    non-contiguous labels are handled correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'trip' column.
        NOTE(review): the scheme assumes that neighbouring rows belong together
        (e.g. sorted per bike and time) -- confirm against the loader.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped rows; ``df`` itself is not modified.
    """
    # Object dtype guarantees element-wise comparisons regardless of the column dtype
    trips = df['trip'].to_numpy(dtype=object)
    keep = np.ones(len(trips), dtype=bool)
    if len(trips) > 1:
        # repeated[i] is True when row i and row i + 1 have the same trip value
        repeated = trips[:-1] == trips[1:]
        first_is_start = trips[:-1] == 'start'
        # Two consecutive 'start' rows: drop the first row of the pair
        keep[:-1] &= ~(repeated & first_is_start)
        # Any other repeated value: drop the second row of the pair
        keep[1:] &= ~(repeated & ~first_is_start)
    # Positional selection: the mask positions are row positions, not index labels
    return df.iloc[np.flatnonzero(keep)]
    
# Apply the cleaning scheme to the Mannheim trips; `new` holds the cleaned result
new = fix(trips_within_mannheim_gdf)

In [None]:
# Count the rows per trip type after cleaning:
# now each start trip should have a corresponding end trip
new['trip'].value_counts()

In [None]:
# Number of rows removed by the cleaning step
len(trips_within_mannheim_gdf) - len(new)

In [None]:
# Show a random window of consecutive rows to eyeball the cleaned data.
sample_size = 20
# Last position at which a full window still fits (0 if `new` has fewer rows than the window),
# so drawing the start never fails on a short frame and every window is reachable
last_start = max(len(new) - sample_size, 0)
random_start = np.random.randint(0, last_start + 1)
new.iloc[random_start:random_start + sample_size]