In [None]:
!pip install -e ..

In [None]:
from nextbike import preprocessing
import pandas as pd

In [None]:
# The raw dataset contains many duplicated rows, so drop them right after loading.
raw_df = preprocessing.load_df()
df = raw_df.drop_duplicates()

In [None]:
# Preview the first 15 rows of the de-duplicated frame.
df.iloc[:15]

In [None]:
# Column dtypes together with the non-null count of every column.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
df.info(show_counts=True)

In [None]:
# Show every row that has a missing value in at least one column.
has_missing_value = df.isna().any(axis='columns')
df[has_missing_value]

In [None]:
# p_number is 0 for free-floating or unregistered stations; the NaN values are
# presumably recording mistakes, so they are treated as 0 as well.
# Note that this replaces NaN with 0 in every column, not only in p_number.
df = df.fillna(0)

In [None]:
# Print the value distribution of each listed column, followed by a dashed
# separator line.
inspected_columns = ('p_spot', 'p_number', 'p_place_type', 'trip', 'b_bike_type', 'p_bike')
for column_name in inspected_columns:
    counts = df[column_name].value_counts()
    print(counts)
    print('---------------------------------')

In [None]:
# Check the claim that p_bike is always True if p_place_type is 12.
# Only the last expression of a cell is displayed, so the two row counts are
# printed explicitly: the claim holds only if both numbers are equal.
place_type_12 = df[df['p_place_type'] == 12]
place_type_12_with_bike = place_type_12[place_type_12['p_bike'] == True]
print(len(place_type_12), len(place_type_12_with_bike))
place_type_12_with_bike

In [None]:
# Rows where p_bike is True while p_spot is False.
# NOTE(review): the original note said p_bike is *always* True if p_spot is
# False, but the following cell looks at rows with p_bike False and p_spot
# False, so "always" should be confirmed against the data.
bike_flag_set = df['p_bike'] == True
spot_flag_unset = df['p_spot'] == False
df[bike_flag_set & spot_flag_unset]

In [None]:
# However, p_bike can also be False if p_spot is False
# --> What does p_bike mean in this case?
neither_bike_nor_spot = df['p_bike'].eq(False) & df['p_spot'].eq(False)
df[neither_bike_nor_spot]

In [None]:
# Neither p_number nor p_uid is a unique identifier for p_name:
# print the number of distinct values of each of the three columns to compare.
for identifier_column in ('p_number', 'p_uid', 'p_name'):
    distinct_values = df[identifier_column].drop_duplicates()
    print(len(distinct_values))

* **The data set shows columns with prefixes p and b. What do you think do they represent? Also try to find good assumptions for the meanings of the columns**
    * `p_`: Place related information
    * `b_`: Bike related information
    * `p_spot`: True if the place is a known station, False if free-floating
    * `p_place_type`: 0 if the place is a known station, 12 if free-floating
    * `datetime`: Date of the booking
    * `b_number`: Unique identifier for a bike
    * `trip`:
        * 'first': Indicates the first booking of a day for a bike
        * 'last': Indicates the last booking of a day for a bike
        * 'start': Indicates the start of a trip
        * 'end': Indicates the end of a trip
    * `p_uid`: ID of the location
    * `p_bikes`: Available bikes at the place
    * `p_lat`: Latitude of the location
    * `b_bike_type`: Type of the bike (the meaning is not clear, probably different bike versions)
    * `p_name`: Name of the location
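    * `p_number`: Unclear; it is 0 for free-floating or unregistered stations, so it is presumably the official station number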
    * `p_lng`: Longitude of the location
    * `p_bike`: True if free-floating, False if known station

* **The trip column in your data set shows different values. Explain why there are not only two. Are examples with certain values for trip more informative for the analysis of mobility patterns than others?**
    * The rows with trip values 'start' and 'end' are the most informative ones, because they allow us to calculate the duration of a trip and to derive the target data format

In [None]:
# The official nextbike stations in Mannheim
# -> https://mannheim.opendatasoft.com/explore/dataset/free_bike_status/information/
# The export is a semicolon-separated CSV file downloaded directly from the portal.
stations_csv_url = 'https://mannheim.opendatasoft.com/explore/dataset/free_bike_status/download/?format=csv&timezone=Europe/Berlin&lang=de&use_labels_for_header=true&csv_separator=%3B'
stations = pd.read_csv(stations_csv_url, sep=';')
stations.head()

In [None]:
# Collect every bike number listed in the comma-separated 'Fahrradnummern'
# column of the stations table into one set.
# - Non-string cells (e.g. NaN for stations without bikes) are skipped.
# - Each token is stripped so that "1, 2" yields '2' and not ' 2'; empty tokens
#   (e.g. from a trailing comma) are dropped.
# NOTE(review): the collected numbers are strings; confirm that df['b_number']
# has a comparable dtype before matching against this set with isin().
bike_numbers = {
    number.strip()
    for number_list in stations['Fahrradnummern']
    if isinstance(number_list, str)
    for number in number_list.split(',')
    if number.strip()
}

In [None]:
# Keep the rows that either refer to one of the station uids from the stations
# table or to one of the bike numbers collected from it. reset_index() keeps
# the previous index as an additional 'index' column.
# NOTE(review): bike_numbers holds strings; if b_number is numeric the second
# condition never matches. TODO confirm the dtype of b_number.
at_listed_station = df['p_uid'].isin(stations['uid'])
is_listed_bike = df['b_number'].isin(bike_numbers)
mannheim_df = df[at_listed_station | is_listed_bike].reset_index()
mannheim_df

In [None]:
# Same problem as before: the numbers of 'start' and 'end' trips do not match
trip_counts = mannheim_df['trip'].value_counts()
trip_counts