In [1]:
import tarfile
import pandas as pd
import os
from tqdm.notebook import tqdm

In [2]:
data_dir_path = "../00_data"
tarfile_path = os.path.join(data_dir_path, "leipzig.tar")
unpacked_path = os.path.join(data_dir_path, "leipzig")


In [3]:
# Unpack
with tarfile.open(tarfile_path, 'r') as tar:
	tar.extractall(data_dir_path)

In [4]:
# find all csv files in the unpacked directory
csv_files = []

def get_csv_files_recursively(path):
	for file in os.listdir(path):
		if os.path.isdir(os.path.join(path, file)):
			get_csv_files_recursively(os.path.join(path, file))
		elif file.endswith(".csv"):
			csv_files.append(os.path.join(path, file))

get_csv_files_recursively(unpacked_path)

csv_files[:5]

['../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_03.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_13.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_19.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_06.csv',
 '../00_data/leipzig/2020/01/positions_nextbike_leipzig_2020_01_01.csv']

In [5]:
data_df = pd.concat(map(pd.read_csv, csv_files))

In [6]:
data_df.head(2)

Unnamed: 0,p_spot,p_free_racks,b_battery_pack,b_active,b_pedelec_battery,p_address,p_bike_racks,p_lat,b_lock_types,p_free_special_racks,...,p_booked_bikes,b_electric_lock,p_lng,p_rack_locks,b_number,p_bike_types,p_maintenance,p_special_racks,b_boardcomputer,city
0,True,0,,True,,,0,51.34397,frame_lock,0.0,...,0,True,12.383174,False,23138,"{""71"": 1}",False,0.0,7551005157,leipzig
1,True,0,,True,,,0,51.34397,frame_lock,0.0,...,0,True,12.383174,False,23138,"{""71"": 3}",False,0.0,7551005157,leipzig


In [7]:
# find columns with only one value
columns_with_unique_values = data_df.columns[data_df.nunique() == 1]
columns_with_unique_values

Index(['p_free_racks', 'b_active', 'p_bike_racks', 'p_free_special_racks',
       'b_state', 'p_rack_locks', 'p_maintenance', 'p_special_racks', 'city'],
      dtype='object')

In [8]:

data_df = data_df.drop(columns=columns_with_unique_values)


In [9]:
data_df.datetime = pd.to_datetime(data_df.datetime)

In [10]:
data_df = data_df.sort_values('datetime')

In [11]:
data_df = data_df.reset_index(drop=True)

In [12]:
trip_data = pd.DataFrame()

In [13]:
trip_selector = (data_df.trip == 'start') | (data_df.trip == 'end')
location_data_trips = data_df[trip_selector]

In [14]:
def create_trips(bike_number):
    bike_locations = location_data_trips[location_data_trips.b_number == bike_number]
    bike_start_locations = bike_locations
    bike_end_locations = bike_locations.shift(-1)

    bike_trips = bike_start_locations.merge(
        bike_end_locations,
        left_index=True,
        right_index=True,
        suffixes=("_start", "_end"),
    )

    bike_trips = bike_trips.loc[bike_trips.trip_start == 'start']
    bike_trips = bike_trips.loc[bike_trips.trip_start != bike_trips.trip_end]

    return bike_trips
create_trips(23138).head(2)


Unnamed: 0,p_spot_start,b_battery_pack_start,b_pedelec_battery_start,p_address_start,p_lat_start,b_lock_types_start,p_name_start,p_number_start,p_bikes_start,p_uid_start,...,p_bike_end,trip_end,p_terminal_type_end,b_bike_type_end,p_booked_bikes_end,b_electric_lock_end,p_lng_end,b_number_end,p_bike_types_end,b_boardcomputer_end
8741,False,,0.0,,51.315696,frame_lock,BIKE 23138,0.0,1,12085233,...,True,end,,71.0,0.0,True,12.326001,23138.0,"{""71"": 1}",7551005000.0
9565,False,,0.0,,51.315696,frame_lock,BIKE 23138,0.0,1,12085233,...,True,end,,71.0,0.0,True,12.326001,23138.0,"{""71"": 1}",7551005000.0


In [15]:
unique_bike_numbers = location_data_trips.b_number.unique()
len(unique_bike_numbers)

1054

In [16]:
for bike_number in tqdm(unique_bike_numbers):
	trip_data = pd.concat([trip_data, create_trips(bike_number)])

  0%|          | 0/1054 [00:00<?, ?it/s]

In [17]:
trip_data.info(null_counts=True)

  trip_data.info(null_counts=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 821991 entries, 396 to 1990239
Data columns (total 44 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   p_spot_start             821991 non-null  bool          
 1   b_battery_pack_start     123 non-null     object        
 2   b_pedelec_battery_start  86319 non-null   float64       
 3   p_address_start          16333 non-null   object        
 4   p_lat_start              821991 non-null  float64       
 5   b_lock_types_start       821991 non-null  object        
 6   p_name_start             821991 non-null  object        
 7   p_number_start           821919 non-null  float64       
 8   p_bikes_start            821991 non-null  int64         
 9   p_uid_start              821991 non-null  int64         
 10  p_place_type_start       821991 non-null  int64         
 11  datetime_start           821991 non-null  datetime64[ns]
 12  p_bike_start 