In [1]:
import os
import pandas as pd
import glob
import seaborn as sns

In [2]:
path_input=os.path.join(os.getcwd(), '../data/input/dresden/')
path_output_weather = os.path.join(os.getcwd(), '../data/output/weather/')
path_output=os.path.join(os.getcwd(), '../data/output/')

## Read Data

In [3]:
directors = glob.glob(path_input+"*/**")

all_files = []
for director in directors:
    all_files=all_files+glob.glob(director+"/*.csv")
    
li = []

for filename in all_files:
    df_read = pd.read_csv(filename, index_col=None, header=0)
    li.append(df_read)

df = pd.concat(li, axis=0, ignore_index=True)

In [4]:
#df["datetime"].sort_values()[1450300:1450350]

## Drop Columns

In [5]:
#Columns including single value
#The following columns only included one value (or NaN values)
#city : ['dresden']
#p_rack_locks : [False]
#b_state : ['ok']
#b_active : [ True]
#b_battery_pack : [nan '{"percentage": 0}']
#p_special_racks : [nan 0.]
#p_free_special_racks : [nan 0.]

#Columns which highly correlate

#The following columns highly correlate - meaning they are complementary booleans or if one is high, the other is too

#p_bike [corr(p_spot)=-0.999501, complementary boolean to p_spot]

df_cleaned = df.drop(["city", 
                      "p_bike", 
                      "p_rack_locks", 
                      "b_active", 
                      "b_state", 
                      "b_battery_pack", 
                      "p_rack_locks", 
                      "p_special_racks", 
                      "p_free_special_racks",
                      "p_address", # These 4 last columns will be dropped because there are too many stored NaN values
                      "p_bike_types",
                      "b_pedelec_battery",
                      "p_terminal_type"], axis=1)
df_cleaned.head()

Unnamed: 0,p_bike_racks,p_spot,p_booked_bikes,p_place_type,datetime,b_number,trip,p_uid,p_bikes,p_lat,b_electric_lock,b_bike_type,p_name,p_free_racks,b_lock_types,p_number,p_lng,b_boardcomputer,p_maintenance
0,0,True,0,0,2019-01-20 00:00:00,93716,first,264546,3,51.033548,False,15,Strehlener Platz,0,analog_code_lock,4346.0,13.7488,0,False
1,0,True,0,0,2019-01-20 09:40:00,93716,start,264546,3,51.033548,False,15,Strehlener Platz,0,analog_code_lock,4346.0,13.7488,0,False
2,0,True,0,0,2019-01-20 09:58:00,93716,end,264595,2,51.058771,False,15,Gutenbergstraße,0,analog_code_lock,4389.0,13.769281,0,False
3,0,True,0,0,2019-01-20 17:36:00,93716,start,264595,3,51.058771,False,15,Gutenbergstraße,0,analog_code_lock,4389.0,13.769281,0,False
4,0,True,0,0,2019-01-20 19:39:00,93716,end,100894,1,51.067153,False,15,Waldschlösschen,0,analog_code_lock,4361.0,13.77648,0,False


## Create Trips

In [6]:
# No Duplicates
len(df_cleaned)-len(df_cleaned.drop_duplicates(subset=df_cleaned.columns.difference(["p_lat", "p_lng"])))

0

In [7]:
print("Starts:", len(df_cleaned[df_cleaned["trip"]=="start"]))
print("Ends:", len(df_cleaned[df_cleaned["trip"]=="end"]))
print("Difference:", len(df_cleaned[df_cleaned["trip"]=="start"])-len(df_cleaned[df_cleaned["trip"]=="end"]))

Starts: 583206
Ends: 567439
Difference: 15767


In [8]:
# df_2: start and end of trips
df_2 = df_cleaned[(df_cleaned["trip"]=="start") | (df_cleaned["trip"]=="end")]
print("Dropped", len(df_cleaned)-len(df_2), "first and last Values")

Dropped 307709 first and last Values


In [9]:
# df_3: start and end of trips sorted by b_number and datetime
df_3=df_2.sort_values(["b_number", "datetime"])
df_3[["b_number", "datetime", "trip"]].iloc[500010:500030] # two examples for multiple starts

Unnamed: 0,b_number,datetime,trip
54359,93547,2019-02-12 21:52:00,start
54360,93547,2019-02-12 21:54:00,start
54361,93547,2019-02-12 21:56:00,end
54362,93547,2019-02-12 22:11:00,start
54363,93547,2019-02-12 22:14:00,end
54364,93547,2019-02-12 22:27:00,start
54365,93547,2019-02-12 22:30:00,end
54366,93547,2019-02-12 23:24:00,start
54367,93547,2019-02-12 23:27:00,start
54368,93547,2019-02-12 23:31:00,end


In [10]:
# sr_1: True if after start entry comes end entry else False
sr_1 = (df_3['trip'] != df_3['trip'].shift())
df_3["valid_start"]=sr_1

# sr_2: 
#sr_2 = (df_3['trip'] != df_3['trip'].shift(-1))
#df_3['valid_end'] = sr_2


In [11]:
# df_4: start and end of trips sorted by b_number and datetime only valid starts
df_4=df_3[df_3["valid_start"]==True]
print("Deleted entries", len(df_3)-len(df_4))
print("Correct entries", len(df_4))

Deleted entries 15767
Correct entries 1134878


In [12]:
# Tests if there are values with two ends after each other
test = df_4["trip"] != df_4["trip"].shift(-1)
print("Entries where Bike X ends with an end and bike Y starts with an end")
pd.DataFrame(test)[pd.DataFrame(test)["trip"]==False]

Entries where Bike X ends with an end and bike Y starts with an end


Unnamed: 0,trip


In [13]:
df_starts = df_4[df_4["trip"]=="start"].reset_index(drop=True)
df_ends = df_4[df_4["trip"]=="end"].reset_index(drop=True)



In [14]:
print(df_starts.head()[["b_number", "datetime", "trip"]], "\n")
print(df_ends.head()[["b_number", "datetime", "trip"]])

   b_number             datetime   trip
0     11225  2019-04-29 06:31:00  start
1     11225  2019-04-29 11:27:00  start
2     11225  2019-04-29 11:58:00  start
3     11225  2019-04-29 12:22:00  start
4     11225  2019-04-29 13:00:00  start 

   b_number             datetime trip
0     11225  2019-04-29 06:35:00  end
1     11225  2019-04-29 11:33:00  end
2     11225  2019-04-29 12:03:00  end
3     11225  2019-04-29 12:39:00  end
4     11225  2019-04-29 13:14:00  end


In [15]:
df_merged = df_starts.merge(df_ends, left_on=df_starts.index, right_on=df_ends.index, suffixes=("_start", "_end"))
df_merged.drop(["key_0", 
                "valid_start_start", 
                "valid_start_end", 
                "trip_start", 
                "trip_end", 
                "b_number_end", 
                "b_boardcomputer_end", 
                "b_lock_types_end", 
                "b_bike_type_end", 
                "b_electric_lock_end",], axis=1, inplace=True)
df_merged.head()

Unnamed: 0,p_bike_racks_start,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number_start,p_uid_start,p_bikes_start,p_lat_start,b_electric_lock_start,...,p_place_type_end,datetime_end,p_uid_end,p_bikes_end,p_lat_end,p_name_end,p_free_racks_end,p_number_end,p_lng_end,p_maintenance_end
0,0,True,0,0,2019-04-29 06:31:00,11225,264595,1,51.058771,True,...,0,2019-04-29 06:35:00,264595,1,51.058771,Gutenbergstraße,0,4389.0,13.769281,False
1,0,False,0,12,2019-04-29 11:27:00,11225,15109859,1,50.806704,True,...,12,2019-04-29 11:33:00,15117356,1,50.813655,BIKE 11225,0,0.0,8.770495,False
2,0,False,0,12,2019-04-29 11:58:00,11225,15117356,1,50.813655,True,...,12,2019-04-29 12:03:00,15118389,1,50.808976,BIKE 11225,0,0.0,8.772863,False
3,0,False,0,12,2019-04-29 12:22:00,11225,15118389,1,50.808976,True,...,12,2019-04-29 12:39:00,15119670,1,50.822621,BIKE 11225,0,0.0,8.77455,False
4,0,False,0,12,2019-04-29 13:00:00,11225,15119670,1,50.822621,True,...,12,2019-04-29 13:14:00,15120946,1,50.813241,BIKE 11225,0,0.0,8.763238,False


## Merge Trips with Weather data

In [16]:
df_dwd=pd.read_csv(os.path.join(path_output_weather, "DWD.csv"))
df_dwd['MESS_DATUM'] = pd.to_datetime(df_dwd['MESS_DATUM'].astype(str), format="%Y-%m-%d %H:%M:%S", errors='coerce')

df_merged["datetime_start"] = pd.to_datetime(df_merged["datetime_start"].astype(str), format="%Y-%m-%d %H:%M:%S", errors="coerce")

In [17]:
df_full = pd.merge_asof(df_merged.sort_values('datetime_start'), 
                        df_dwd, left_on='datetime_start', 
                        right_on='MESS_DATUM', 
                        tolerance=pd.Timedelta('30 min'),
                        allow_exact_matches=True, direction='nearest')

In [18]:
df_full.head(5)

Unnamed: 0.1,p_bike_racks_start,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number_start,p_uid_start,p_bikes_start,p_lat_start,b_electric_lock_start,...,p_lng_end,p_maintenance_end,Unnamed: 0,MESS_DATUM,air_deg,air_hum,rain_mm,rain_yn,sun_hour,wind_ms
0,0,False,0,12,2019-01-20 00:00:00,93771,12095573,1,51.071262,False,...,13.693052,False,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
1,0,True,0,0,2019-01-20 00:05:00,93576,10299640,5,51.03821,False,...,13.744122,False,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
2,0,True,0,0,2019-01-20 00:07:00,93440,10299584,1,51.04257,True,...,13.74773,False,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
3,0,False,0,12,2019-01-20 00:07:00,93322,12098234,1,51.041798,False,...,13.68962,False,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
4,0,True,0,0,2019-01-20 00:07:00,93585,264575,5,51.07174,False,...,13.741257,False,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3


In [19]:
df_full = df_full.drop(["Unnamed: 0", "MESS_DATUM"], axis=1)

In [20]:
# cast start&end times to datetimes
df_full['datetime_start'] = pd.to_datetime(df_full['datetime_start'])
df_full['datetime_end'] = pd.to_datetime(df_full['datetime_end'])
# calculate trip duration
df_full['trip_duration'] = ((df_full['datetime_end'] - df_full['datetime_start']).dt.total_seconds() / 60.0).round(2)

In [21]:
df_full.to_csv(os.path.join(path_output, "Trips.csv"))

In [22]:
# Activate parameter to show ALL columns from dataframe
pd.set_option('display.max_columns', None)
df_full.head(10)

Unnamed: 0,p_bike_racks_start,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number_start,p_uid_start,p_bikes_start,p_lat_start,b_electric_lock_start,b_bike_type_start,p_name_start,p_free_racks_start,b_lock_types_start,p_number_start,p_lng_start,b_boardcomputer_start,p_maintenance_start,p_bike_racks_end,p_spot_end,p_booked_bikes_end,p_place_type_end,datetime_end,p_uid_end,p_bikes_end,p_lat_end,p_name_end,p_free_racks_end,p_number_end,p_lng_end,p_maintenance_end,air_deg,air_hum,rain_mm,rain_yn,sun_hour,wind_ms,trip_duration
0,0,False,0,12,2019-01-20 00:00:00,93771,12095573,1,51.071262,False,15,BIKE 93771,0,analog_code_lock,0.0,13.75038,22532,False,0,False,0,12,2019-01-20 00:28:00,12099518,1,51.046234,BIKE 93771,0,0.0,13.693052,False,-3.3,75.3,0.0,0.0,0.0,4.3,28.0
1,0,True,0,0,2019-01-20 00:05:00,93576,10299640,5,51.03821,False,15,Wohnheim Gret-Palucca.Straße / Lenneplatz,0,analog_code_lock,4486.0,13.744712,0,False,0,False,0,12,2019-01-20 00:15:00,12099344,1,51.049069,BIKE 93576,0,0.0,13.744122,False,-3.3,75.3,0.0,0.0,0.0,4.3,10.0
2,0,True,0,0,2019-01-20 00:07:00,93440,10299584,1,51.04257,True,0,Malterstraße (Haltestelle),0,frame_lock,4483.0,13.69113,7551004130,False,0,True,0,0,2019-01-20 00:33:00,4405670,3,51.02837,Wundtstr. / Zellescher Weg,0,4458.0,13.74773,False,-3.3,75.3,0.0,0.0,0.0,4.3,26.0
3,0,False,0,12,2019-01-20 00:07:00,93322,12098234,1,51.041798,False,0,BIKE 93322,0,analog_code_lock,0.0,13.68962,1265,False,0,False,0,12,2019-01-20 00:09:00,12098234,1,51.041798,BIKE 93322,0,0.0,13.68962,False,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
4,0,True,0,0,2019-01-20 00:07:00,93585,264575,5,51.07174,False,15,Friedensstraße/Conradstr.,0,analog_code_lock,4373.0,13.741257,0,False,0,True,0,0,2019-01-20 00:35:00,264575,5,51.07174,Friedensstraße/Conradstr.,0,4373.0,13.741257,False,-3.3,75.3,0.0,0.0,0.0,4.3,28.0
5,0,True,0,0,2019-01-20 00:21:00,93660,45444,1,51.025741,False,15,Panometer,0,analog_code_lock,4310.0,13.789338,0,False,0,True,0,0,2019-01-20 00:33:00,264532,1,51.034938,Altenberger Platz,0,4336.0,13.807733,False,-3.3,75.3,0.0,0.0,0.0,4.3,12.0
6,0,False,0,12,2019-01-20 00:26:00,93478,12098942,1,51.028163,False,0,BIKE 93478,0,analog_code_lock,0.0,13.710755,1206,False,0,True,0,0,2019-01-20 00:28:00,10299678,1,51.023416,Räcknitzhöhe,0,4491.0,13.742453,False,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
7,0,True,0,0,2019-01-20 00:29:00,93577,264599,1,51.04474,False,15,Mosenstraße,0,analog_code_lock,4392.0,13.777725,0,False,0,True,0,0,2019-01-20 00:31:00,264599,1,51.04474,Mosenstraße,0,4392.0,13.777725,False,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
8,0,True,0,0,2019-01-20 00:31:00,93258,38955,2,51.060231,False,0,Palaisplatz,0,analog_code_lock,4403.0,13.739294,1322,False,0,True,0,0,2019-01-20 01:02:00,121771,4,51.071144,Schauburg,0,4327.0,13.750328,False,-3.4,75.0,0.0,0.0,0.0,4.4,31.0
9,0,True,0,0,2019-01-20 00:33:00,93785,264548,2,51.033767,False,15,Nürnberger Ei,0,analog_code_lock,4348.0,13.722911,22556,False,0,True,0,0,2019-01-20 00:49:00,4405670,5,51.02837,Wundtstr. / Zellescher Weg,0,4458.0,13.74773,False,-3.4,75.0,0.0,0.0,0.0,4.4,16.0


In [23]:
df_full.sort_values(by="datetime_start", ascending=False)

Unnamed: 0,p_bike_racks_start,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number_start,p_uid_start,p_bikes_start,p_lat_start,b_electric_lock_start,b_bike_type_start,p_name_start,p_free_racks_start,b_lock_types_start,p_number_start,p_lng_start,b_boardcomputer_start,p_maintenance_start,p_bike_racks_end,p_spot_end,p_booked_bikes_end,p_place_type_end,datetime_end,p_uid_end,p_bikes_end,p_lat_end,p_name_end,p_free_racks_end,p_number_end,p_lng_end,p_maintenance_end,air_deg,air_hum,rain_mm,rain_yn,sun_hour,wind_ms,trip_duration
567438,0,False,0,12,2020-01-20 23:36:00,93454,27150164,1,51.044302,True,71,BIKE 93454,0,frame_lock,0.0,13.738484,7551002419,False,0,True,0,0,2020-01-20 23:50:00,10299848,17,51.070350,Alexander-Puschkin-Platz,0,4504.0,13.729830,False,-2.4,91.7,0.0,0.0,0.0,3.2,14.0
567437,0,True,0,0,2020-01-20 23:33:00,93824,73,1,51.065184,True,71,Bf. Dresden-Neustadt,0,frame_lock,4332.0,13.741220,7551016033,False,0,True,0,0,2020-01-20 23:45:00,264558,3,51.089747,Hubertusplatz,0,4356.0,13.723367,False,-2.4,91.7,0.0,0.0,0.0,3.2,12.0
567436,0,True,0,0,2020-01-20 23:31:00,93334,264575,4,51.071740,True,71,Friedensstraße/Conradstr.,0,frame_lock,4373.0,13.741257,7551005670,False,0,True,0,0,2020-01-20 23:36:00,121771,5,51.071144,Schauburg,0,4327.0,13.750328,False,-2.4,91.7,0.0,0.0,0.0,3.2,5.0
567435,0,True,0,0,2020-01-20 23:29:00,93908,11248297,3,51.043537,True,71,Tharandter Straße,0,frame_lock,4507.0,13.703979,7551012906,False,0,True,0,0,2020-01-20 23:46:00,4405670,4,51.028370,Wundtstr. / Zellescher Weg,0,4458.0,13.747730,False,-1.6,91.7,0.0,0.0,0.0,3.3,17.0
567434,0,True,0,0,2020-01-20 23:29:00,93857,310792,6,51.038466,True,71,Bonhoeffer Platz,0,frame_lock,4438.0,13.701679,7551012762,False,0,True,0,0,2020-01-20 23:49:00,10299848,16,51.070350,Alexander-Puschkin-Platz,0,4504.0,13.729830,False,-1.6,91.7,0.0,0.0,0.0,3.3,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,True,0,0,2019-01-20 00:07:00,93585,264575,5,51.071740,False,15,Friedensstraße/Conradstr.,0,analog_code_lock,4373.0,13.741257,0,False,0,True,0,0,2019-01-20 00:35:00,264575,5,51.071740,Friedensstraße/Conradstr.,0,4373.0,13.741257,False,-3.3,75.3,0.0,0.0,0.0,4.3,28.0
2,0,True,0,0,2019-01-20 00:07:00,93440,10299584,1,51.042570,True,0,Malterstraße (Haltestelle),0,frame_lock,4483.0,13.691130,7551004130,False,0,True,0,0,2019-01-20 00:33:00,4405670,3,51.028370,Wundtstr. / Zellescher Weg,0,4458.0,13.747730,False,-3.3,75.3,0.0,0.0,0.0,4.3,26.0
3,0,False,0,12,2019-01-20 00:07:00,93322,12098234,1,51.041798,False,0,BIKE 93322,0,analog_code_lock,0.0,13.689620,1265,False,0,False,0,12,2019-01-20 00:09:00,12098234,1,51.041798,BIKE 93322,0,0.0,13.689620,False,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
1,0,True,0,0,2019-01-20 00:05:00,93576,10299640,5,51.038210,False,15,Wohnheim Gret-Palucca.Straße / Lenneplatz,0,analog_code_lock,4486.0,13.744712,0,False,0,False,0,12,2019-01-20 00:15:00,12099344,1,51.049069,BIKE 93576,0,0.0,13.744122,False,-3.3,75.3,0.0,0.0,0.0,4.3,10.0
