In [1]:
import os
import pandas as pd
import glob
import seaborn as sns

In [2]:
# All data folders are addressed relative to the directory the notebook runs in.
_cwd = os.getcwd()
path_input = os.path.join(_cwd, '../data/input/dresden/')            # raw CSV exports
path_output = os.path.join(_cwd, '../data/output/')                  # generated files
path_output_weather = os.path.join(_cwd, '../data/output/weather/')  # prepared weather data

## Read Data

In [3]:
# Collect every CSV export located two levels below `path_input`
# (path_input/<level 1>/<level 2>/*.csv).
# glob returns matches in arbitrary order, so both listings are sorted: the row
# order of `df` -- and the integer index assigned by `ignore_index=True` -- is
# then reproducible between runs and machines.
directors = sorted(glob.glob(path_input+"*/**"))

all_files = []
for director in directors:
    all_files.extend(sorted(glob.glob(director+"/*.csv")))

# Fail early with a readable message instead of letting pd.concat complain
# about an empty list further down.
if not all_files:
    raise FileNotFoundError("No CSV files found below " + path_input)

# Read each file once and concatenate in a single step.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

df = pd.concat(li, axis=0, ignore_index=True)

In [4]:
# Disabled inspection snippet: displays a slice of the sorted "datetime" values of `df`.
#df["datetime"].sort_values()[1450300:1450350]

## Drop Columns

In [5]:
# Columns holding a single value (or only that value and NaN) -- no information:
#   city                 : ['dresden']
#   p_rack_locks         : [False]
#   b_state              : ['ok']
#   b_active             : [True]
#   b_battery_pack       : [nan, '{"percentage": 0}']
#   p_special_racks      : [nan, 0.]
#   p_free_special_racks : [nan, 0.]
#
# Column that is redundant because it highly correlates with another one:
#   p_bike               : corr(p_spot) = -0.999501, complementary boolean to p_spot
#
# Columns dropped because they store too many NaN values:
#   p_address, p_bike_types, b_pedelec_battery, p_terminal_type
columns_to_drop = [
    # single-valued
    "city",
    "p_rack_locks",  # listed once; the original list named it twice
    "b_state",
    "b_active",
    "b_battery_pack",
    "p_special_racks",
    "p_free_special_racks",
    # redundant with p_spot
    "p_bike",
    # mostly NaN
    "p_address",
    "p_bike_types",
    "b_pedelec_battery",
    "p_terminal_type",
]

df_cleaned = df.drop(columns=columns_to_drop)
df_cleaned.head()

Unnamed: 0,p_spot,b_lock_types,p_maintenance,p_bike_racks,p_place_type,p_number,p_uid,b_number,p_free_racks,b_boardcomputer,datetime,p_lng,b_electric_lock,p_bikes,trip,p_booked_bikes,p_name,b_bike_type,p_lat
0,True,frame_lock,False,0,0,4505.0,10299865,93153,0,7551006387,2019-03-03 00:00:00,13.767586,True,3,first,0,Altleubitz,0,51.015597
1,True,frame_lock,False,0,0,4505.0,10299865,93153,0,7551006387,2019-03-03 23:59:00,13.767586,True,3,last,0,Altleubitz,0,51.015597
2,False,frame_lock,False,0,12,0.0,13101449,93616,0,7551006429,2019-03-03 00:00:00,13.752801,True,1,first,0,BIKE 93616,15,51.065062
3,False,frame_lock,False,0,12,0.0,13101449,93616,0,7551006429,2019-03-03 01:12:00,13.752801,True,1,start,0,BIKE 93616,15,51.065062
4,False,frame_lock,False,0,12,0.0,13103279,93616,0,7551006429,2019-03-03 01:16:00,13.752313,True,1,end,0,BIKE 93616,15,51.064738


## Create Trips

In [6]:
# Number of duplicated records when the coordinates (p_lat, p_lng) are ignored.
compare_columns = df_cleaned.columns.difference(["p_lat", "p_lng"])
n_unique_records = len(df_cleaned.drop_duplicates(subset=compare_columns))
len(df_cleaned) - n_unique_records

0

In [7]:
# Compare how many "start" and "end" records exist before any pairing is done.
n_starts = int((df_cleaned["trip"] == "start").sum())
n_ends = int((df_cleaned["trip"] == "end").sum())

print("Starts:", n_starts)
print("Ends:", n_ends)
print("Difference:", n_starts - n_ends)

Starts: 583206
Ends: 567439
Difference: 15767


In [8]:
# df_2: only the records that mark the start or the end of a trip
is_trip_event = df_cleaned["trip"].isin(["start", "end"])
df_2 = df_cleaned[is_trip_event]

n_dropped = len(df_cleaned) - len(df_2)
print("Dropped", n_dropped, "first and last Values")

Dropped 307709 first and last Values


In [9]:
# df_3: trip events ordered per bike (b_number) and, within a bike, by time
sort_keys = ["b_number", "datetime"]
df_3 = df_2.sort_values(by=sort_keys)

# This slice contains two examples of a bike reporting several starts in a row.
df_3.iloc[500010:500030][["b_number", "datetime", "trip"]]

Unnamed: 0,b_number,datetime,trip
446639,93547,2019-02-12 21:52:00,start
446640,93547,2019-02-12 21:54:00,start
446641,93547,2019-02-12 21:56:00,end
446642,93547,2019-02-12 22:11:00,start
446643,93547,2019-02-12 22:14:00,end
446644,93547,2019-02-12 22:27:00,start
446645,93547,2019-02-12 22:30:00,end
446646,93547,2019-02-12 23:24:00,start
446647,93547,2019-02-12 23:27:00,start
446648,93547,2019-02-12 23:31:00,end


In [10]:
# valid_start: True for the records that take part in a proper start -> end pair.
#
# df_3 is ordered by bike and time.  A plain `shift()` over the whole frame
# would also compare the first record of a bike with the last record of the
# previous bike: a dangling "start" at the end of bike X then suppresses the
# first "start" of bike Y, and X's start is later paired with Y's end.
# Every comparison below is therefore made within one bike (b_number).
trip_kind = df_3["trip"]
prev_kind_same_bike = df_3.groupby("b_number", sort=False)["trip"].shift()

# Step 1 - collapse repeated events: keep only the first record of each run of
# identical values (start, start, end -> start, end).  The first record of a
# bike has no predecessor (NaN) and always opens a run.
is_run_head = (trip_kind != prev_kind_same_bike).to_numpy()
run_heads = df_3.loc[is_run_head, ["b_number", "trip"]]

# Step 2 - within a bike the run heads alternate.  Keep a "start" only when an
# "end" of the same bike follows it, and an "end" only when a "start" of the
# same bike precedes it.  This removes a leading "end" and a trailing "start",
# so the kept records of every bike read start, end, start, end, ...
heads_by_bike = run_heads.groupby("b_number", sort=False)["trip"]
start_has_end = (run_heads["trip"] == "start") & (heads_by_bike.shift(-1) == "end")
end_has_start = (run_heads["trip"] == "end") & (heads_by_bike.shift(1) == "start")

# Write the per-run-head decision back to the positions of the run heads;
# every record that is not a run head stays False.
valid_flags = is_run_head.copy()
valid_flags[is_run_head] = (start_has_end | end_has_start).to_numpy()

sr_1 = pd.Series(valid_flags, index=df_3.index)
df_3["valid_start"] = sr_1


In [11]:
# df_4: the bike/time-ordered trip events, restricted to rows flagged in "valid_start"
df_4 = df_3.loc[df_3["valid_start"]]

n_deleted = len(df_3) - len(df_4)
print("Deleted entries", n_deleted)
print("Correct entries", len(df_4))

Deleted entries 15767
Correct entries 1134878


In [12]:
# Check that no record is directly followed by a record of the same trip type
# (False marks a row whose successor has the same "trip" value).
test = df_4["trip"].ne(df_4["trip"].shift(-1))
print("Entries where Bike X ends with an end and bike Y starts with an end")
test[~test].to_frame()

Entries where Bike X ends with an end and bike Y starts with an end


Unnamed: 0,trip


In [13]:
# Split the events into starts and ends; both get a fresh 0..n-1 index so that
# they can be matched by position afterwards.
record_kind = df_4["trip"]
df_starts = df_4.loc[record_kind == "start"].reset_index(drop=True)
df_ends = df_4.loc[record_kind == "end"].reset_index(drop=True)



In [14]:
# Show the first starts and ends next to each other to eyeball the pairing.
preview_columns = ["b_number", "datetime", "trip"]
print(df_starts[preview_columns].head(), "\n")
print(df_ends[preview_columns].head())

   b_number             datetime   trip
0     11225  2019-04-29 06:31:00  start
1     11225  2019-04-29 11:27:00  start
2     11225  2019-04-29 11:58:00  start
3     11225  2019-04-29 12:22:00  start
4     11225  2019-04-29 13:00:00  start 

   b_number             datetime trip
0     11225  2019-04-29 06:35:00  end
1     11225  2019-04-29 11:33:00  end
2     11225  2019-04-29 12:03:00  end
3     11225  2019-04-29 12:39:00  end
4     11225  2019-04-29 13:14:00  end


In [15]:
# Pair the n-th start with the n-th end (both frames carry a 0..n-1 index) and
# remove the helper columns plus the bike attributes that would otherwise be
# stored twice (once per suffix).
redundant_columns = [
    "key_0",                # join key generated by merging on the two index arrays
    "valid_start_start",
    "valid_start_end",
    "trip_start",
    "trip_end",
    "b_number_end",
    "b_boardcomputer_end",
    "b_lock_types_end",
    "b_bike_type_end",
    "b_electric_lock_end",
]
df_merged = (
    df_starts
    .merge(df_ends, left_on=df_starts.index, right_on=df_ends.index, suffixes=("_start", "_end"))
    .drop(columns=redundant_columns)
)
df_merged.head()

Unnamed: 0,p_spot_start,b_lock_types_start,p_maintenance_start,p_bike_racks_start,p_place_type_start,p_number_start,p_uid_start,b_number_start,p_free_racks_start,b_boardcomputer_start,...,p_place_type_end,p_number_end,p_uid_end,p_free_racks_end,datetime_end,p_lng_end,p_bikes_end,p_booked_bikes_end,p_name_end,p_lat_end
0,True,fork_lock,False,0,0,4389.0,264595,11225,0,22919,...,0,4389.0,264595,0,2019-04-29 06:35:00,13.769281,1,0,Gutenbergstraße,51.058771
1,False,fork_lock,False,0,12,0.0,15109859,11225,0,22919,...,12,0.0,15117356,0,2019-04-29 11:33:00,8.770495,1,0,BIKE 11225,50.813655
2,False,fork_lock,False,0,12,0.0,15117356,11225,0,22919,...,12,0.0,15118389,0,2019-04-29 12:03:00,8.772863,1,0,BIKE 11225,50.808976
3,False,fork_lock,False,0,12,0.0,15118389,11225,0,22919,...,12,0.0,15119670,0,2019-04-29 12:39:00,8.77455,1,0,BIKE 11225,50.822621
4,False,fork_lock,False,0,12,0.0,15119670,11225,0,22919,...,12,0.0,15120946,0,2019-04-29 13:14:00,8.763238,1,0,BIKE 11225,50.813241


## Merge Trips with Weather data

In [16]:
# Load the prepared weather table and turn both join keys into real timestamps.
# Values that do not match the format become NaT (errors="coerce").
timestamp_format = "%Y-%m-%d %H:%M:%S"

dwd_csv = os.path.join(path_output_weather, "DWD.csv")
df_dwd = pd.read_csv(dwd_csv)
df_dwd['MESS_DATUM'] = pd.to_datetime(
    df_dwd['MESS_DATUM'].astype(str), format=timestamp_format, errors='coerce'
)

df_merged["datetime_start"] = pd.to_datetime(
    df_merged["datetime_start"].astype(str), format=timestamp_format, errors="coerce"
)

In [23]:
# Attach to every trip the weather measurement closest to its start time
# (at most 30 minutes away; trips without such a measurement get NaN weather).
#
# merge_asof needs BOTH frames sorted by their key and rejects null keys.
# MESS_DATUM was parsed with errors='coerce', so unparseable timestamps are NaT:
# those weather rows cannot be matched to anything and are removed, and the
# remaining rows are sorted explicitly instead of relying on the CSV order.
# The stable sort leaves an already sorted table exactly as it is.
weather_by_time = (
    df_dwd
    .dropna(subset=["MESS_DATUM"])
    .sort_values("MESS_DATUM", kind="mergesort")
)

df_full = pd.merge_asof(df_merged.sort_values('datetime_start'),
                        weather_by_time,
                        left_on='datetime_start',
                        right_on='MESS_DATUM',
                        tolerance=pd.Timedelta('30 min'),
                        allow_exact_matches=True,
                        direction='nearest')

In [24]:
# First rows of the trips with their attached weather columns.
df_full.head(n=5)

Unnamed: 0.1,p_spot_start,b_lock_types_start,p_maintenance_start,p_bike_racks_start,p_place_type_start,p_number_start,p_uid_start,b_number_start,p_free_racks_start,b_boardcomputer_start,datetime_start,p_lng_start,b_electric_lock_start,p_bikes_start,p_booked_bikes_start,p_name_start,b_bike_type_start,p_lat_start,p_spot_end,p_maintenance_end,p_bike_racks_end,p_place_type_end,p_number_end,p_uid_end,p_free_racks_end,datetime_end,p_lng_end,p_bikes_end,p_booked_bikes_end,p_name_end,p_lat_end,Unnamed: 0,MESS_DATUM,air_deg,air_hum,rain_mm,rain_yn,sun_hour,wind_ms
0,False,analog_code_lock,False,0,12,0.0,12095573,93771,0,22532,2019-01-20 00:00:00,13.75038,False,1,0,BIKE 93771,15,51.071262,False,False,0,12,0.0,12099518,0,2019-01-20 00:28:00,13.693052,1,0,BIKE 93771,51.046234,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
1,True,analog_code_lock,False,0,0,4486.0,10299640,93576,0,0,2019-01-20 00:05:00,13.744712,False,5,0,Wohnheim Gret-Palucca.Straße / Lenneplatz,15,51.03821,False,False,0,12,0.0,12099344,0,2019-01-20 00:15:00,13.744122,1,0,BIKE 93576,51.049069,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
2,True,frame_lock,False,0,0,4483.0,10299584,93440,0,7551004130,2019-01-20 00:07:00,13.69113,True,1,0,Malterstraße (Haltestelle),0,51.04257,True,False,0,0,4458.0,4405670,0,2019-01-20 00:33:00,13.74773,3,0,Wundtstr. / Zellescher Weg,51.02837,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
3,False,analog_code_lock,False,0,12,0.0,12098234,93322,0,1265,2019-01-20 00:07:00,13.68962,False,1,0,BIKE 93322,0,51.041798,False,False,0,12,0.0,12098234,0,2019-01-20 00:09:00,13.68962,1,0,BIKE 93322,51.041798,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3
4,True,analog_code_lock,False,0,0,4373.0,264575,93585,0,0,2019-01-20 00:07:00,13.741257,False,5,0,Friedensstraße/Conradstr.,15,51.07174,True,False,0,0,4373.0,264575,0,2019-01-20 00:35:00,13.741257,5,0,Friedensstraße/Conradstr.,51.07174,1752,2019-01-20,-3.3,75.3,0.0,0.0,0.0,4.3


In [25]:
# The weather table's "Unnamed: 0" column and its timestamp are not needed once the join is done.
weather_helper_columns = ["Unnamed: 0", "MESS_DATUM"]
df_full = df_full.drop(columns=weather_helper_columns)

In [26]:
# Make sure both trip boundaries are timestamps before doing arithmetic on them.
for boundary in ('datetime_start', 'datetime_end'):
    df_full[boundary] = pd.to_datetime(df_full[boundary])

# Trip duration in minutes, rounded to two decimals.
elapsed = df_full['datetime_end'] - df_full['datetime_start']
df_full['trip_duration'] = elapsed.dt.total_seconds().div(60.0).round(2)

In [27]:
# Persist the finished trip table (the index is written as the first column).
trips_csv = os.path.join(path_output, "Trips.csv")
df_full.to_csv(trips_csv)

In [28]:
# Lift the column limit so that every column of the wide frame is displayed.
pd.options.display.max_columns = None
df_full.head(n=10)

Unnamed: 0,p_spot_start,b_lock_types_start,p_maintenance_start,p_bike_racks_start,p_place_type_start,p_number_start,p_uid_start,b_number_start,p_free_racks_start,b_boardcomputer_start,datetime_start,p_lng_start,b_electric_lock_start,p_bikes_start,p_booked_bikes_start,p_name_start,b_bike_type_start,p_lat_start,p_spot_end,p_maintenance_end,p_bike_racks_end,p_place_type_end,p_number_end,p_uid_end,p_free_racks_end,datetime_end,p_lng_end,p_bikes_end,p_booked_bikes_end,p_name_end,p_lat_end,air_deg,air_hum,rain_mm,rain_yn,sun_hour,wind_ms,trip_duration
0,False,analog_code_lock,False,0,12,0.0,12095573,93771,0,22532,2019-01-20 00:00:00,13.75038,False,1,0,BIKE 93771,15,51.071262,False,False,0,12,0.0,12099518,0,2019-01-20 00:28:00,13.693052,1,0,BIKE 93771,51.046234,-3.3,75.3,0.0,0.0,0.0,4.3,28.0
1,True,analog_code_lock,False,0,0,4486.0,10299640,93576,0,0,2019-01-20 00:05:00,13.744712,False,5,0,Wohnheim Gret-Palucca.Straße / Lenneplatz,15,51.03821,False,False,0,12,0.0,12099344,0,2019-01-20 00:15:00,13.744122,1,0,BIKE 93576,51.049069,-3.3,75.3,0.0,0.0,0.0,4.3,10.0
2,True,frame_lock,False,0,0,4483.0,10299584,93440,0,7551004130,2019-01-20 00:07:00,13.69113,True,1,0,Malterstraße (Haltestelle),0,51.04257,True,False,0,0,4458.0,4405670,0,2019-01-20 00:33:00,13.74773,3,0,Wundtstr. / Zellescher Weg,51.02837,-3.3,75.3,0.0,0.0,0.0,4.3,26.0
3,False,analog_code_lock,False,0,12,0.0,12098234,93322,0,1265,2019-01-20 00:07:00,13.68962,False,1,0,BIKE 93322,0,51.041798,False,False,0,12,0.0,12098234,0,2019-01-20 00:09:00,13.68962,1,0,BIKE 93322,51.041798,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
4,True,analog_code_lock,False,0,0,4373.0,264575,93585,0,0,2019-01-20 00:07:00,13.741257,False,5,0,Friedensstraße/Conradstr.,15,51.07174,True,False,0,0,4373.0,264575,0,2019-01-20 00:35:00,13.741257,5,0,Friedensstraße/Conradstr.,51.07174,-3.3,75.3,0.0,0.0,0.0,4.3,28.0
5,True,analog_code_lock,False,0,0,4310.0,45444,93660,0,0,2019-01-20 00:21:00,13.789338,False,1,0,Panometer,15,51.025741,True,False,0,0,4336.0,264532,0,2019-01-20 00:33:00,13.807733,1,0,Altenberger Platz,51.034938,-3.3,75.3,0.0,0.0,0.0,4.3,12.0
6,False,analog_code_lock,False,0,12,0.0,12098942,93478,0,1206,2019-01-20 00:26:00,13.710755,False,1,0,BIKE 93478,0,51.028163,True,False,0,0,4491.0,10299678,0,2019-01-20 00:28:00,13.742453,1,0,Räcknitzhöhe,51.023416,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
7,True,analog_code_lock,False,0,0,4392.0,264599,93577,0,0,2019-01-20 00:29:00,13.777725,False,1,0,Mosenstraße,15,51.04474,True,False,0,0,4392.0,264599,0,2019-01-20 00:31:00,13.777725,1,0,Mosenstraße,51.04474,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
8,True,analog_code_lock,False,0,0,4403.0,38955,93258,0,1322,2019-01-20 00:31:00,13.739294,False,2,0,Palaisplatz,0,51.060231,True,False,0,0,4327.0,121771,0,2019-01-20 01:02:00,13.750328,4,0,Schauburg,51.071144,-3.4,75.0,0.0,0.0,0.0,4.4,31.0
9,True,analog_code_lock,False,0,0,4348.0,264548,93785,0,22556,2019-01-20 00:33:00,13.722911,False,2,0,Nürnberger Ei,15,51.033767,True,False,0,0,4458.0,4405670,0,2019-01-20 00:49:00,13.74773,5,0,Wundtstr. / Zellescher Weg,51.02837,-3.4,75.0,0.0,0.0,0.0,4.4,16.0


In [29]:
# Newest trips first.
df_full.sort_values("datetime_start", ascending=False)

Unnamed: 0,p_spot_start,b_lock_types_start,p_maintenance_start,p_bike_racks_start,p_place_type_start,p_number_start,p_uid_start,b_number_start,p_free_racks_start,b_boardcomputer_start,datetime_start,p_lng_start,b_electric_lock_start,p_bikes_start,p_booked_bikes_start,p_name_start,b_bike_type_start,p_lat_start,p_spot_end,p_maintenance_end,p_bike_racks_end,p_place_type_end,p_number_end,p_uid_end,p_free_racks_end,datetime_end,p_lng_end,p_bikes_end,p_booked_bikes_end,p_name_end,p_lat_end,air_deg,air_hum,rain_mm,rain_yn,sun_hour,wind_ms,trip_duration
567438,False,frame_lock,False,0,12,0.0,27150164,93454,0,7551002419,2020-01-20 23:36:00,13.738484,True,1,0,BIKE 93454,71,51.044302,True,False,0,0,4504.0,10299848,0,2020-01-20 23:50:00,13.729830,17,0,Alexander-Puschkin-Platz,51.070350,-2.4,91.7,0.0,0.0,0.0,3.2,14.0
567437,True,frame_lock,False,0,0,4332.0,73,93824,0,7551016033,2020-01-20 23:33:00,13.741220,True,1,0,Bf. Dresden-Neustadt,71,51.065184,True,False,0,0,4356.0,264558,0,2020-01-20 23:45:00,13.723367,3,0,Hubertusplatz,51.089747,-2.4,91.7,0.0,0.0,0.0,3.2,12.0
567436,True,frame_lock,False,0,0,4373.0,264575,93334,0,7551005670,2020-01-20 23:31:00,13.741257,True,4,0,Friedensstraße/Conradstr.,71,51.071740,True,False,0,0,4327.0,121771,0,2020-01-20 23:36:00,13.750328,5,0,Schauburg,51.071144,-2.4,91.7,0.0,0.0,0.0,3.2,5.0
567435,True,frame_lock,False,0,0,4507.0,11248297,93908,0,7551012906,2020-01-20 23:29:00,13.703979,True,3,0,Tharandter Straße,71,51.043537,True,False,0,0,4458.0,4405670,0,2020-01-20 23:46:00,13.747730,4,0,Wundtstr. / Zellescher Weg,51.028370,-1.6,91.7,0.0,0.0,0.0,3.3,17.0
567434,True,frame_lock,False,0,0,4438.0,310792,93857,0,7551012762,2020-01-20 23:29:00,13.701679,True,6,0,Bonhoeffer Platz,71,51.038466,True,False,0,0,4504.0,10299848,0,2020-01-20 23:49:00,13.729830,16,0,Alexander-Puschkin-Platz,51.070350,-1.6,91.7,0.0,0.0,0.0,3.3,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,True,analog_code_lock,False,0,0,4373.0,264575,93585,0,0,2019-01-20 00:07:00,13.741257,False,5,0,Friedensstraße/Conradstr.,15,51.071740,True,False,0,0,4373.0,264575,0,2019-01-20 00:35:00,13.741257,5,0,Friedensstraße/Conradstr.,51.071740,-3.3,75.3,0.0,0.0,0.0,4.3,28.0
2,True,frame_lock,False,0,0,4483.0,10299584,93440,0,7551004130,2019-01-20 00:07:00,13.691130,True,1,0,Malterstraße (Haltestelle),0,51.042570,True,False,0,0,4458.0,4405670,0,2019-01-20 00:33:00,13.747730,3,0,Wundtstr. / Zellescher Weg,51.028370,-3.3,75.3,0.0,0.0,0.0,4.3,26.0
3,False,analog_code_lock,False,0,12,0.0,12098234,93322,0,1265,2019-01-20 00:07:00,13.689620,False,1,0,BIKE 93322,0,51.041798,False,False,0,12,0.0,12098234,0,2019-01-20 00:09:00,13.689620,1,0,BIKE 93322,51.041798,-3.3,75.3,0.0,0.0,0.0,4.3,2.0
1,True,analog_code_lock,False,0,0,4486.0,10299640,93576,0,0,2019-01-20 00:05:00,13.744712,False,5,0,Wohnheim Gret-Palucca.Straße / Lenneplatz,15,51.038210,False,False,0,12,0.0,12099344,0,2019-01-20 00:15:00,13.744122,1,0,BIKE 93576,51.049069,-3.3,75.3,0.0,0.0,0.0,4.3,10.0
