In [54]:
import os
import pandas as pd
import glob

In [55]:
# Input directory of the 2019 Dresden data, built relative to the current
# working directory.
working_dir = os.getcwd()
path = os.path.join(working_dir, '../data/input/dresden/2019/')

## Read Data

In [56]:
# Collect every CSV file that sits exactly one directory level below `path`.
# glob returns its matches in arbitrary, filesystem-dependent order, so the
# list is sorted to make the row order (and the fresh index) of the
# concatenated frame reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(path, "*", "*.csv")))

# Fail with a clear message instead of pandas' "No objects to concatenate"
# when the input directory is missing or empty.
if not all_files:
    raise FileNotFoundError("No CSV files found below {!r}".format(path))

# Read each file (first row is the header, no index column) and stack all of
# them into a single frame with a new 0..n-1 index.
df = pd.concat(
    [pd.read_csv(filename, index_col=None, header=0) for filename in all_files],
    axis=0,
    ignore_index=True,
)

## Transform Data

In [57]:
# Number of rows that repeat an earlier row in every column except the
# coordinates p_lat / p_lng; 0 means there are no duplicates.
compared_columns = df.columns.difference(["p_lat", "p_lng"])
int(df.duplicated(subset=compared_columns).sum())

0

In [58]:
# Count the rows marked as trip starts and as trip ends, then report how many
# more starts than ends there are.
trip_counts = {label: int((df["trip"] == label).sum()) for label in ("start", "end")}
print("Starts:", trip_counts["start"])
print("Ends:", trip_counts["end"])
print("Difference:", trip_counts["start"]-trip_counts["end"])

Starts: 565996
Ends: 550350
Difference: 15646


In [59]:
# df_2: only the rows whose trip column is "start" or "end"
is_trip_boundary = df["trip"].isin(["start", "end"])
df_2 = df[is_trip_boundary]
print("Dropped", len(df)-len(df_2), "first and last Values")

Dropped 290120 first and last Values


In [60]:
# df_3: the rows of df_2 ordered by bike number and, within a bike, by datetime
df_3 = df_2.sort_values(by=["b_number", "datetime"])
# Positional window picked by the author because it shows two examples of
# multiple starts in a row; the position depends on the loaded data.
df_3.iloc[500010:500030][["b_number", "datetime", "trip"]]

Unnamed: 0,b_number,datetime,trip
669292,93554,2019-07-05 13:37:00,start
669293,93554,2019-07-05 13:46:00,end
669294,93554,2019-07-05 14:20:00,start
669295,93554,2019-07-05 14:35:00,end
669296,93554,2019-07-05 16:44:00,start
669297,93554,2019-07-05 16:51:00,start
669298,93554,2019-07-05 17:29:00,end
669299,93554,2019-07-05 18:48:00,start
669300,93554,2019-07-05 19:18:00,end
669301,93554,2019-07-05 20:20:00,start


In [61]:
# sr_1: True where a row's trip value differs from the row directly above it
# in the sorted frame, False where it repeats it (e.g. a second "start" in a
# row). The very first row has no predecessor and is therefore True.
# NOTE(review): the comparison does not restart at b_number boundaries, so the
# first row of a bike is compared with the last row of the previous bike —
# confirm this is intended.
previous_trip = df_3["trip"].shift()
sr_1 = df_3["trip"].ne(previous_trip)
df_3["valid_start"] = sr_1



In [70]:
# df_4: the rows of df_3 flagged in valid_start, i.e. rows whose trip value
# differs from the row before them (repeated starts/ends are removed)
keep_mask = df_3["valid_start"].eq(True)
df_4 = df_3[keep_mask]
print("Deleted entries", len(df_3)-len(df_4))
print("Correct entries", len(df_4))

Deleted entries 15646
Correct entries 1100700


In [71]:
# Look for neighbouring rows in df_4 that carry the same trip value:
# `test` is False wherever a row has the same trip value as the row after it.
# The displayed frame lists exactly those rows (empty means none exist).
next_trip = df_4["trip"].shift(-1)
test = df_4["trip"].ne(next_trip)
print("Entries where Bike X ends with an end and bike Y starts with an end")
test.to_frame().loc[~test]

Entries where Bike X ends with an end and bike Y starts with an end


Unnamed: 0,trip


In [72]:
# Split df_4 into its "start" rows and its "end" rows; both halves get a fresh
# 0..n-1 index so that they can be lined up by position afterwards.
df_starts, df_ends = (
    df_4.loc[df_4["trip"] == kind].reset_index(drop=True)
    for kind in ("start", "end")
)



In [73]:
# Show the first five starts and the first five ends (key columns only) so the
# positional alignment of the two frames can be checked by eye.
preview_columns = ["b_number", "datetime", "trip"]
print(df_starts[preview_columns].head(), "\n")
print(df_ends[preview_columns].head())

   b_number             datetime   trip
0     11225  2019-04-29 06:31:00  start
1     11225  2019-04-29 11:27:00  start
2     11225  2019-04-29 11:58:00  start
3     11225  2019-04-29 12:22:00  start
4     11225  2019-04-29 13:00:00  start 

   b_number             datetime trip
0     11225  2019-04-29 06:35:00  end
1     11225  2019-04-29 11:33:00  end
2     11225  2019-04-29 12:03:00  end
3     11225  2019-04-29 12:39:00  end
4     11225  2019-04-29 13:14:00  end


In [74]:
# Pair the i-th start with the i-th end by position. Both frames come from the
# same source frame and share all column names, so every column gets a
# "_start" / "_end" suffix; the inner join keeps only positions that exist in
# both frames. The helper flag valid_start is no longer needed afterwards.
df_merged = (
    df_starts.add_suffix("_start")
    .join(df_ends.add_suffix("_end"), how="inner")
    .drop(columns=["valid_start_start", "valid_start_end"])
)

# Positional pairing can combine the dangling last start of one bike with the
# leading end of the next bike. A trip cannot change bikes, so such pairs are
# removed and their number is reported.
same_bike = df_merged["b_number_start"] == df_merged["b_number_end"]
print("Dropped", int((~same_bike).sum()), "pairs whose start and end belong to different bikes")
df_merged = df_merged[same_bike].reset_index(drop=True)
df_merged.head()

Unnamed: 0,p_rack_locks_start,p_bike_racks_start,b_state_start,p_spot_start,b_active_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number_start,trip_start,...,b_pedelec_battery_end,p_lng_end,b_boardcomputer_end,p_maintenance_end,p_terminal_type_end,p_bike_end,p_bike_types_end,b_battery_pack_end,p_special_racks_end,p_free_special_racks_end
0,False,0,ok,True,True,0,0,2019-04-29 06:31:00,11225,start,...,,13.769281,22919,False,,False,"{""15"": 1}",,0.0,0.0
1,False,0,ok,False,True,0,12,2019-04-29 11:27:00,11225,start,...,,8.770495,22919,False,,True,"{""15"": 1}",,0.0,0.0
2,False,0,ok,False,True,0,12,2019-04-29 11:58:00,11225,start,...,,8.772863,22919,False,,True,"{""15"": 1}",,0.0,0.0
3,False,0,ok,False,True,0,12,2019-04-29 12:22:00,11225,start,...,,8.77455,22919,False,,True,"{""15"": 1}",,0.0,0.0
4,False,0,ok,False,True,0,12,2019-04-29 13:00:00,11225,start,...,,8.763238,22919,False,,True,"{""15"": 1}",,0.0,0.0
