In [1]:
import pandas as pd
from datetime import datetime
import time
from itertools import islice

In [2]:
def append_new_row(data_frame, row):
    """Return ``data_frame`` extended by one new, still-open ride built from ``row``.

    The start-of-ride fields are taken from ``row`` (``lat``/``lng``/``uid``,
    ``time_end`` as the ride start, ``standing_time_minutes`` as the standing
    time before the ride). The end-of-ride fields (``lat_end``, ``lng_end``,
    ``ride_end``, ``uid_end``, ``ride_time_minutes``) are filled with the
    placeholder ``0`` so the caller can overwrite them later. All remaining
    descriptive fields are copied from ``row`` unchanged.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to extend. It is not modified; a new frame is returned.
    row : mapping or pandas.Series
        One source record; must provide every key read below.

    Returns
    -------
    pandas.DataFrame
        ``data_frame`` plus the new row, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the required keys.
    """
    open_ride = {
        'lat_begin': row['lat'], 'lng_begin': row['lng'],
        'lat_end': 0, 'lng_end': 0,
        'ride_begin': row['time_end'], 'ride_end': 0,
        'ride_time_minutes': 0,
        'uid_begin': row['uid'], 'uid_end': 0,
        'standing_before_ride': row['standing_time_minutes'],
        'name': row['name'], 'address': row['address'],
        'bike': row['bike'], 'bike_racks': row['bike_racks'],
        'bikes': row['bikes'], 'booked_bikes': row['booked_bikes'],
        'free_racks': row['free_racks'], 'free_special_racks': row['free_special_racks'],
        'maintenance': row['maintenance'], 'number': row['number'],
        'place_type': row['place_type'], 'rack_locks': row['rack_locks'],
        'special_racks': row['special_racks'], 'spot': row['spot'],
        'terminal_type': row['terminal_type'], 'city': row['city'],
        'country_code': row['country_code'], 'company': row['company'],
        'timezone': row['timezone'],
        'available_bikes': row['available_bikes'],
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # The one-row frame is object-typed on purpose: the 0 placeholders are later
    # overwritten with floats / timestamp strings, which must not clash with an
    # inferred integer column dtype.
    new_row = pd.DataFrame([open_ride], dtype=object)
    return pd.concat([data_frame, new_row], ignore_index=True)

### Processes the data in order to create new column  'ride_time_minutes' 

#### As URL set one of the following:
1. 'cleaned_data_nextbike_yy-mm.csv.gz' if you want to work with original cleaned data 
2. 'processed_data_nextbike_yy-mm.csv.gz' if you want to work with data that has GPS errors removed 

**Change the name of the output file (written in the last cell) so that it describes the choice made above.**

In [3]:
# Semicolon-separated input table; read in one pass (low_memory=False).
url = "output_datasets/processed_data_nextbike_2020-07.csv.gz"
df = pd.read_csv(filepath_or_buffer=url, sep=';', low_memory=False)

In [4]:
# strptime pattern for timestamps once their fractional seconds are stripped.
fmt = "%Y-%m-%d %H:%M:%S"

In [5]:
# Lift the row limit so displayed frames are never truncated.
pd.options.display.max_rows = None

In [6]:
# Sanity check that the cleaning in notebook @3 was done: 'uid' should hold no duplicates (expect False).
df.duplicated(subset="uid").any()

False

In [7]:
# Column layout of the rides table: begin/end position and time, ride duration,
# begin/end uid, standing time before the ride, then the descriptive fields.
ride_position_columns = ['lat_begin', 'lng_begin', 'lat_end', 'lng_end']
ride_timing_columns = ['ride_begin', 'ride_end', 'ride_time_minutes']
ride_identity_columns = ['uid_begin', 'uid_end', 'standing_before_ride']
descriptive_columns = [
    'address', 'name', 'available_bikes', 'bike', 'bike_racks', 'bikes',
    'booked_bikes', 'free_racks', 'free_special_racks', 'maintenance', 'number',
    'place_type', 'rack_locks', 'special_racks', 'spot', 'terminal_type', 'city',
    'country_code', 'company', 'timezone',
]

# Empty dataframe with the new column structure.
new_df = pd.DataFrame(
    columns=ride_position_columns + ride_timing_columns + ride_identity_columns + descriptive_columns
)

In [8]:
# All bike names, one entry per source row.
bikesToList = df['name'].tolist()
# Distinct bike names in order of first appearance. A plain set() would give
# a hash-randomised order for strings, so the order in which bikes are
# processed (and hence the row order of the result) would change between
# kernel sessions.
bikesList = list(dict.fromkeys(bikesToList))

In [9]:
def _parse_timestamp(stamp):
    """Parse a 'YYYY-mm-dd HH:MM:SS[.ffffff]' string, discarding the fractional seconds."""
    return datetime.strptime(stamp.split('.')[0], fmt)


# Every ride is collected as a plain dict and the frame is built once at the
# end. Growing a DataFrame row by row copies the whole frame on every append,
# which makes the loop quadratic in the number of records.
ride_records = []

for name in bikesList: #repeat for each bike/bike name
    single_bike_records = df[df['name'] == name]
    open_ride = None       # ride started by the previous record, still missing its end values
    last_end_time = None   # 'time_end' of the previous record of this bike

    for _, row in single_bike_records.iterrows():
        if open_ride is not None:
            # The ride lasts from the end of the previous record to the
            # beginning of this one. Subtracting the parsed datetimes directly
            # keeps the result independent of the machine's local timezone/DST
            # rules (time.mktime interprets its argument as local time).
            elapsed = _parse_timestamp(row['time_begin']) - _parse_timestamp(last_end_time)
            open_ride['ride_time_minutes'] = round(int(elapsed.total_seconds()) / 60, 0)
            open_ride['lat_end'] = row['lat']
            open_ride['lng_end'] = row['lng']
            open_ride['ride_end'] = row['time_begin']
            open_ride['uid_end'] = row['uid']

        last_end_time = row['time_end']

        # Start the next ride at this record; the end values stay at the
        # placeholder 0 until the following record of the same bike is seen.
        open_ride = {
            'lat_begin': row['lat'], 'lng_begin': row['lng'],
            'lat_end': 0, 'lng_end': 0,
            'ride_begin': row['time_end'], 'ride_end': 0,
            'ride_time_minutes': 0,
            'uid_begin': row['uid'], 'uid_end': 0,
            'standing_before_ride': row['standing_time_minutes'],
            'name': row['name'], 'address': row['address'],
            'bike': row['bike'], 'bike_racks': row['bike_racks'],
            'bikes': row['bikes'], 'booked_bikes': row['booked_bikes'],
            'free_racks': row['free_racks'], 'free_special_racks': row['free_special_racks'],
            'maintenance': row['maintenance'], 'number': row['number'],
            'place_type': row['place_type'], 'rack_locks': row['rack_locks'],
            'special_racks': row['special_racks'], 'spot': row['spot'],
            'terminal_type': row['terminal_type'], 'city': row['city'],
            'country_code': row['country_code'], 'company': row['company'],
            'timezone': row['timezone'],
            'available_bikes': row['available_bikes'],
        }
        ride_records.append(open_ride)

# Build the rides table in one go, keeping the column order declared for new_df.
# Rebuilding (instead of appending to) new_df also makes this cell safe to re-run.
new_df = pd.DataFrame(ride_records, columns=new_df.columns)

In [10]:
# Preview the first rows of the rides table.
new_df.head()

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,uid_begin,uid_end,standing_before_ride,...,number,place_type,rack_locks,special_racks,spot,terminal_type,city,country_code,company,timezone
0,50.921618,6.933237,50.913,6.92249,2020-07-01 09:31:02.322428,2020-07-01 09:49:01.995455,18,31804315,31819860,571.0,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
1,50.913049,6.92249,50.9108,6.94197,2020-07-01 10:35:02.336121,2020-07-01 10:44:02.333799,9,31819860,31821838,46.0,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
2,50.910794,6.941967,50.9314,6.91823,2020-07-01 11:32:02.897704,2020-07-01 11:55:01.902627,23,31821838,31824497,48.0,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
3,50.931395,6.918226,50.9298,6.91381,2020-07-01 12:03:01.601807,2020-07-01 12:24:02.273590,21,31824497,31825694,8.0,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
4,50.929816,6.913806,50.9298,6.91362,2020-07-01 17:59:02.213975,2020-07-01 18:05:02.088433,6,31825694,31843292,335.0,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin


### The previous code assigns placeholder 0 values to the end-of-ride features; these are meant to be overwritten during the iteration with the actual end values of each ride.
### As a result, each bike ends up with one extra row whose values were never updated: the iteration reached the last record of that bike, so its 0 placeholders were left in place.
### In the following cells we check for such rows - ideally their number should match the number of bikes.

In [None]:
# Rows whose 'ride_end' still equals the 0 placeholder.
control_df = new_df.loc[new_df['ride_end'].eq(0)]
control_df

In [None]:
# Number of distinct bike names, for comparison with the number of unfinished rows.
len(bikesList)

In [None]:
# Index labels of the rows that still carry the 0 placeholder, and how many there are.
incorrect_indexes = [*control_df.index.values]
len(incorrect_indexes)

In [None]:
# Remove all unfinished rows in a single call instead of one in-place drop per
# label. errors='ignore' skips labels that are no longer present, so the cell
# can be re-run safely.
new_df = new_df.drop(index=incorrect_indexes, errors='ignore')
    

In [None]:
# Renumber the rows from 0; the previous index labels are kept as a new 'index' column.
new_df = new_df.reset_index(drop=False)

In [None]:
# Drop the helper 'index' column left behind by reset_index(). The result has
# to be assigned back, otherwise the column stays in new_df and is written to
# the output file. errors='ignore' keeps the cell re-runnable.
new_df = new_df.drop(columns=['index'], errors='ignore')
new_df.head()

In [None]:
# Frequency of each ride duration in minutes, missing values included.
new_df.loc[:, 'ride_time_minutes'].value_counts(dropna=False)

In [None]:
# Source records of one example bike, for manual comparison with the rides table.
single_bike = df.loc[df['name'].eq('BIKE 21423')]
single_bike

In [None]:
# Rides derived for the same example bike.
single_bike_new = new_df.loc[new_df['name'].eq('BIKE 21423')]
single_bike_new

In [None]:
# The rides were built from the 2020-07 input, so the output is labelled
# 2020-07 as well; the previous '2020-06' name mislabelled the month.
# Adjust the name further if the input file above is changed.
new_df.to_csv('rides_table_nextbike_2020-07.csv', sep=';', index=False)