In [None]:
import pandas as pd
from datetime import datetime
import time
from itertools import islice

In [None]:
def append_new_row(data_frame, row):
    """Return ``data_frame`` extended by one ride row started from ``row``.

    The new row describes a ride that begins where the parking record ``row``
    ends: the start position, start time and start uid are taken from ``row``,
    the station/bike attributes are copied through unchanged, and every
    end-of-ride field (``lat_end``, ``lng_end``, ``ride_end``, ``ride_time``,
    ``uid_end``) is set to the placeholder ``0``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Rides collected so far. It is not modified.
    row : mapping or pandas.Series
        One parking record; must provide every key read below
        (``lat``, ``lng``, ``parking_end``, ``uid``, ``parked_time``, ...).

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra row appended and a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the required keys.
    """
    record = {
        'lat_begin': row['lat'], 'lng_begin': row['lng'],
        'lat_end': 0, 'lng_end': 0,
        'ride_begin': row['parking_end'], 'ride_end': 0,
        'ride_time': 0,
        'uid_begin': row['uid'], 'uid_end': 0,
        'standing_before_ride': row['parked_time'],
        'name': row['name'], 'address': row['address'],
        'bike': row['bike'], 'bike_racks': row['bike_racks'],
        'bikes': row['bikes'], 'booked_bikes': row['booked_bikes'],
        'free_racks': row['free_racks'], 'free_special_racks': row['free_special_racks'],
        'maintenance': row['maintenance'], 'number': row['number'],
        'place_type': row['place_type'], 'rack_locks': row['rack_locks'],
        'special_racks': row['special_racks'], 'spot': row['spot'],
        'terminal_type': row['terminal_type'], 'city': row['city'],
        'country_code': row['country_code'], 'company': row['company'],
        'timezone': row['timezone'],
        'available_bikes': row['available_bikes'],
    }
    new_row = pd.DataFrame([record])
    if data_frame.empty:
        # Keep the first row as generic objects: callers overwrite the 0
        # placeholders with floats and timestamps afterwards, which typed
        # integer columns would not accept without a dtype change.
        new_row = new_row.astype(object)

    # DataFrame.append no longer exists in pandas 2.x; concat is its replacement.
    return pd.concat([data_frame, new_row], ignore_index=True)

In [None]:
def parse_timestamp(el, fmt='%Y-%m-%d %H:%M:%S'):
    """Parse a timestamp string into a ``datetime``, ignoring fractional seconds.

    Everything from the first ``'.'`` onwards is discarded before parsing, so
    ``'2019-12-01 08:15:30.123'`` and ``'2019-12-01 08:15:30'`` give the same
    result.

    Parameters
    ----------
    el : str
        Timestamp text.
    fmt : str, optional
        ``strptime`` layout of the text that remains after the fractional part
        is removed. The default is the layout this notebook uses, so the
        function no longer relies on a constant defined in a later cell; pass
        ``fmt`` explicitly for other layouts.

    Returns
    -------
    datetime.datetime
        The parsed (naive) timestamp.

    Raises
    ------
    ValueError
        If the text does not match ``fmt``.
    AttributeError
        If ``el`` is not a string (for example a missing value).
    """
    whole_seconds = el.split('.')[0]
    return datetime.strptime(whole_seconds, fmt)

In [None]:
# Timestamp layout (date and time down to whole seconds) used when parsing the parking columns.
fmt = '%Y-%m-%d %H:%M:%S'

### Process the data to create the new ride-duration column `ride_time` (in minutes)

#### Set `url` to the path of one of the following files:
1. 'cleaned_data_nextbike_yy-mm.csv.gz' if you want to work with original cleaned data 
2. 'processed_data_nextbike_yy-mm.csv.gz' if you want to work with data that has GPS errors removed 

**Change the name of the output file to reflect which input you chose.**

In [None]:
# Input table of parking records: a semicolon-separated CSV (file name ends in .gz).
url = "/bigdata/jelicicna/output_datasets/processed_data_nextbike_2019-12.csv.gz"
df = pd.read_csv(
    url,
    sep=';',
    low_memory=False,  # parse each column in one piece rather than in chunks
)

In [None]:
# Turn both parking timestamp columns from text into datetime values.
for timestamp_column in ('parking_begin', 'parking_end'):
    df[timestamp_column] = df[timestamp_column].map(parse_timestamp)

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Sanity check that the de-duplication from notebook @3 was applied:
# every uid must be unique, so this should display False.
df["uid"].duplicated().any()

In [None]:
# Preview the first parking records after timestamp parsing.
df.head()

In [None]:
# Empty rides table: fixes the column set and column order of the output.
new_df = pd.DataFrame(
    columns=[
        # where and when the ride starts and ends
        'lat_begin', 'lng_begin', 'lat_end', 'lng_end',
        'ride_begin', 'ride_end', 'ride_time',
        'uid_begin', 'uid_end', 'standing_before_ride',
        # attributes copied from the parking record the ride starts from
        'address', 'name', 'available_bikes', 'bike', 'bike_racks', 'bikes',
        'booked_bikes', 'free_racks', 'free_special_racks', 'maintenance',
        'number', 'place_type', 'rack_locks', 'special_racks', 'spot',
        'terminal_type', 'city', 'country_code', 'company', 'timezone',
    ]
)

In [None]:
bikesToList = df['name'].tolist()
# Distinct bike names in order of first appearance. dict.fromkeys de-duplicates
# exactly like set() but keeps a stable order, so the rides table is written in
# the same row order on every run (set order of strings changes between runs).
bikesList = list(dict.fromkeys(bikesToList))

In [None]:
# Build the rides table: for each bike, every pair of consecutive parking
# records yields one ride that starts when the earlier parking ends and ends
# when the later parking begins.
#
# Rows are collected in a plain list and turned into a DataFrame once, instead
# of growing the frame row by row (quadratic, and DataFrame.append no longer
# exists in pandas 2.x).
#
# NOTE(review): assumes the records of each bike appear in chronological order
# in df - TODO confirm against the notebook that produced the input.

# Columns copied unchanged from a parking record into the ride that starts there.
passthrough_columns = (
    'name', 'address', 'bike', 'bike_racks', 'bikes', 'booked_bikes',
    'free_racks', 'free_special_racks', 'maintenance', 'number',
    'place_type', 'rack_locks', 'special_racks', 'spot', 'terminal_type',
    'city', 'country_code', 'company', 'timezone', 'available_bikes',
)

ride_records = []
# sort=False keeps bikes in order of first appearance; rows with a missing
# bike name are skipped by groupby.
for _, single_bike_records in df.groupby('name', sort=False):
    open_ride = None  # ride started by the previous parking record of this bike

    for _, row in single_bike_records.iterrows():
        if open_ride is not None:
            # Close the previous ride with the data of the current parking.
            # Plain datetime subtraction keeps the result independent of the
            # machine's local timezone (time.mktime is not).
            # NOTE(review): treats the naive timestamps as sharing one fixed
            # UTC offset - confirm for months containing a DST change.
            elapsed = row['parking_begin'] - open_ride['ride_begin']
            open_ride['ride_time'] = round(int(elapsed.total_seconds()) / 60, 0)
            open_ride['lat_end'] = row['lat']
            open_ride['lng_end'] = row['lng']
            open_ride['ride_end'] = row['parking_begin']
            open_ride['uid_end'] = row['uid']

        # Every parking record opens a new ride; its end fields keep the 0
        # placeholder until the next record of the same bike fills them in.
        open_ride = {
            'lat_begin': row['lat'], 'lng_begin': row['lng'],
            'lat_end': 0, 'lng_end': 0,
            'ride_begin': row['parking_end'], 'ride_end': 0,
            'ride_time': 0,
            'uid_begin': row['uid'], 'uid_end': 0,
            'standing_before_ride': row['parked_time'],
        }
        open_ride.update({column: row[column] for column in passthrough_columns})
        ride_records.append(open_ride)

# Reuse the column order of the empty template; rebuilding the frame from the
# records also makes this cell safe to re-run.
new_df = pd.DataFrame(ride_records, columns=new_df.columns)

In [None]:
# Preview the first assembled rides.
new_df.head()

In [None]:
# Column dtypes and non-null counts of the assembled rides table.
new_df.info()

### The previous code assigns 0 to the features that should be updated during iteration with the actual end values of a ride.
### As a result, each bike has one extra row whose values were never updated: the iteration reached the last record of that bike, so the 0 placeholders remained.
### The following cells check for such rows - ideally the number of these rows should match the number of bikes.

In [None]:
# Rides whose 'ride_end' still holds the 0 placeholder, i.e. rides that were never closed.
control_df = new_df.loc[new_df['ride_end'].eq(0)]
control_df.info()

In [None]:
# Number of distinct bike names, for comparison with the number of unfinished rides.
len(bikesList)

In [None]:
# Index labels of the unfinished rides; their count is displayed for the comparison.
incorrect_indexes = [label for label in control_df.index.values]
len(incorrect_indexes)

In [None]:
# Remove all unfinished rides in one call. errors='ignore' skips labels that
# are no longer present, so re-running the cell is harmless, and a single drop
# avoids scanning the whole index once per label.
new_df = new_df.drop(index=incorrect_indexes, errors='ignore')

In [None]:
# Renumber the rows 0..n-1 after the drops; the old labels are kept in a new
# 'index' column, which the following cell removes.
new_df = new_df.reset_index()

In [None]:
# Discard the helper column holding the old row labels.
new_df = new_df.drop(columns=['index'])

In [None]:
#new_df['ride_time'].value_counts(dropna=False)

In [None]:
# Final structure of the rides table before it is written to disk.
new_df.info()

In [None]:
# Write the rides table as a semicolon-separated CSV without the row index.
new_df.to_csv(
    '/bigdata/jelicicna/output_datasets/rides_table_nextbike_2019-12.csv',
    sep=';',
    index=False,
)