In [None]:
import pandas as pd
import random
from datetime import datetime
import time
from itertools import islice

In [None]:
def unique(sequence):
    """Return the items of ``sequence`` as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable because a set tracks what was seen.
    """
    seen = set()
    ordered = []
    for item in sequence:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered

In [None]:
def parse_timestamp(el, fmt='%Y-%m-%d %H:%M:%S'):
    """Parse a timestamp string into a ``datetime``, ignoring fractional seconds.

    Parameters
    ----------
    el : str
        Timestamp text. Everything from the first '.' onwards is discarded
        before parsing.
    fmt : str, optional
        ``datetime.strptime`` pattern for the part before the first '.'.
        It is a defaulted parameter so the function does not depend on a
        module-level ``fmt`` having been defined before it is called.

    Returns
    -------
    datetime
        The parsed timestamp, truncated to whole seconds.

    Raises
    ------
    ValueError
        If the text before the first '.' does not match ``fmt``.
    """
    whole_seconds = el.split('.')[0]
    return datetime.strptime(whole_seconds, fmt)

In [None]:
# strptime pattern for the timestamp text (date and time down to whole seconds).
fmt = '%Y-%m-%d %H:%M:%S'

In [None]:
# Relative path of the gzipped input CSV that is loaded into `df`.
url = "output_datasets/parked_time_data_nextbike_2020-01.csv.gz"

In [None]:
# Load the semicolon-separated input file into the main working frame.
# low_memory=False has pandas process the file in one pass instead of chunks.
df = pd.read_csv(url, sep=';', low_memory=False)

In [None]:
# Convert both parking timestamp columns from text to datetime values.
for timestamp_col in ('parking_begin', 'parking_end'):
    df[timestamp_col] = df[timestamp_col].map(parse_timestamp)

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Show every column when a frame is displayed (None removes the limit).
# The exact option key is used: the previous 'display.max_columns()' was not
# a registered option name and only resolved through pandas' regex matching.
pd.set_option('display.max_columns', None)

In [None]:
# Empty data frame that defines the column structure of the merged output.
new_df = pd.DataFrame(
    columns=[
        # position in the original frame, location and parking interval
        'old_index', 'lat', 'lng',
        'parking_begin', 'parking_end', 'parked_time',
        # identifying and descriptive columns
        'uid', 'name', 'address',
        'available_bikes', 'maintenance', 'bike', 'bike_racks', 'bikes',
        'booked_bikes', 'free_racks', 'free_special_racks', 'number',
        'place_type', 'rack_locks', 'special_racks', 'spot', 'terminal_type',
        'city', 'country_code', 'company', 'timezone',
    ]
)


#### OLD_INDEX is used later when we iterate through smaller data frames that contain only the rows with the same uid.
#### These subset data frames get their own indexes, so we use the old index to compare positions based on each row's original place in the main df.

In [None]:
# Preview the first 10 rows of the loaded data.
df.head(10)

In [None]:
# Collect every uid value in row order, then reduce to the distinct uids
# in order of first appearance.
uidsToList = df['uid'].tolist()
uidsList = unique(uidsToList) # removes duplicates from a list whilst preserving order

In [None]:
# Number of distinct uids.
len(uidsList)

In [None]:
# Turn the current row index into a regular column (named "index" by default).
df = df.reset_index()

In [None]:
# Keep the original row positions under the name "old_index".
df = df.rename(columns={"index": "old_index"})

### We want to merge all the rows with the same uid into one row
### First we need to check GPS errors

The old index order is used to interrupt the processing of a uid when two of its rows are not consecutive in the original data frame. This eliminates the error that occurred when one uid appeared for two rides that are separated by many other rides on the same bike. This error is related to how the uid is generated.

In [None]:
# Map uid -> gap in minutes between two consecutive rows of that uid, for
# gaps longer than one minute. If a uid has several such gaps, the last one
# found is the one kept.
questionable_changes = {}

# A single groupby pass replaces filtering the whole frame once per uid.
# sort=False yields the uids in order of first appearance.
for uid, single_uid_records in df.groupby('uid', sort=False):
    first_row_data = single_uid_records.iloc[0]
    last_end_time = first_row_data['parking_end']
    last_oi = first_row_data['old_index']

    for index, row in islice(single_uid_records.iterrows(), 1, None):
        # Stop processing this uid as soon as two of its rows are not
        # directly consecutive in the original data frame.
        if last_oi + 1 != row['old_index']:
            break

        # Subtract the datetimes directly. time.mktime would interpret them
        # in the machine's local timezone, making the gap depend on where
        # the notebook runs (e.g. across a local DST change).
        gap_seconds = (row['parking_begin'] - last_end_time).total_seconds()
        minsDiff = round(int(gap_seconds) / 60, 0)

        last_end_time = row['parking_end']
        last_oi = row['old_index']
        if minsDiff > 1:
            questionable_changes[uid] = minsDiff
        


In [None]:
# Display the collected uid -> gap-in-minutes mapping.
questionable_changes

In [None]:
# Number of uids with at least one recorded questionable gap.
len(questionable_changes)

In [None]:
# set() over a dict collects its keys, so this counts the distinct uids;
# dict keys are already unique, so it equals len(questionable_changes).
len(set(questionable_changes))

#### We checked for GPS errors and found multiple questionable changes, meaning that a bike was missing from GPS for longer than just a momentary variation in location.
#### The changes in location were checked (with a distance check) and the conclusion is that all rows with the same uid should be merged. Rows with the same uid will be merged and their parked minutes added together, plus the gap in minutes that looks like a ride but is only a GPS error.

### Running cells to clean the data of extra rows (temporary solutions for the distance check and for choosing lat and lng)

In [None]:
# Merge each run of consecutive rows that share a uid into one output row.
#
# The output row starts as a copy of the uid's first row. Every directly
# following row (old_index consecutive) adds its parked_time plus the gap in
# minutes since the previous row, and supplies the final lat, lng and
# parking_end. Processing of a uid stops at the first non-consecutive row.
#
# Rows are collected as dicts and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.

# Column order comes from the empty template frame. 'old_index' is placed
# first explicitly so this cell can be re-run after the later drop of it.
output_columns = ['old_index'] + [col for col in new_df.columns if col != 'old_index']
merged_rows = []

# A single groupby pass replaces filtering the whole frame once per uid.
# sort=False yields the uids in order of first appearance.
for uid, single_uid_records in df.groupby('uid', sort=False):
    first_row_data = single_uid_records.iloc[0]
    merged = {col: first_row_data[col] for col in output_columns}

    last_end_time = first_row_data['parking_end']
    current_sum = first_row_data['parked_time']
    last_oi = first_row_data['old_index']

    for index, row in islice(single_uid_records.iterrows(), 1, None):
        # Stop as soon as two rows of this uid are not directly consecutive
        # in the original data frame.
        if last_oi + 1 != row['old_index']:
            break

        # Subtract the datetimes directly. time.mktime would interpret them
        # in the machine's local timezone, making the gap depend on where
        # the notebook runs (e.g. across a local DST change).
        gap_seconds = (row['parking_begin'] - last_end_time).total_seconds()
        minsDiff = round(int(gap_seconds) / 60, 0)
        current_sum += row['parked_time'] + minsDiff

        merged['parked_time'] = current_sum
        merged['lat'] = row['lat']
        merged['lng'] = row['lng']
        merged['parking_end'] = row['parking_end']

        last_end_time = row['parking_end']
        last_oi = row['old_index']

    merged_rows.append(merged)

new_df = pd.DataFrame(merged_rows, columns=output_columns)




In [None]:
# The helper column with the original row positions is not part of the output.
new_df = new_df.drop(columns=['old_index'])

In [None]:
# Preview the first 10 merged rows.
new_df.head(10)

In [None]:
# Write the merged rows to a semicolon-separated CSV, without the index column.
new_df.to_csv('processed_data_nextbike_2020-01.csv', sep=';', index=False) 