In [1]:
import pandas as pd
import random
from datetime import datetime
import time
from itertools import islice

In [2]:
def unique(sequence):
    """Return the items of ``sequence`` as a list without duplicates.

    The first occurrence of every item is kept, so the original order is
    preserved. Items must be hashable because a set tracks what was seen.
    """
    seen = set()
    ordered = []
    for item in sequence:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered

In [3]:
def parse_timestamp(el, fmt='%Y-%m-%d %H:%M:%S'):
    """Parse a timestamp string into a ``datetime``, ignoring fractional seconds.

    Parameters
    ----------
    el : str
        Timestamp text; everything from the first '.' onward is discarded
        before parsing.
    fmt : str, optional
        ``strptime`` layout of the remaining text. It is a defaulted parameter
        so the function no longer depends on a global that is only defined in
        a later cell.

    Returns
    -------
    datetime.datetime
        The parsed, timezone-naive timestamp.

    Raises
    ------
    ValueError
        If the text does not match ``fmt``.
    """
    whole_seconds = el.split('.')[0]
    return datetime.strptime(whole_seconds, fmt)

In [4]:
# strptime layout used by parse_timestamp for the text that remains once the
# fractional-second suffix has been cut off.
fmt = '%Y-%m-%d %H:%M:%S'

In [5]:
# Location of the input file passed to pd.read_csv below.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
url = "/bigdata/jelicicna/output_datasets/parked_time_data_nextbike_2020-10.csv.gz"
# Alternative input kept for reference (disabled):
#url = "parked_time_data_nextbike_2020-01.csv"

In [6]:
# Load the raw records from the ';'-separated file at `url`.
# low_memory=False asks pandas not to parse the file in internal chunks.
df = pd.read_csv(url, sep=';', low_memory=False)

In [7]:
# Convert both timestamp columns from text to datetimes in one vectorized pass
# per column instead of calling a Python function for every cell.
for column in ('parking_begin', 'parking_end'):
    if pd.api.types.is_datetime64_any_dtype(df[column]):
        # Already parsed: skipping keeps the cell safe to re-run (string
        # methods would fail on datetime values).
        continue
    # Keep only the text before the first '.', i.e. drop fractional seconds,
    # then parse with the notebook-wide layout `fmt`.
    df[column] = pd.to_datetime(df[column].str.split('.').str[0], format=fmt)

In [8]:
# Show the column labels of the loaded frame.
df.columns

Index(['lat', 'lng', 'parking_begin', 'parking_end', 'parked_time', 'uid',
       'name', 'address', 'available_bikes', 'maintenance', 'bike',
       'bike_racks', 'bikes', 'booked_bikes', 'free_racks',
       'free_special_racks', 'number', 'place_type', 'rack_locks',
       'special_racks', 'spot', 'terminal_type', 'city', 'country_code',
       'company', 'timezone'],
      dtype='object')

In [9]:
# Show every column when a DataFrame is displayed (None removes the limit).
# The canonical option key is used; the previous 'display.max_columns()' only
# resolved because pandas matches option names as regular expressions.
pd.set_option('display.max_columns', None)

In [10]:
# Empty dataframe that only fixes the column structure of the merged result:
# 'old_index' first, followed by the columns of the input data.
new_df = pd.DataFrame(
    columns=[
        'old_index',
        'lat', 'lng',
        'parking_begin', 'parking_end', 'parked_time',
        'uid', 'name', 'address',
        'available_bikes', 'maintenance', 'bike', 'bike_racks', 'bikes',
        'booked_bikes', 'free_racks', 'free_special_racks', 'number',
        'place_type', 'rack_locks', 'special_racks', 'spot', 'terminal_type',
        'city', 'country_code', 'company', 'timezone',
    ]
)


#### OLD_INDEX will be used later, when we iterate through smaller data frames that contain only the rows sharing the same uid. 
#### These subset data frames create new indexes, so we use the old index to compare positions based on each row's original place in the main df.

In [11]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,lat,lng,parking_begin,parking_end,parked_time,uid,name,address,available_bikes,maintenance,bike,bike_racks,bikes,booked_bikes,free_racks,free_special_racks,number,place_type,rack_locks,special_racks,spot,terminal_type,city,country_code,company,timezone
0,50.91756,6.940145,2020-10-01 00:00:02,2020-10-01 12:49:02,769.0,37174790,BIKE 22754,,1275,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
1,50.904742,6.96317,2020-10-01 15:01:02,2020-10-01 15:01:02,0.0,37199571,BIKE 22754,,1310,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
2,50.904745,6.963179,2020-10-01 15:02:03,2020-10-01 15:09:02,7.0,37199571,BIKE 22754,,1308,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
3,50.904874,6.962288,2020-10-01 15:10:02,2020-10-01 15:10:02,0.0,37206818,BIKE 22754,,1309,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
4,50.904932,6.962707,2020-10-01 15:11:02,2020-10-01 17:58:02,167.0,37206870,BIKE 22754,,1309,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin


In [12]:
# All uid values in row order, duplicates included.
uid_column = df['uid']
uidsToList = uid_column.tolist()

# Distinct uids, kept in the order in which they first appear in df.
uidsList = unique(uidsToList)

In [13]:
# Number of distinct uids in the data.
len(uidsList)

58987

In [14]:
# Turn the current row index into a regular column named "index" (modifies df in place).
# NOTE(review): not idempotent — run this cell only once per kernel session.
df.reset_index(inplace=True)

In [15]:
# Rename the "index" column to "old_index" so each row keeps a record of its
# original position in df (modifies df in place).
df.rename(columns={"index": "old_index"}, inplace=True) #create duplicate column of indexes - old_index

### We want to merge all the rows with the same uid into one row
### First we need to check GPS errors

The old index order is used to interrupt the processing of a uid if two of its rows are not consecutive in the original data frame. This eliminates the error that occurred when one uid appeared for two rides that are separated by many other rides on the same bike. This error is related to how the uid is generated.

In [16]:
# Find uids whose consecutive rows are separated by more than one minute.
# questionable_changes maps uid -> gap in minutes; when a uid has several such
# gaps, only the last one found is kept.
questionable_changes = {}

# Group once so each uid's rows are looked up directly instead of filtering
# the whole frame for every uid.
records_by_uid = df.groupby('uid', sort=False)

for uid in uidsList:
    single_uid_records = records_by_uid.get_group(uid)
    rows = single_uid_records.iterrows()

    # The first row of the uid is the reference the following rows are compared to.
    _, first_row_data = next(rows)
    last_end_time = first_row_data['parking_end']
    last_oi = first_row_data['old_index']

    for _, row in rows:
        # Stop as soon as the rows are no longer adjacent in the original
        # frame: the same uid then belongs to a separate, later parking.
        if last_oi + 1 != row['old_index']:
            break

        # NOTE(review): time.mktime interprets the naive timestamps in the
        # kernel's local timezone, so a gap spanning a DST switch depends on
        # the machine settings — confirm this is intended.
        d1_ts = time.mktime(row['parking_begin'].timetuple())
        d2_ts = time.mktime(last_end_time.timetuple())

        # Gap between the previous parking_end and this parking_begin, in minutes.
        minsDiff = round((int(d1_ts - d2_ts) / 60), 0)

        last_end_time = row['parking_end']
        last_oi = row['old_index']
        if minsDiff > 1:
            questionable_changes[uid] = minsDiff
        


#### The same uid indicates one parking - if there are multiple rows with the same uid, we expect that to be a GPS error that creates false rides lasting one minute while the bike was actually parked. The result of the questionable-changes search is a dictionary containing all uids that have multiple rows where the bike disappeared from its parking spot for more than one minute.

In [17]:
# Display the uid -> gap (minutes) mapping collected above.
questionable_changes

{37192836: 18.0,
 37210381: 1503.0,
 37440854: 3.0,
 37482666: 9541.0,
 37594055: 15.0,
 37621321: 38.0,
 37690267: 2.0,
 37690809: 2.0,
 37691004: 2.0,
 37691041: 2.0,
 37756670: 1043.0,
 38041513: 16.0,
 38508009: 81.0,
 38509402: 2.0,
 38509407: 2.0,
 38509437: 2.0}

In [18]:
# Number of uids with a questionable gap.
len(questionable_changes)

16

In [19]:
# Number of distinct uids among the dictionary keys; dict keys are unique by
# construction, so this always equals len(questionable_changes).
len(set(questionable_changes))

16

#### We checked for GPS errors and found multiple questionable changes, meaning that the bike was missing from GPS for longer than just a momentary variation in location. 

#### The changes in location were checked (using a distance control), and the conclusion is that everything with the same uid should be merged. Rows with the same uid will be merged, and their parked minutes added together, plus the difference in minutes that looks like a ride but is only a GPS error.

### Running cells to clean the data of extra rows (temporary solutions for the distance check and for choosing lat and lng)

In [20]:
# Merge the consecutive rows of every uid into a single parking record.
#
# Each merged record starts as a copy of the uid's first row. Every directly
# following row with the same uid extends it: parked_time grows by the row's
# own parked_time plus the gap to the previous row, and lat / lng /
# parking_end take the values of the latest row.
#
# The records are collected in a list and the frame is built once at the end;
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call.
merged_records = []

# Group once so each uid's rows are looked up directly instead of filtering
# the whole frame for every uid.
records_by_uid = df.groupby('uid', sort=False)

for uid in uidsList:
    single_uid_records = records_by_uid.get_group(uid)
    rows = single_uid_records.iterrows()

    _, first_row_data = next(rows)
    merged = first_row_data.to_dict()
    last_end_time = first_row_data['parking_end']
    current_sum = first_row_data['parked_time']
    last_oi = first_row_data['old_index']

    for _, row in rows:
        # Stop as soon as the rows are no longer adjacent in the original
        # frame: the same uid then belongs to a separate, later parking.
        if last_oi + 1 != row['old_index']:
            break

        # NOTE(review): time.mktime interprets the naive timestamps in the
        # kernel's local timezone, so a gap spanning a DST switch depends on
        # the machine settings — confirm this is intended.
        d1_ts = time.mktime(row['parking_begin'].timetuple())
        d2_ts = time.mktime(last_end_time.timetuple())

        # Minutes between the previous parking_end and this parking_begin;
        # they count as parked time because the apparent ride is a GPS error.
        minsDiff = round((int(d1_ts - d2_ts) / 60), 0)
        current_sum += row['parked_time'] + minsDiff

        merged['parked_time'] = current_sum
        merged['lat'] = row['lat']
        merged['lng'] = row['lng']
        merged['parking_end'] = row['parking_end']

        last_end_time = row['parking_end']
        last_oi = row['old_index']

    merged_records.append(merged)

# Keep the column structure already declared for new_df.
new_df = pd.DataFrame(merged_records, columns=new_df.columns)




In [21]:
# The helper column was only needed while merging; remove it from the result.
new_df = new_df.drop(columns=['old_index'])

In [22]:
# Preview the merged records; only the first rows are shown instead of
# rendering the whole frame.
new_df.head(10)

Unnamed: 0,lat,lng,parking_begin,parking_end,parked_time,uid,name,address,available_bikes,maintenance,bike,bike_racks,bikes,booked_bikes,free_racks,free_special_racks,number,place_type,rack_locks,special_racks,spot,terminal_type,city,country_code,company,timezone
0,50.917560,6.940145,2020-10-01 00:00:02,2020-10-01 12:49:02,769.0,37174790,BIKE 22754,,1275,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
1,50.904745,6.963179,2020-10-01 15:01:02,2020-10-01 15:09:02,8.0,37199571,BIKE 22754,,1310,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
2,50.904874,6.962288,2020-10-01 15:10:02,2020-10-01 15:10:02,0.0,37206818,BIKE 22754,,1309,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
3,50.904932,6.962707,2020-10-01 15:11:02,2020-10-01 17:58:02,167.0,37206870,BIKE 22754,,1309,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
4,50.921114,6.918753,2020-10-01 22:10:01,2020-10-02 08:37:02,627.0,37231210,BIKE 22754,,1305,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
5,50.929149,6.957356,2020-10-02 13:59:02,2020-10-02 15:42:03,103.0,37259531,BIKE 22754,,1308,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
6,50.895554,6.966998,2020-10-02 16:33:03,2020-10-02 17:27:02,54.0,37269773,BIKE 22754,,1249,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
7,50.933670,6.928282,2020-10-02 18:09:02,2020-10-02 23:32:02,323.0,37278019,BIKE 22754,,1283,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
8,50.929974,6.919886,2020-10-03 00:24:02,2020-10-03 06:07:02,343.0,37299042,BIKE 22754,,1300,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
9,50.935928,6.938952,2020-10-03 07:03:02,2020-10-03 07:48:03,45.0,37309640,BIKE 22754,,1321,False,True,0,1,0,0,0,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin


In [23]:
# Write the merged records to a ';'-separated CSV file, without the row index.
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
new_df.to_csv('/bigdata/jelicicna/output_datasets/processed_data_nextbike_2020-10.csv', sep=';', index=False) 