In [1]:
import pandas as pd
import random
from datetime import datetime
import time
from itertools import islice

In [2]:
def unique(sequence):
    """Return the items of `sequence` as a list with duplicates dropped.

    The first occurrence of every item is kept and the original order is
    preserved. Items must be hashable because a set tracks what was seen.
    """
    seen = set()
    ordered = []
    for item in sequence:
        if item in seen:
            continue  # already emitted once, skip the repeat
        seen.add(item)
        ordered.append(item)
    return ordered

In [3]:
# Name of the input file: a gzip-compressed CSV, given as a relative path
# (despite the variable name it is not a web URL).
url = "station_times_for_fixing_data_nextbike_2020-01.csv.gz"

In [4]:
# Load the ';'-separated file into `df`. low_memory=False makes pandas parse
# the file in one pass instead of in chunks, avoiding mixed per-chunk dtypes.
df = pd.read_csv(url, sep=';', low_memory=False)

In [5]:
# strptime pattern for timestamps truncated to whole seconds
# (the code that uses it strips the fractional part after the '.' first).
fmt = '%Y-%m-%d %H:%M:%S'

In [6]:
# Show the column labels of the loaded frame.
df.columns

Index(['lat', 'lng', 'time_begin', 'time_end', 'standing_time_minutes', 'uid',
       'name', 'address', 'bike', 'bike_racks', 'bikes', 'booked_bikes',
       'free_racks', 'free_special_racks', 'maintenance', 'number',
       'place_type', 'rack_locks', 'special_racks', 'spot', 'terminal_type',
       'city', 'country_code', 'company', 'timezone', 'available_bikes'],
      dtype='object')

In [6]:
# Never truncate columns when a frame is displayed.
# The option key is 'display.max_columns'. The earlier spelling with a trailing
# '()' only resolved because pandas matches option keys as regular expressions.
pd.set_option('display.max_columns', None)

In [7]:
# Empty frame with the output layout: a leading `old_index` column followed by
# the columns of the source data.
new_df = pd.DataFrame(
    columns=[
        'old_index',
        'lat', 'lng',
        'time_begin', 'time_end', 'standing_time_minutes',
        'uid', 'name', 'address',
        'bike', 'bike_racks', 'bikes', 'booked_bikes',
        'free_racks', 'free_special_racks',
        'maintenance', 'number', 'place_type',
        'rack_locks', 'special_racks', 'spot', 'terminal_type',
        'city', 'country_code', 'company', 'timezone',
        'available_bikes',
    ]
)


#### `old_index` is used later, when we iterate through smaller data frames that contain only the rows sharing one uid.
#### These subset data frames get their own indexes, so we use `old_index` to compare row positions as they were in the main df.

In [2]:
# Display the loaded frame.
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is out of sequence, so it was apparently last run before `df` existed.
# Re-run the notebook top to bottom to refresh it.
df

NameError: name 'df' is not defined

In [9]:
# Every uid in row order (with repeats), then the distinct uids in order of
# first appearance. uidsList drives the per-uid loops further down.
uidsToList = df['uid'].tolist()
uidsList = unique(uidsToList) #removes duplicates from a list whilst preserving order

In [10]:
# Number of distinct uids found in df.
len(uidsList)

65179

In [11]:
# Move the current row index into an ordinary column (named "index" by pandas).
df = df.reset_index()

In [12]:
# Keep the original row position under the name `old_index`.
df = df.rename(columns={"index": "old_index"})

### We want to merge all the rows with the same uid into one row
### First we need to check GPS errors

The old index order is used to interrupt the processing of a uid when two of its rows are not consecutive in the original data frame. This eliminates the error that occurred when one uid appeared for two rides that are separated by many other rides on the same bike. This error is related to how the uid is generated.

In [13]:
# Find uids whose consecutive rows are separated by more than one minute.
# Result: questionable_changes maps uid -> gap in minutes (float).
questionable_changes = {}

# Group once and look each uid up, instead of re-filtering the whole frame
# for every uid. Rows inside a group keep their original order.
records_by_uid = df.groupby('uid', sort=False)

for uid in uidsList:
    single_uid_records = records_by_uid.get_group(uid)
    first_row_data = single_uid_records.iloc[0]
    last_end_time = first_row_data['time_end']
    last_oi = first_row_data['old_index']

    for _, row in islice(single_uid_records.iterrows(), 1, None):
        # Stop at the first row that does not directly follow the previous one
        # in the original frame: the same uid can reappear for an unrelated ride.
        if row['old_index'] != last_oi + 1:
            break

        # Timestamps carry fractional seconds; cut them off before parsing.
        d1 = datetime.strptime(row['time_begin'].split('.')[0], fmt)
        d2 = datetime.strptime(last_end_time.split('.')[0], fmt)

        # Subtract the datetimes directly. Going through time.mktime() would
        # interpret them in the machine's local time zone and can shift the
        # gap by an hour across a DST change.
        minsDiff = round(int((d1 - d2).total_seconds()) / 60, 0)

        last_end_time = row['time_end']
        last_oi = row['old_index']
        if minsDiff > 1:
            # A later gap for the same uid overwrites an earlier one, so only
            # the last gap above one minute is kept per uid.
            questionable_changes[uid] = minsDiff
        


In [14]:
# Inspect the uid -> gap-in-minutes mapping.
questionable_changes

{26870459: 1558.0,
 26556383: 3.0,
 27375414: 2.0,
 26561791: 1254.0,
 26550515: 14.0,
 27375246: 13.0,
 27411154: 2.0,
 26612553: 2667.0,
 27239943: 1045.0,
 27215287: 5.0,
 26619292: 20446.0,
 27375320: 2.0,
 27411206: 2.0,
 27310782: 2.0,
 27004024: 6.0,
 27163803: 5.0,
 26897899: 44.0,
 27373636: 2531.0,
 27180239: 1420.0,
 26962478: 107.0,
 26902080: 5.0,
 26919602: 1473.0,
 26676668: 4.0,
 26830402: 4.0,
 27346323: 1465.0,
 26574707: 4738.0,
 27411156: 2.0,
 27424198: 2.0,
 26898627: 867.0,
 27410752: 2.0,
 27450077: 2.0,
 27424820: 2.0}

In [15]:
# Number of uids flagged with a gap.
len(questionable_changes)

32

In [16]:
# set() over a dict collects its keys, which are already unique, so this is
# always equal to len(questionable_changes).
len(set(questionable_changes))

32

#### We checked for GPS errors and found multiple questionable changes, meaning that a bike was missing from GPS for longer than just a momentary variation in location.
#### The changes in location were checked (with a distance check), and the conclusion is that everything with the same uid should be merged. Rows with the same uid will be merged, and their standing minutes added together, plus the difference in minutes that looks like a ride but is only a GPS error.

### Running cells to clean the data of extra rows (temporary solutions for the distance check and for choosing lat and lng)

In [17]:
# Merge all consecutive rows that share a uid into one output row.
#
# For every uid the first row supplies all column values. Each directly
# following row of the same uid then
#   * adds its standing time plus the gap since the previous row's end, and
#   * overwrites lat, lng and time_end with its own values.
# Rows of a uid that come after a break in the original order are not merged.
#
# The rows are collected in a list and turned into a frame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Building the frame from scratch also makes the cell safe to re-run.
merged_columns = [
    'old_index', 'lat', 'lng', 'time_begin', 'time_end', 'standing_time_minutes',
    'uid', 'name', 'address', 'bike', 'bike_racks', 'bikes', 'booked_bikes',
    'free_racks', 'free_special_racks', 'maintenance', 'number', 'place_type',
    'rack_locks', 'special_racks', 'spot', 'terminal_type', 'city',
    'country_code', 'company', 'timezone', 'available_bikes',
]

# Group once and look each uid up, instead of re-filtering the whole frame
# for every uid. Rows inside a group keep their original order.
records_by_uid = df.groupby('uid', sort=False)
merged_rows = []

for uid in uidsList:
    single_uid_records = records_by_uid.get_group(uid)
    first_row_data = single_uid_records.iloc[0]

    merged = {column: first_row_data[column] for column in merged_columns}
    last_end_time = first_row_data['time_end']
    current_sum = first_row_data['standing_time_minutes']
    last_oi = first_row_data['old_index']

    for _, row in islice(single_uid_records.iterrows(), 1, None):
        # Stop at the first row that does not directly follow the previous one
        # in the original frame: the same uid can reappear for an unrelated ride.
        if row['old_index'] != last_oi + 1:
            break

        # Timestamps carry fractional seconds; cut them off before parsing.
        d1 = datetime.strptime(row['time_begin'].split('.')[0], fmt)
        d2 = datetime.strptime(last_end_time.split('.')[0], fmt)

        # Subtract the datetimes directly. Going through time.mktime() would
        # interpret them in the machine's local time zone and can shift the
        # gap by an hour across a DST change.
        minsDiff = round(int((d1 - d2).total_seconds()) / 60, 0)
        current_sum += row['standing_time_minutes'] + minsDiff

        merged['standing_time_minutes'] = current_sum
        merged['lat'] = row['lat']
        merged['lng'] = row['lng']
        merged['time_end'] = row['time_end']

        last_end_time = row['time_end']
        last_oi = row['old_index']

    merged_rows.append(merged)

new_df = pd.DataFrame(merged_rows, columns=merged_columns)




In [19]:
# The helper column is no longer needed once the rows are merged.
new_df.drop(columns=['old_index'], inplace=True)

In [20]:
# Display the merged frame (one row per uid).
new_df

Unnamed: 0,lat,lng,time_begin,time_end,standing_time_minutes,uid,name,address,bike,bike_racks,bikes,booked_bikes,free_racks,free_special_racks,maintenance,number,place_type,rack_locks,special_racks,spot,terminal_type,city,country_code,company,timezone,available_bikes
0,50.965871,6.958885,2020-01-01 00:00:02.250239,2020-01-02 08:33:01.862366,1953.0,26502959,BIKE 22174,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2620
1,50.967846,6.957729,2020-01-02 11:04:02.156789,2020-01-02 11:04:02.156789,0.0,26553268,BIKE 22174,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2614
2,50.966346,6.955552,2020-01-02 11:05:01.959077,2020-01-02 14:37:02.217892,212.0,26554933,BIKE 22174,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2612
3,50.962111,6.953751,2020-01-02 14:46:02.261764,2020-01-02 14:53:02.574806,7.0,26559553,BIKE 22174,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2596
4,50.962755,6.954006,2020-01-02 14:54:02.736863,2020-01-02 16:38:02.237953,104.0,26559847,BIKE 22174,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65174,50.956329,6.961367,2020-01-29 16:46:03.386454,2020-01-29 19:06:02.448191,140.0,27393224,BIKE 22189,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2620
65175,50.956544,6.960522,2020-01-29 20:49:02.999793,2020-01-30 19:05:02.312216,1336.0,27399233,BIKE 22189,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2611
65176,50.954342,6.963644,2020-01-30 19:21:03.153910,2020-01-31 15:40:02.609916,1219.0,27422488,BIKE 22189,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2619
65177,50.947149,6.922682,2020-01-31 15:57:02.645204,2020-01-31 16:52:03.247628,55.0,27443473,BIKE 22189,,True,0,1,0,0,0,False,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,2612


In [21]:
# Write the merged frame to a ';'-separated CSV; index=False leaves the row
# index out of the file.
new_df.to_csv('processed_data_nextbike_2020-01cc.csv', sep=';', index=False) 

### Checking the data

In [None]:
# Spot check: every original row recorded for one uid.
single_uid = df.loc[df['uid'] == 26062537]
single_uid

In [None]:
# Spot check: the merged row(s) for the same uid.
single_uid = new_df.loc[new_df['uid'] == 26062537]
single_uid

In [None]:
# Display the merged frame again for visual inspection.
new_df

In [None]:
# uids of the merged frame, then their distinct values.
# The distinct list must be built from uidsToList2 (merged frame). It was built
# from uidsToList (original frame), so the check never looked at new_df.
uidsToList2 = new_df['uid'].tolist()
uidsList2 = list(set(uidsToList2))

In [None]:
# Number of distinct uids collected in uidsList2.
len(uidsList2)

In [None]:
# uidsList2 was itself built from a set, so this always equals len(uidsList2).
len(set(uidsList2))

In [None]:
# Frequency of each merged standing time; dropna=False also counts missing values.
new_df['standing_time_minutes'].value_counts(dropna=False)

In [None]:
# Spot check: every original row recorded for another uid.
quid = df.loc[df['uid'] == 25690126]
quid