In [1]:
import pandas as pd
import math as mt
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statistics
from itertools import islice

In [2]:
from pylab import rcParams
rcParams['figure.figsize'] = 16, 9

In [3]:
url = "distance_and_ride_time_data_nextbike_2020-06cc.csv"
url_pr = "processed_data_nextbike_2020-06cc.csv"
df = pd.read_csv(url, sep=';', low_memory=False)
df_pr = pd.read_csv(url, sep=';', low_memory=False)

In [4]:
pd.set_option('display.max_rows()', None)

#### Function for limiting possible lengths of distances to 2500meters in order to work with one part of data without extreme values

In [5]:
def drop_duplicates(l):
    '''Droping duplicates from a list while maintaining order'''
    seen = set()
    seen_add = seen.add
    return [x for x in l if not (x in seen or seen_add(x))]

In [6]:
def short_list(time):
    return [x for x in sr[time] if x < 2500]

In [7]:
df = df.drop(['bike', 'bike_racks', 'bikes', 'booked_bikes', 'free_racks', 'free_special_racks', 'maintenance', 'number', 
              'place_type', 'rack_locks', 'special_racks', 'spot', 'terminal_type', 'city',
              'country_code', 'company', 'timezone', 'available_bikes'], axis=1)

In [8]:
df.size

859032

In [9]:
df_pr.size

2147580

In [10]:
rows_with_nan = []
for index, row in df.iterrows():
    is_nan_series = row.isnull()
    if is_nan_series.any():
        rows_with_nan.append(index)

In [11]:
df_adr = df.drop(rows_with_nan, axis=0)

In [12]:
df_adr.head()

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,address,name
43,50.962633,7.003422,50.982984,6.943306,2020-06-23 15:16:01.469897,2020-06-23 15:51:02.883198,35.0,4780.0,31412495,31428321,Buchheimer Str. 53,BIKE 21261
384,50.937337,6.982906,50.947377,6.94515,2020-06-04 10:20:02.015185,2020-06-04 10:32:02.176659,12.0,2872.0,30610825,30611125,"Willy-Brandt-Platz 3, 50679 Köln",BIKE 21142
389,51.987538,5.891279,50.949319,6.95241,2020-06-10 17:26:02.325793,2020-06-10 17:27:01.832579,1.0,136898.0,30844480,30844589,Schilderstr,BIKE 21142
390,50.949319,6.95241,50.930628,6.940701,2020-06-10 19:16:01.961847,2020-06-10 19:42:01.696540,26.0,2235.0,30844589,30852169,Schildergasse,BIKE 21142
411,42.841646,-82.880387,50.937619,7.006554,2020-06-19 10:48:01.493559,2020-06-19 10:49:02.138013,1.0,6459100.0,31243469,31243504,Hollwegstraße,BIKE 21142


In [13]:
address_list = df_adr.index.to_list()

In [14]:
previous_address_list = [(i-1) for i in address_list]
following_address_list = [(i+1) for i in address_list]

In [15]:
overview_list = previous_address_list + address_list + following_address_list
overview_list.sort()
overview_list = drop_duplicates(overview_list)

In [16]:
df_suspicious_indexes = df.loc[overview_list]

In [17]:
df_suspicious_indexes.head()

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,address,name
42,50.985846,6.945063,50.962633,7.003422,2020-06-23 08:18:01.578288,2020-06-23 08:46:01.778788,28.0,4835.0,31294157,31412495,,BIKE 21261
43,50.962633,7.003422,50.982984,6.943306,2020-06-23 15:16:01.469897,2020-06-23 15:51:02.883198,35.0,4780.0,31412495,31428321,Buchheimer Str. 53,BIKE 21261
44,50.982984,6.943306,50.990532,6.942997,2020-06-23 17:33:02.078141,2020-06-23 17:40:01.829675,7.0,840.0,31428321,31435730,,BIKE 21261
383,50.939328,7.003626,50.937337,6.982906,2020-06-04 10:16:01.961973,2020-06-04 10:18:02.005541,2.0,1469.0,30552315,30610825,,BIKE 21142
384,50.937337,6.982906,50.947377,6.94515,2020-06-04 10:20:02.015185,2020-06-04 10:32:02.176659,12.0,2872.0,30610825,30611125,"Willy-Brandt-Platz 3, 50679 Köln",BIKE 21142


In [18]:
previous_index = overview_list[0]
previous_distance = df_suspicious_indexes.iloc[0]['distance']
suspicious_return_trips = []
for index,row in islice(df_suspicious_indexes.iterrows(), 1, None):
    if (row['distance'] == previous_distance):
        suspicious_return_trips.append(previous_index)
        suspicious_return_trips.append(index)
        
    previous_index = index
    previous_distance = row['distance']

In [19]:
suspicious_return_trips

[388, 389, 56136, 56137]

In [20]:
suspicious_distances = []
for index,row in df.iterrows():
    if (row['distance'] > 20000):
        suspicious_distances.append(index)

In [21]:
suspicious_distances

[388,
 389,
 410,
 411,
 10694,
 10695,
 17436,
 17437,
 28469,
 28470,
 29148,
 49220,
 49221,
 56852,
 60949,
 60950,
 65843,
 71254]

In [None]:
#suspicious_times = []
#for index, row in df.iterrows():
    #if row['ride_time_minutes'] > 2880:
        #suspicious_times.append(index)

In [None]:
suspicious_times #remains to be investigated

In [22]:
df.loc[71254]

lat_begin                               50.9192
lng_begin                               6.94059
lat_end                                 24.8874
lng_end                                 85.6735
ride_begin           2020-06-08 10:36:01.722689
ride_end             2020-06-15 18:13:02.131704
ride_time_minutes                         10537
distance                            7.11842e+06
uid_begin                              30734840
uid_end                                31077661
address                                     NaN
name                                 BIKE 22829
Name: 71254, dtype: object

In [23]:
single_bike1 = df[df['name']== 'BIKE 22829' ] #the bike that appears in India
single_bike1

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,address,name
71237,50.930883,6.982423,50.936483,6.973939,2020-06-02 19:52:01.597082,2020-06-02 20:08:02.937887,16.0,861.0,30547506,30549344,,BIKE 22829
71238,50.936483,6.973939,50.93156,6.995322,2020-06-02 21:12:01.520129,2020-06-02 21:39:02.379746,27.0,1596.0,30549344,30552884,,BIKE 22829
71239,50.93156,6.995322,50.918304,6.978265,2020-06-02 22:18:01.870950,2020-06-02 22:36:03.357806,18.0,1898.0,30552884,30555231,,BIKE 22829
71240,50.918304,6.978265,50.921531,6.932361,2020-06-02 23:07:01.481846,2020-06-03 11:21:02.256313,734.0,3239.0,30555231,30566924,,BIKE 22829
71241,50.921531,6.932361,50.931155,6.933808,2020-06-03 13:11:01.594453,2020-06-03 13:57:01.450742,46.0,1075.0,30566924,30572830,,BIKE 22829
71242,50.931155,6.933808,50.933774,6.927543,2020-06-03 16:03:01.624929,2020-06-03 16:19:02.404938,16.0,527.0,30572830,30579261,,BIKE 22829
71243,50.933774,6.927543,50.95543,6.907188,2020-06-03 16:56:01.602248,2020-06-03 19:56:02.608255,180.0,2799.0,30579261,30590948,,BIKE 22829
71244,50.95543,6.907188,50.91703,6.928348,2020-06-03 21:52:01.892672,2020-06-04 22:32:01.932721,1480.0,4521.0,30590948,30603587,,BIKE 22829
71245,50.91703,6.928348,50.934909,6.9499,2020-06-05 08:51:02.546428,2020-06-05 09:39:02.252019,48.0,2498.0,30603587,30642905,,BIKE 22829
71246,50.934909,6.9499,50.926665,6.936464,2020-06-05 12:55:02.038601,2020-06-05 15:14:01.662169,139.0,1315.0,30642905,30651055,,BIKE 22829


In [26]:
single_bike2 = df[df['name']== 'BIKE 22447' ] #the bike that spent days in Bonn
single_bike2

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,address,name
29151,50.957318,6.994188,50.957516,7.011543,2020-06-17 13:59:01.705480,2020-06-17 14:16:02.180207,17.0,1216.0,31148628,31159806,,BIKE 22447
29152,50.957516,7.011543,50.945297,7.004668,2020-06-17 15:13:02.241147,2020-06-17 17:20:01.750001,127.0,1442.0,31159806,31169450,,BIKE 22447
29153,50.945297,7.004668,50.943636,7.002369,2020-06-17 18:22:02.642355,2020-06-17 18:48:02.413702,26.0,245.0,31169450,31175407,,BIKE 22447
29154,50.943636,7.002369,50.944949,7.002819,2020-06-17 18:52:02.429497,2020-06-17 19:04:01.922568,12.0,149.0,31175407,31176425,,BIKE 22447
29155,50.944949,7.002819,50.955271,7.006061,2020-06-17 19:53:02.101423,2020-06-17 20:06:02.164899,13.0,1170.0,31176425,31179859,,BIKE 22447
29156,50.955271,7.006061,50.944826,7.002802,2020-06-17 23:14:02.364191,2020-06-18 03:53:02.247941,279.0,1184.0,31179859,31187527,,BIKE 22447
29157,50.944826,7.002802,50.935495,6.987459,2020-06-18 08:20:02.362989,2020-06-18 09:06:02.839341,46.0,1494.0,31187527,31196351,,BIKE 22447
29158,50.935495,6.987459,50.944814,7.003258,2020-06-18 13:42:02.048855,2020-06-18 13:53:02.091743,11.0,1517.0,31196351,31207032,,BIKE 22447
29159,50.944814,7.003258,50.938151,6.997724,2020-06-18 15:05:01.561201,2020-06-18 15:12:01.957368,7.0,836.0,31207032,31210504,,BIKE 22447
29160,50.938151,6.997724,50.945071,7.005121,2020-06-18 15:27:01.952365,2020-06-18 15:36:01.918165,9.0,928.0,31210504,31211601,,BIKE 22447


In [None]:
#single_bike3 = df_no_adr[df_no_adr['name']== 'BIKE 22385' ]
#single_bike3

In [27]:
no_duplicates = set(suspicious_distances) - set(suspicious_return_trips)

In [29]:
suspicious_indexes = suspicious_return_trips + list(no_duplicates)
suspicious_indexes

[388,
 389,
 56136,
 56137,
 49220,
 49221,
 10694,
 10695,
 17436,
 65843,
 56852,
 28469,
 28470,
 60949,
 60950,
 71254,
 410,
 411,
 29148,
 17437]

In [None]:
#This is to be used only if we want to include the data about the bike that spent days out of Cologne in the list for droping
#out_of_cologne = set(single_bike2.index)
#list_for_dropping = set(suspicious_indexes) - out_of_cologne
#list_for_dropping = suspicious_indexes + list(list_for_dropping)

In [30]:
df_no = df_pr.drop(suspicious_indexes, axis=0)
df_no.reset_index(inplace=True)
df_no.drop('index', axis=1, inplace=True)

In [31]:
df_no.head(50)

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time_minutes,distance,uid_begin,uid_end,...,place_type,rack_locks,special_racks,spot,terminal_type,city,country_code,company,timezone,available_bikes
0,50.96738,6.913591,50.968653,6.909912,2020-06-01 11:23:02.135730,2020-06-01 11:52:01.971278,29.0,294.0,30468631,30480098,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1058
1,50.968653,6.909912,50.972235,6.924658,2020-06-05 17:07:01.701984,2020-06-05 19:36:02.939119,149.0,1107.0,30480098,30654954,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1079
2,50.972235,6.924658,50.954284,6.919008,2020-06-05 21:19:02.341096,2020-06-05 23:04:02.169224,105.0,2035.0,30654954,30665132,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1089
3,50.954284,6.919008,50.955787,6.914857,2020-06-05 23:30:02.456473,2020-06-06 04:53:01.679870,323.0,335.0,30665132,30669192,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1079
4,50.955787,6.914857,50.932989,6.92249,2020-06-06 09:38:02.380791,2020-06-06 09:58:01.499404,20.0,2592.0,30669192,30673723,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1094
5,50.932989,6.92249,50.936811,6.922821,2020-06-06 14:32:02.513259,2020-06-06 15:13:01.452866,41.0,426.0,30673723,30683067,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1098
6,50.936811,6.922821,50.936704,6.963254,2020-06-06 16:22:02.042554,2020-06-06 17:20:02.849924,58.0,2834.0,30683067,30690361,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1053
7,50.936704,6.963254,50.935375,6.956512,2020-06-06 17:25:01.986203,2020-06-06 17:31:02.264822,6.0,495.0,30690361,30691326,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1062
8,50.935375,6.956512,50.937103,6.96217,2020-06-06 17:37:02.155707,2020-06-06 17:44:01.614297,7.0,441.0,30691326,30691955,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1061
9,50.937103,6.96217,50.90601,6.956364,2020-06-06 19:57:01.447900,2020-06-06 20:29:01.743729,32.0,3482.0,30691955,30699364,...,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin,1059


In [None]:
df_no.to_csv('clust_data_nextbike_2020-06cc.csv', sep=';', index=False) 