In [1]:
import pandas as pd
from itertools import islice

In [2]:
# Read the semicolon-separated distances table for January 2020 into df.
url = "/bigdata/jelicicna/output_datasets/distances_table_nextbike_2020-01.csv.gz"
df = pd.read_csv(
    url,
    sep=';',
    low_memory=False,
)

In [3]:
# Show every column when a DataFrame is rendered (None removes the limit).
# Use the exact registered option name: the previous key with a trailing "()"
# only worked because pandas falls back to regex matching of option names.
pd.set_option('display.max_columns', None)

#### Function for restricting distances to values below 2500 meters, in order to work with a subset of the data that has no extreme values

In [4]:
def drop_duplicates(l):
    '''Return the items of ``l`` as a list with repeats removed.

    Only the first occurrence of each item is kept, and the items keep the
    order in which they first appear. Items must be hashable.
    '''
    # dict keys are unique and remember insertion order, so building a dict
    # from the items keeps exactly the first occurrence of each one.
    return list(dict.fromkeys(l))

In [5]:
def short_list(time, max_distance=2500):
    '''Return the values stored under ``sr[time]`` that are below ``max_distance``.

    Parameters
    ----------
    time
        Key used to look up a collection of distances in the module-level
        object ``sr``.
    max_distance : number, default 2500
        Exclusive upper bound; only values strictly smaller than this are kept.
        Presumably meters, like the notebook's distance column - TODO confirm.

    Returns
    -------
    list
        The qualifying values in their original order.

    Notes
    -----
    ``sr`` is not defined in any visible cell of this notebook; it must exist
    in the kernel before this function is called, otherwise a NameError is
    raised.
    '''
    return [x for x in sr[time] if x < max_distance]

In [6]:
# Remove every column listed below, including the leftover 'index' column,
# in a single pass; df is rebound to the reduced frame.
unused_columns = [
    'bike', 'bike_racks', 'bikes', 'booked_bikes', 'free_racks',
    'free_special_racks', 'maintenance', 'number', 'place_type',
    'rack_locks', 'special_racks', 'spot', 'terminal_type', 'city',
    'country_code', 'company', 'timezone',
    'index',
]
df = df.drop(columns=unused_columns)

In [7]:
# Preview the first rows of df.
df.head()

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time,distance,uid_begin,uid_end,standing_before_ride,address,name,available_bikes
0,50.932058,6.988541,50.936326,6.986125,2020-01-01 02:40:01,2020-01-01 02:46:02,6.0,504.0,26503191,26523894,160.0,,BIKE 22739,2620
1,50.936326,6.986125,50.932037,6.988254,2020-01-01 06:25:02,2020-01-01 06:30:03,5.0,500.0,26523894,26528020,219.0,,BIKE 22739,2613
2,50.932037,6.988254,50.933224,6.982801,2020-01-01 12:56:02,2020-01-01 13:03:01,7.0,404.0,26528020,26534301,386.0,,BIKE 22739,2622
3,50.933224,6.982801,50.933625,6.981179,2020-01-01 15:27:02,2020-01-01 15:31:02,4.0,122.0,26534301,26537369,144.0,,BIKE 22739,2616
4,50.933625,6.981179,50.933253,6.982808,2020-01-01 22:52:02,2020-01-01 22:55:02,3.0,121.0,26537369,26544916,441.0,,BIKE 22739,2604


In [8]:
# Row labels whose ride_time is below zero; the bare name on the last line
# displays the list as the cell output.
has_negative_ride_time = df['ride_time'] < 0
negative_values = df.index[has_negative_ride_time].tolist()
negative_values

[]

In [9]:
# Remove the rows with a negative ride_time from df in place.
# When negative_values is empty this leaves df unchanged.
df.drop(index=negative_values, inplace=True)

In [10]:
# Labels of every row that has a missing value in at least one column.
# Computed with a vectorized mask (the same form used for negative_values)
# instead of a Python-level iterrows loop, which is far slower on large frames.
rows_with_nan = df.index[df.isnull().any(axis=1)].tolist()

In [11]:
# df_adr: the rows of df that have no missing value in any column
# (df itself is left untouched).
df_adr = df.drop(index=rows_with_nan)

In [12]:
# Preview the first rows that have no missing value in any column.
df_adr.head()

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time,distance,uid_begin,uid_end,standing_before_ride,address,name,available_bikes
264,50.919418,6.959776,50.944125,6.959246,2020-01-14 17:33:02,2020-01-14 18:10:03,37.0,2748.0,26937609,26942141,18.0,Bonner Str. 28,BIKE 21579,2480
378,50.918073,6.960369,50.915496,6.970114,2020-01-30 13:00:02,2020-01-30 13:29:02,29.0,741.0,27409728,27413034,102.0,Bonner Str. 45,BIKE 22396,2611
411,50.966313,7.015538,50.965844,7.015161,2020-01-06 16:36:02,2020-01-08 09:41:01,2465.0,59.0,26659638,26722710,249.0,"Schanzenstraße 26, Köln",BIKE 22040,2570
486,50.942163,6.957415,50.943219,6.953894,2020-01-13 06:32:02,2020-01-13 06:38:02,6.0,273.0,26878086,26878795,33.0,Köln - Trankgasse 11,BIKE 21822,2558
558,50.941769,6.957032,50.93679,6.947512,2020-01-15 07:27:02,2020-01-15 07:59:01,32.0,867.0,26939877,26956992,78.0,trankgasse,BIKE 21570,2521


In [13]:
# Row labels of the complete (NaN-free) rows, as a plain Python list.
address_list = df_adr.index.tolist()

In [14]:
# Labels of the rows directly before and directly after each row in
# address_list. A neighbour is kept only if that label actually exists in df:
# i - 1 is -1 for the first row, i + 1 runs past the end for the last row, and
# rows dropped earlier leave gaps. Selecting a missing label with .loc later
# would raise a KeyError (or produce all-NaN rows on old pandas versions).
previous_address_list = [(i - 1) for i in address_list if (i - 1) in df.index]
following_address_list = [(i + 1) for i in address_list if (i + 1) in df.index]

In [15]:
# All address rows plus their neighbours: ascending order, each label once.
all_candidate_labels = previous_address_list + address_list + following_address_list
overview_list = drop_duplicates(sorted(all_candidate_labels))

In [16]:
# Select the rows labelled in overview_list (all columns), in that order.
df_suspicious_indexes = df.loc[overview_list, :]

In [18]:
# Display the rows selected via overview_list; as the last expression of the
# cell, the frame is rendered as the cell output.
df_suspicious_indexes

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time,distance,uid_begin,uid_end,standing_before_ride,address,name,available_bikes
263,50.924489,6.957470,50.919418,6.959776,2020-01-14 17:02:02,2020-01-14 17:15:02,13.0,587.0,26928080,26937609,113.0,,BIKE 21579,2510
264,50.919418,6.959776,50.944125,6.959246,2020-01-14 17:33:02,2020-01-14 18:10:03,37.0,2748.0,26937609,26942141,18.0,Bonner Str. 28,BIKE 21579,2480
265,50.944125,6.959246,50.945099,6.946622,2020-01-14 23:38:03,2020-01-15 04:04:02,266.0,891.0,26942141,26950724,328.0,,BIKE 21579,2455
377,50.917671,6.960294,50.918073,6.960369,2020-01-28 10:49:02,2020-01-30 11:18:02,2909.0,45.0,27360304,27409728,7.0,,BIKE 22396,2608
378,50.918073,6.960369,50.915496,6.970114,2020-01-30 13:00:02,2020-01-30 13:29:02,29.0,741.0,27409728,27413034,102.0,Bonner Str. 45,BIKE 22396,2611
379,50.915496,6.970114,50.940738,6.961923,2020-01-30 17:02:02,2020-01-30 17:29:02,27.0,2866.0,27413034,27419636,213.0,,BIKE 22396,2610
410,50.963471,7.006077,50.966313,7.015538,2020-01-06 09:36:01,2020-01-06 12:27:02,171.0,734.0,26654521,26659638,212.0,,BIKE 22040,2625
411,50.966313,7.015538,50.965844,7.015161,2020-01-06 16:36:02,2020-01-08 09:41:01,2465.0,59.0,26659638,26722710,249.0,"Schanzenstraße 26, Köln",BIKE 22040,2570
412,50.965844,7.015161,50.965935,7.015219,2020-01-08 12:44:01,2020-01-08 12:51:02,7.0,11.0,26722710,26727431,183.0,,BIKE 22040,2541
485,50.928035,6.918631,50.942163,6.957415,2020-01-13 05:31:02,2020-01-13 05:59:02,28.0,3140.0,26869058,26878086,606.0,,BIKE 21822,2557


In [None]:
#with open('mytable.tex','w') as tf:
 #   tf.write(df.head().to_latex())

In [19]:
# Flag pairs of consecutive rows in df_suspicious_indexes that report exactly
# the same distance; both labels of each pair are appended to
# suspicious_return_trips (a label can appear twice if three rows in a row
# share one distance).
#
# The labels are taken from the frame itself rather than from overview_list,
# so the two can never get out of step, and an empty frame simply produces an
# empty result instead of an IndexError.
#
# NOTE(review): rows that are neighbours in df_suspicious_indexes are not
# necessarily neighbours in df (nor rides of the same bike), so unrelated rides
# with an equal distance are flagged too - confirm whether consecutive labels
# or a matching 'name' should also be required.
row_labels = df_suspicious_indexes.index.tolist()
row_distances = df_suspicious_indexes['distance'].tolist()
labelled_distances = list(zip(row_labels, row_distances))

suspicious_return_trips = []
for (previous_index, previous_distance), (index, distance) in zip(
        labelled_distances, labelled_distances[1:]):
    if distance == previous_distance:
        suspicious_return_trips.extend([previous_index, index])

In [20]:
# Show the row labels flagged as suspicious return trips.
suspicious_return_trips

[1703, 1704, 4637, 4638, 5313, 5314, 8225, 8226, 27789, 27790, 37306, 37307]

In [21]:
# Inspect one flagged pair by its hard-coded labels; label-based slicing with
# .loc includes both end labels, so this shows rows 1703 and 1704.
df.loc[1703:1704]

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time,distance,uid_begin,uid_end,standing_before_ride,address,name,available_bikes
1703,50.951432,6.919364,50.926444,6.958002,2020-01-14 15:20:02,2020-01-15 07:29:02,969.0,3881.0,26903584,26956825,952.0,Bahnhof Ehrenfeld,BIKE 22716,2523
1704,50.926444,6.958002,50.951432,6.919364,2020-01-16 09:40:03,2020-01-16 09:53:03,13.0,3881.0,26956825,27000341,1571.0,,BIKE 22716,2523


In [46]:
# Labels of the rows whose distance exceeds the outlier threshold.
# Computed with a vectorized mask (the same form used for negative_values)
# instead of a Python-level iterrows loop.
DISTANCE_OUTLIER_THRESHOLD = 20000  # presumably meters, like 'distance' - TODO confirm
suspicious_distances = df.index[df['distance'] > DISTANCE_OUTLIER_THRESHOLD].tolist()

In [47]:
# Show the row labels whose distance is above 20000.
suspicious_distances

[131, 132, 15897, 31582, 34459, 34460, 36435, 36436]

In [50]:
# Distinct row labels flagged by either check; the last line displays how many.
suspicious_indexes = set(suspicious_return_trips) | set(suspicious_distances)
len(suspicious_indexes)

780

In [51]:
# Number of distinct row labels flagged as suspicious (repeats the count that
# the cell building suspicious_indexes already displays).
# NOTE(review): the saved outputs of the two source lists show 12 and 8 labels,
# which cannot add up to the saved count of 780 - the cells appear to have been
# run out of order or on different data; re-run from a fresh kernel.
len(suspicious_indexes)

780

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29555 entries, 0 to 29554
Data columns (total 14 columns):
lat_begin               29555 non-null float64
lng_begin               29555 non-null float64
lat_end                 29555 non-null float64
lng_end                 29555 non-null float64
ride_begin              29555 non-null object
ride_end                29555 non-null object
ride_time               29555 non-null float64
distance                29555 non-null float64
uid_begin               29555 non-null int64
uid_end                 29555 non-null int64
standing_before_ride    29555 non-null float64
address                 229 non-null object
name                    29555 non-null object
available_bikes         29555 non-null int64
dtypes: float64(7), int64(3), object(4)
memory usage: 3.2+ MB


In [25]:
# df_no: df without the suspicious rows, renumbered 0..n-1.
# reset_index(drop=True) discards the old labels directly, so no temporary
# 'index' column is created and a genuine data column called 'index' could
# never be removed by accident. The set is turned into a list because drop
# expects a list-like of labels.
df_no = df.drop(index=list(suspicious_indexes)).reset_index(drop=True)

In [26]:
# Print a summary of the cleaned frame: row count, column dtypes and
# non-null counts.
df_no.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28984 entries, 0 to 28983
Data columns (total 14 columns):
lat_begin               28984 non-null float64
lng_begin               28984 non-null float64
lat_end                 28984 non-null float64
lng_end                 28984 non-null float64
ride_begin              28984 non-null object
ride_end                28984 non-null object
ride_time               28984 non-null float64
distance                28984 non-null float64
uid_begin               28984 non-null int64
uid_end                 28984 non-null int64
standing_before_ride    28984 non-null float64
address                 182 non-null object
name                    28984 non-null object
available_bikes         28984 non-null int64
dtypes: float64(7), int64(3), object(4)
memory usage: 3.1+ MB


In [27]:
# Write the cleaned table as a semicolon-separated CSV without the row index.
# NOTE(review): the file name says "anlysis" (probably "analysis") and
# "2021-01", while the input file read at the top is for 2020-01 - confirm the
# intended name before changing it, since other code may read this exact path.
df_no.to_csv('/bigdata/jelicicna/output_datasets/anlysis_data_nextbike_2021-01.csv', sep=';', index=False) 

In [None]:
# FIXME: unfinished filter. df_no[""] looks up a column whose name is the
# empty string; df_no.info() lists no such column, so this cell raises a
# KeyError when run. Fill in the intended column/condition or delete the cell.
df_no[df_no[""]]