In [1]:
import pandas as pd
from itertools import islice
import seaborn as sns
import matplotlib

In [2]:
# Load the gzip-compressed, ';'-separated distances table into `df`.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
url = "/bigdata/jelicicna/output_datasets/distances_table_nextbike_2021-03.csv.gz"
# Disabled alternative input. NOTE(review): the disabled read_csv below passes `url`, not `url_pr`.
#url_pr = "processed_data_nextbike_2020-06cc.csv"
df = pd.read_csv(url, sep=';', low_memory=False)
#df_pr = pd.read_csv(url, sep=';', low_memory=False)

In [3]:
# Global plot defaults: 16x12 figure size, then the 'ggplot' style sheet.
matplotlib.rcParams.update({'figure.figsize': (16.0, 12.0)})
matplotlib.style.use('ggplot')

In [4]:
# Show every column when displaying wide frames (None removes the limit).
# The option key is 'display.max_columns'; the former 'display.max_columns()'
# only worked because pandas falls back to treating the key as a regex pattern.
pd.set_option('display.max_columns', None)

#### Function for limiting distances to less than 2500 meters, so that a subset of the data can be analysed without extreme values

In [5]:
def drop_duplicates(l):
    '''Return the items of ``l`` without repeats, keeping first-occurrence order.'''
    unique_items = []
    already_seen = set()
    for item in l:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

In [6]:
def short_list(time, source=None, max_distance=2500):
    '''Return the values stored under ``time`` that are below ``max_distance``.

    Parameters
    ----------
    time : hashable
        Key used to look up the values in ``source``.
    source : indexable container, optional
        Object supporting ``source[time]``. When omitted, the notebook-level
        global ``sr`` is used, which is the original behaviour.
    max_distance : number, optional
        Exclusive upper bound for kept values; defaults to 2500 as before.

    Returns
    -------
    list
        Every ``x`` in ``source[time]`` with ``x < max_distance``, in order.
    '''
    if source is None:
        # NOTE(review): `sr` is not defined in any visible cell; calling without
        # `source` raises NameError unless `sr` exists in the kernel.
        source = sr
    return [x for x in source[time] if x < max_distance]

In [7]:
# Remove the columns that the remaining cells never read, together with the
# 'index' column, in a single pass.
df = df.drop(columns=[
    'bike', 'bike_racks', 'bikes', 'booked_bikes', 'free_racks',
    'free_special_racks', 'maintenance', 'number', 'place_type',
    'rack_locks', 'special_racks', 'spot', 'terminal_type', 'city',
    'country_code', 'company', 'timezone',
    'index',
])

In [8]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,lat_begin,lng_begin,lat_end,lng_end,ride_begin,ride_end,ride_time,distance,uid_begin,uid_end,standing_before_ride,address,name,available_bikes
0,50.906261,6.944233,50.930842,6.900563,2021-03-01 15:41:01,2021-03-01 16:27:02,46.0,4105.0,41946174,41964922,915.0,,BIKE 22414,679
1,50.930842,6.900563,50.928876,6.900142,2021-03-01 16:34:01,2021-03-01 16:35:02,1.0,221.0,41964922,41966407,7.0,,BIKE 22414,601
2,50.928876,6.900142,50.937211,6.900554,2021-03-01 17:54:02,2021-03-01 18:08:01,14.0,927.0,41966407,41970488,79.0,,BIKE 22414,600
3,50.937211,6.900554,50.944225,6.91116,2021-03-01 18:28:01,2021-03-01 18:46:01,18.0,1078.0,41970488,41972082,20.0,,BIKE 22414,595
4,50.944225,6.91116,50.906063,6.944202,2021-03-01 22:14:02,2021-03-01 23:12:02,58.0,4836.0,41972082,41977369,208.0,,BIKE 22414,606


In [9]:
# Index labels of rides with a negative ride_time (sanity check).
negative_values = df.index[df['ride_time'] < 0].tolist()
negative_values

[]

In [10]:
# Run this only if the previous cell found rides with a negative ride_time that need to be dropped.
#df.drop(negative_values, inplace=True)

In [None]:
# Index labels of rides whose ride_time is below 2 or at least 700.
# A boolean mask replaces the row-by-row iterrows loop; rows with a missing
# ride_time compare False on both sides and are therefore not flagged,
# exactly as before.
out_rides = df.index[(df['ride_time'] < 2) | (df['ride_time'] >= 700)].tolist()

In [None]:
# Index labels of rides (over the whole frame) whose distance is below 200
# or at least 10200. A boolean mask replaces the row-by-row iterrows loop;
# missing distances are not flagged, as before.
out_distances_total = df.index[(df['distance'] < 200) | (df['distance'] >= 10200)].tolist()

In [None]:
# Number of rides before any filtering.
len(df)

In [None]:
# Number of rides flagged by the distance filter over the whole frame.
len(out_distances_total)

In [None]:
# Number of rides flagged by the ride_time filter.
len(out_rides)

In [None]:
# Count of distinct rides flagged by either filter (note: an int, not a set).
out_set = len(set(out_rides).union(out_distances_total))
out_set

In [None]:
# Function to calculate the intersection of two lists
def intersection(lst1, lst2):
    '''Return the elements of ``lst1`` that also occur in ``lst2``.

    Order and repeats of ``lst1`` are preserved. Membership is tested against
    a set built from ``lst2`` so the cost is linear rather than quadratic;
    if any element is unhashable, the original list scan is used instead.
    '''
    try:
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable elements cannot live in a set: fall back to the list scan.
        return [value for value in lst1 if value in lst2]

In [None]:
# Rides flagged by both the ride_time filter and the distance filter.
intersect = intersection(out_rides, out_distances_total)
len(intersect)

In [None]:
# Frame without the rides flagged by the ride_time filter.
df_limit = df.drop(index=out_rides)

In [None]:
# Frequency of each distance value after the ride_time filter.
df_limit['distance'].value_counts()

In [None]:
# Index labels of the remaining rides (df_limit) whose distance is below 200
# or at least 10200. A boolean mask replaces the row-by-row iterrows loop;
# missing distances are not flagged, as before.
out_distances = df_limit.index[
    (df_limit['distance'] < 200) | (df_limit['distance'] >= 10200)
].tolist()

In [None]:
# Number of rides flagged by the distance filter after the ride_time filter.
len(out_distances)

In [None]:
# Total number of rides removed by the two filters.
len(out_distances) + len(out_rides)

In [None]:
# Share of the original rides removed by the two filters, as a percentage.
quotient = (len(out_distances) + len(out_rides)) / len(df)
percent = quotient * 100
print(percent)

In [None]:
# Cleaned frame: drop the distance outliers, then renumber the rows. The
# previous row labels are kept in a new 'index' column.
df_c = df_limit.drop(index=out_distances).reset_index()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df_c.info()

In [None]:
# Frequency of each distance value in the cleaned frame.
df_c['distance'].value_counts()

In [None]:
# Frequency of each ride_time value in the cleaned frame.
df_c['ride_time'].value_counts()

In [None]:
# Row labels of cleaned rides with a distance below 300. A boolean mask
# replaces the row-by-row iterrows loop. df_c was renumbered with
# reset_index, so these labels can also be used as positions.
ind = df_c.index[df_c['distance'] < 300].tolist()

In [None]:
# Show the collected row numbers. NOTE(review): this prints the whole list; consider len(ind) or ind[:10].
ind

In [None]:
# Inspect the rides with a distance below 300 (positional lookup).
df_c.iloc[ind]

In [None]:
# Distribution of ride_time in the cleaned frame, 140 bins.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of histplot/displot;
# confirm the installed version before upgrading.
sns.set_style('white')
sns.set_context("paper", font_scale = 1.5)
sns.distplot(df_c['ride_time'], bins = 140)

In [None]:
# Distribution of distance in the cleaned frame, 140 bins.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of histplot/displot;
# confirm the installed version before upgrading.
sns.set_style('white')
sns.set_context("paper", font_scale = 1.5)
sns.distplot(df_c['distance'], bins = 140)

In [None]:
# Row labels of cleaned rides that have a missing value in any column.
# isnull().any(axis=1) evaluates every row at once instead of building a
# Series per row inside an iterrows loop.
rows_with_nan = df_c.index[df_c.isnull().any(axis=1)].tolist()

In [None]:
# Copy of the cleaned frame restricted to rows without any missing value.
df_adr = df_c.drop(index=rows_with_nan)

In [None]:
# Display the rows without missing values. NOTE(review): this renders the whole frame; consider df_adr.head().
df_adr

In [None]:
# Flag pairs of consecutive rows that have exactly the same recorded distance.
# For every such pair both row positions are stored (earlier row first), so a
# run of three equal rows lists the middle row twice, as the previous loop did.
# Unlike the previous loop, this does not fail on an empty frame and does not
# assume the first row is labelled 0; positions are stored because the result
# is consumed with df_c.iloc.
distances = df_c['distance']
# shift() aligns each row with its predecessor; the first row and missing
# values never compare equal.
same_as_previous = distances.eq(distances.shift()).tolist()
suspicious_return_trips = [
    position
    for current in range(1, len(same_as_previous)) if same_as_previous[current]
    for position in (current - 1, current)
]

In [None]:
# Inspect the consecutive rides that share the same recorded distance.
df_c.iloc[suspicious_return_trips]

In [None]:
def cal_distance_in_meters(speed, time):
    '''Return ``speed`` times ``time / 60`` (rounded to one decimal) times 1000.

    With ``speed`` in km/h and ``time`` in minutes this is a distance in meters.
    NOTE(review): the hours are rounded to one decimal before multiplying, so
    the result moves in steps of 0.1 h; confirm that this coarseness is intended.
    '''
    hours = round(time / 60, 1)
    return speed * hours * 1000

In [None]:
def cal_dis(speed, time):
    '''Print both inputs and return ``speed * time`` (km for km/h and hours).'''
    print(" Time(hr) :", time)
    print(" Speed(km / hr) :", speed)
    distance = speed * time
    return distance


# Worked example: 20 km/h for 0.3 h.
print(" The calculated Distance(km) :", cal_dis(20, 0.3))

In [None]:
# Distance each ride would cover at a constant speed of 25 for its ride_time,
# computed with cal_distance_in_meters. Iterating the column's values replaces
# the per-row label lookups and drops the unused recorded-distance local; the
# underlying numpy scalars are passed on so rounding behaves as before.
calculated_distances = [
    cal_distance_in_meters(25, duration)
    for duration in df_c['ride_time'].values
]

In [None]:
# Add a tolerance of 1500 to every calculated distance, rounded to one decimal.
# NOTE(review): not idempotent; running this cell again adds another 1500.
calculated_distances = [round(value + 1500, 1) for value in calculated_distances]

In [None]:
# Store the buffered upper-bound distances next to the recorded ones.
df_c['calculated_distance'] = calculated_distances

In [None]:
# Row labels of rides whose recorded distance exceeds the calculated upper
# bound. A column-wise comparison replaces the row-by-row iterrows loop with
# chained lookups; rows with a missing value on either side are not flagged,
# as before.
unmatched_distances = df_c.index[df_c['distance'] > df_c['calculated_distance']].tolist()

In [None]:
# Inspect the rides whose recorded distance exceeds the calculated upper bound.
df_c.iloc[unmatched_distances]

In [None]:
# Number of rides whose recorded distance exceeds the calculated upper bound.
len(unmatched_distances)

In [None]:
# Remove the implausible rides in one call instead of one drop per row.
# errors='ignore' keeps the old behaviour of skipping labels that are no
# longer present (for example when this cell is run a second time).
df_c = df_c.drop(index=unmatched_distances, errors='ignore')

In [None]:
# Renumber the rows after the removals. df_c already has an 'index' column from
# the earlier reset, so pandas stores the outgoing index in a new 'level_0' column.
# NOTE(review): not idempotent; a second run should fail because 'level_0' already exists.
df_c.reset_index(inplace=True)

In [None]:
# Frequency of each address value, counting missing values as well.
df_c['address'].value_counts(dropna=False)

In [None]:
# Keep the first saved row labels as 'old_index'; drop the helper 'level_0'
# column and the 'address' column.
df_c = (
    df_c
    .rename(columns={"index": "old_index"})
    .drop(columns=['level_0', 'address'])
)

In [None]:
# Write the cleaned rides as a ';'-separated CSV without the row index.
# NOTE(review): 'anlysis' in the file name looks like a typo for 'analysis'; left
# unchanged because other code may read this exact path.
df_c.to_csv('/bigdata/jelicicna/output_datasets/anlysis_data_nextbike_2021-03.csv', sep=';', index=False) 

In [None]:
# Number of rides in the exported frame.
len(df_c)