In [None]:
import pandas as pd
from itertools import islice
import seaborn as sns
import matplotlib

In [None]:
# Load the distances table from a semicolon-separated CSV next to the notebook
# (file name suggests nextbike data for 2020-01; compression is inferred by pandas
# from the .gz extension). low_memory=False reads the file in one pass so column
# dtypes are inferred from the whole column rather than per chunk.
url = "./distances_table_nextbike_2020-01.csv.gz"
df = pd.read_csv(url, sep=';', low_memory=False)

In [None]:
# Global plotting defaults for every figure in the notebook:
# a 16 x 12 figure size and the built-in 'ggplot' style sheet.
matplotlib.rcParams['figure.figsize'] = (16.0, 12.0)
matplotlib.style.use('ggplot')

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
# Use the exact option name: pandas interprets the key as a regular expression,
# so a key with stray characters such as '()' only works by accident.
pd.set_option('display.max_columns', None)

In [None]:
def drop_duplicates(l):
    '''Return a new list without repeated items, keeping first occurrences in order.

    Items must be hashable. The input iterable itself is not modified.
    '''
    # dict keys are unique and remember insertion order, so the first
    # occurrence of every item survives in its original position.
    return list(dict.fromkeys(l))

In [None]:
def short_list(time):
    '''Return the values of ``sr[time]`` that are strictly smaller than 2500.

    NOTE(review): ``sr`` is a global that is not defined anywhere in the visible
    notebook, so this function raises NameError on a fresh kernel unless ``sr``
    is created first - confirm where it is supposed to come from.
    '''
    kept = []
    for value in sr[time]:
        if value < 2500:
            kept.append(value)
    return kept

In [None]:
# Remove the columns that the rest of this notebook never reads.
# Note: `df` is reassigned, so running this cell a second time raises KeyError
# because the columns are already gone.
df = df.drop(columns=[
    'bike', 'bike_racks', 'bikes', 'booked_bikes', 'free_racks',
    'free_special_racks', 'maintenance', 'number', 'place_type',
    'rack_locks', 'special_racks', 'spot', 'terminal_type', 'city',
    'country_code', 'company', 'timezone',
])

In [None]:
# Row labels of rides whose recorded duration is negative (sanity check).
negative_values = df.index[df['ride_time'] < 0].tolist()
negative_values

In [None]:
# Uncomment the line below only if `negative_values` is non-empty and those rows should be dropped.
#df.drop(negative_values, inplace=True)

In [None]:
# Row labels of rides with an implausible duration: ride_time below 2 or at least 702
# (unit is that of the 'ride_time' column - presumably minutes, TODO confirm).
# A boolean mask selects the same labels as a row-by-row iterrows() loop,
# without building a Series object for every row.
out_rides = df.index[(df['ride_time'] < 2) | (df['ride_time'] >= 702)].tolist()

In [None]:
# Row labels of rides with an implausible distance over the WHOLE frame:
# distance below 200 or at least 15200 (unit of the 'distance' column -
# presumably metres, TODO confirm).
# A boolean mask selects the same labels as a row-by-row iterrows() loop,
# without building a Series object for every row.
out_distances_total = df.index[(df['distance'] < 200) | (df['distance'] >= 15200)].tolist()

In [None]:
# Total number of rows in df, as a baseline for the outlier counts.
len(df)

In [None]:
# Number of rows flagged by the distance filter.
len(out_distances_total)

In [None]:
# Number of rows flagged by the ride-time filter.
len(out_rides)

In [None]:
# Number of distinct rows flagged by at least one of the two filters
# (despite its name, `out_set` holds the count, not the set itself).
out_set = len(set(out_rides).union(out_distances_total))
out_set

In [None]:
# Function to calculate the intersection of two lists
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order and the repetitions of ``lst1`` are preserved, so the result is
    a filtered copy of ``lst1`` rather than a mathematical set.

    Parameters
    ----------
    lst1 : list
        Items to filter.
    lst2 : list
        Items to test membership against.

    Returns
    -------
    list
        Every ``value`` of ``lst1`` for which ``value in lst2`` holds.
    """
    try:
        # A set gives constant-time membership tests instead of scanning
        # lst2 once for every item of lst1.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable items (e.g. nested lists) cannot go into a set;
        # fall back to the linear scan so such inputs keep working.
        return [value for value in lst1 if value in lst2]

In [None]:
# Rows flagged by BOTH the ride-time filter and the distance filter, and how many there are.
intersect = intersection(out_rides, out_distances_total)
len(intersect)

In [None]:
# First filtering step: remove the rides flagged by the ride-time filter.
df_limit = df.drop(index=out_rides)

In [None]:
# Frequency of each distinct distance value after the ride-time filter.
df_limit['distance'].value_counts()

In [None]:
# Row labels of the rides that survived the ride-time filter but have an
# implausible distance (below 200 or at least 15200).
# A boolean mask selects the same labels as a row-by-row iterrows() loop,
# without building a Series object for every row.
out_distances = df_limit.index[
    (df_limit['distance'] < 200) | (df_limit['distance'] >= 15200)
].tolist()

In [None]:
# Share of the original rows (in percent) removed by the two filters together.
# Both lists are only counted, so their lengths are added directly.
quotient = (len(out_distances) + len(out_rides)) / len(df)
percent = quotient * 100
print(percent)

In [None]:
# Second filtering step: remove the distance outliers, then renumber the rows
# 0..n-1. reset_index() keeps the previous row labels in a new 'index' column.
df_c = df_limit.drop(index=out_distances).reset_index()

In [None]:
# Column dtypes and non-null counts of the filtered frame.
df_c.info()

In [None]:
# Frequency of each distinct distance value in the filtered frame.
df_c['distance'].value_counts()

In [None]:
# Frequency of each distinct ride_time value in the filtered frame.
df_c['ride_time'].value_counts()

In [None]:
# Row labels of the remaining rides with a distance below 300.
# A boolean mask selects the same labels as a row-by-row iterrows() loop,
# without building a Series object for every row.
ind = df_c.index[df_c['distance'] < 300].tolist()

In [None]:
# Inspect the rides with a distance below 300. `ind` holds index labels; using
# them positionally with iloc relies on df_c having a 0..n-1 index here.
df_c.iloc[ind]

In [None]:
# Distribution of ride durations in the filtered frame (140 bins + density curve).
sns.set_style('white')
sns.set_context("paper", font_scale=1.5)
if hasattr(sns, 'histplot'):
    # distplot is deprecated since seaborn 0.11; histplot with a KDE overlay and
    # density normalisation is the documented replacement.
    sns.histplot(df_c['ride_time'], bins=140, kde=True, stat='density', kde_kws={'cut': 3})
else:
    # Older seaborn without histplot: keep the legacy call.
    sns.distplot(df_c['ride_time'], bins=140)

In [None]:
# Distribution of ride distances in the filtered frame (140 bins + density curve).
sns.set_style('white')
sns.set_context("paper", font_scale=1.5)
if hasattr(sns, 'histplot'):
    # distplot is deprecated since seaborn 0.11; histplot with a KDE overlay and
    # density normalisation is the documented replacement.
    sns.histplot(df_c['distance'], bins=140, kde=True, stat='density', kde_kws={'cut': 3})
else:
    # Older seaborn without histplot: keep the legacy call.
    sns.distplot(df_c['distance'], bins=140)

In [None]:
# Row labels of every row that has a missing value in at least one column.
# isnull().any(axis=1) evaluates the whole frame at once instead of building
# and testing one Series per row through iterrows().
rows_with_nan = df_c.index[df_c.isnull().any(axis=1)].tolist()

In [None]:
# Copy of the filtered frame without the rows that contain missing values.
df_adr = df_c.drop(index=rows_with_nan)

In [None]:
# Display the frame without missing values.
df_adr

In [None]:
# Flag pairs of consecutive rows that have exactly the same distance.
# Both labels of every matching pair are recorded, so a row in the middle of a
# run of equal distances appears twice in the list.
# Only the 'distance' column is walked (no per-row Series from iterrows()),
# and an empty or single-row frame simply yields an empty list.
distance_by_index = list(df_c['distance'].items())
suspicious_return_trips = []
for (previous_index, previous_distance), (index, distance) in zip(
        distance_by_index, distance_by_index[1:]):
    if distance == previous_distance:
        suspicious_return_trips.extend((previous_index, index))

In [None]:
# Inspect the flagged rows. The list holds index labels; using them positionally
# with iloc relies on df_c having a 0..n-1 index here.
df_c.iloc[suspicious_return_trips]

In [None]:
def cal_distance_in_meters(speed, time, hour_decimals=1):
    """Return the distance covered at a constant speed, scaled by 1000.

    ``time`` is divided by 60 and the product with ``speed`` is multiplied by
    1000, i.e. the result is in metres when ``speed`` is in km/h and ``time``
    is in minutes (assumed from the function name - confirm against callers).

    Parameters
    ----------
    speed : number
        Constant speed.
    time : number
        Duration; converted with ``time / 60`` before use.
    hour_decimals : int or None, default 1
        Number of decimals the converted duration is rounded to BEFORE it is
        multiplied by the speed. The default reproduces the historical
        behaviour, which quantises the duration to steps of 0.1 (so e.g.
        ``time=2`` becomes 0.0 and yields a distance of 0). Pass ``None`` to
        skip the rounding and get the exact product.

    Returns
    -------
    number
        ``speed * converted_time * 1000``.
    """
    hours = time / 60
    if hour_decimals is not None:
        # Explicit None check: round(x, None) would return an int, not "no rounding".
        hours = round(hours, hour_decimals)
    return (speed * hours) * 1000

In [None]:
def cal_dis(speed, time):
    """Print both inputs and return their product ``speed * time``.

    The printed labels describe ``time`` as hours and ``speed`` as km / hr,
    so the returned distance is in kilometres under that reading.
    """
    print(" Time(hr) :", time)
    print(" Speed(km / hr) :", speed)
    distance = speed * time
    return distance

# Manual check of cal_dis with speed 20 and time 0.3.
print(" The calculated Distance(km) :", cal_dis(20, 0.3))

In [None]:
# Distance each ride would cover at a constant speed of 25 for its recorded
# ride_time, in row order of df_c. The recorded distance is not needed here,
# so only the 'ride_time' column is walked.
calculated_distances = [
    cal_distance_in_meters(25, duration) for duration in df_c['ride_time']
]

In [None]:
# Add a fixed allowance of 1500 to every calculated distance and round to one
# decimal. The list is updated in place, so re-running this cell adds the
# allowance again.
calculated_distances[:] = [round(value + 1500, 1) for value in calculated_distances]

In [None]:
# Store the calculated upper bound next to the recorded distance (adds a column to df_c).
df_c['calculated_distance'] = calculated_distances

In [None]:
# Row labels of rides whose recorded distance exceeds the calculated upper bound.
# A column-wise comparison selects the same labels as a row-by-row iterrows()
# loop with chained indexing, without per-row lookups.
unmatched_distances = df_c.index[df_c['distance'] > df_c['calculated_distance']].tolist()

In [None]:
# Inspect the rides that exceed the calculated bound. The list holds index labels;
# using them positionally with iloc relies on df_c having a 0..n-1 index here.
df_c.iloc[unmatched_distances]

In [None]:
# Number of rides that exceed the calculated bound.
len(unmatched_distances)

In [None]:
# Remove the rides that exceed the calculated bound in a single in-place drop.
# errors='ignore' skips labels that are no longer present, so re-running the
# cell after the rows are gone is harmless.
df_c.drop(index=unmatched_distances, inplace=True, errors='ignore')

In [None]:
# Renumber the rows after the drop. df_c already has an 'index' column from its
# first reset, so pandas stores the pre-reset row labels in a new 'level_0' column.
df_c.reset_index(inplace=True)

In [None]:
# Frequency of each address value, with missing values counted as their own entry.
df_c['address'].value_counts(dropna=False)

In [None]:
# Final column clean-up: keep the earliest row labels as 'old_index' and remove
# the helper column 'level_0' together with 'address'.
df_c = (
    df_c
    .rename(columns={"index": "old_index"})
    .drop(columns=['level_0', 'address'])
)

In [None]:
#df_c.to_csv('./limited_data_nextbike_2020-01.csv', sep=';', index=False) 