In [1]:
import pandas as pd
import numpy as np
import ast
import datetime
from math import radians, cos, sin, asin, sqrt

In [16]:
# Columns to load from the raw Porto taxi CSV.
usecols = [
    'TRIP_ID',
    'CALL_TYPE',
    'ORIGIN_CALL',
    'ORIGIN_STAND',
    'TAXI_ID',
    'TIMESTAMP',
    'DAY_TYPE',
    'MISSING_DATA',
    'POLYLINE',
]

df = pd.read_csv('Porto_taxi_data_test_partial_trajectories.csv', usecols=usecols)

# Dropping and converting some columns.
# TRIP_ID: just an id; DAY_TYPE: computed incorrectly in the dataset.
# (The drop is currently disabled, so both columns are kept.)
#df.drop(['TRIP_ID','DAY_TYPE' ], axis=1, inplace=True)

# TIMESTAMP holds Unix epoch seconds; convert it to pandas datetimes.
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], unit='s')

In [17]:
# Preview the first two rows to sanity-check the load and the TIMESTAMP conversion.
df.head(n=2)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,T1,B,,15.0,20000542,2014-08-14 17:57:17,A,False,"[[-8.585676, 41.148522], [-8.585712000000001, ..."
1,T2,B,,57.0,20000108,2014-08-14 17:50:11,A,False,"[[-8.610876000000001, 41.14557], [-8.610858, 4..."


In [18]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance, in kilometers, between two points on Earth.

    Both points are given as (longitude, latitude) in decimal degrees.
    Uses the haversine formula with a spherical Earth of radius 6371 km.
    """
    earth_radius_km = 6371  # use 3956 instead for a result in miles

    # The trigonometric functions below work in radians.
    lon1, lat1, lon2, lat2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    half_dlat = (lat2 - lat1)/2
    half_dlon = (lon2 - lon1)/2

    # Haversine of the central angle between the two points.
    a = sin(half_dlat)**2 + cos(lat1) * cos(lat2) * sin(half_dlon)**2

    # Central angle (radians) scaled by the radius gives the arc length.
    return 2 * asin(sqrt(a)) * earth_radius_km

In [19]:
def outliers_iqr(line):
    """
    Return the row indices of `line` that are IQR outliers in either column.

    Parameters
    ----------
    line : numpy.ndarray
        2-D array whose column 0 and column 1 are tested independently
        (the caller in this notebook passes [longitude, latitude] points).

    Returns
    -------
    numpy.ndarray
        Sorted, unique integer indices of the rows whose column-0 or column-1
        value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for that column.
        An empty input yields an empty index array (never None), so the
        result can always be passed to `list(...)` or `np.delete(...)`.
    """
    if line.size == 0:
        # No points: nothing can be an outlier. Return an explicit empty
        # index array instead of falling through to an implicit None.
        return np.array([], dtype=np.intp)

    def _outside_fences(values):
        # Boolean mask of the values beyond the 1.5 * IQR fences.
        q1, q3 = np.percentile(values, [25, 75])
        margin = (q3 - q1) * 1.5
        return (values > q3 + margin) | (values < q1 - margin)

    # A row is an outlier if either of its coordinates is one.
    flagged = _outside_fences(line[:, 0]) | _outside_fences(line[:, 1])
    return np.where(flagged)[0]

In [20]:
def converte_dots_b(x):
    """
    Compute the travelled distance, in kilometers, of one POLYLINE value.

    Parameters
    ----------
    x : str
        String representation of a list of [longitude, latitude] points,
        e.g. "[[-8.58, 41.14], [-8.59, 41.15]]".

    Returns
    -------
    float
        Sum of the haversine distances between consecutive points, after
        removing the points flagged by `outliers_iqr`, rounded to 2 decimals.
        Values that cannot be parsed or processed contribute whatever
        distance was accumulated before the failure (0.0 if none).

    Side effects
    ------------
    Increments the module-level counter `cont` (which must be initialised
    before the first call) and prints a timestamped progress line every
    10000 calls.
    """
    global cont

    cont += 1
    dist = 0.0
    try:
        line = np.array(ast.literal_eval(x))

        if len(line) > 0:
            line = np.delete(line, list(outliers_iqr(line)), 0)

        # Walk consecutive point pairs explicitly, so the loop ends by itself
        # instead of relying on an unpacking error at the last point.
        for (lon1, lat1), (lon2, lat2) in zip(line[:-1], line[1:]):
            dist += haversine(lon1, lat1, lon2, lat2)
    except Exception:
        # Best effort: a malformed row keeps the distance gathered so far.
        # Only Exception is caught (and there is no `return` in a `finally`),
        # so KeyboardInterrupt still stops a long-running `apply`.
        pass

    if cont % 10000 == 0:
        print(datetime.datetime.now(), cont)
    return round(dist, 2)

In [21]:
%%time
cont=0
df['dist_perc'] = df.POLYLINE.apply(converte_dots_b)

Wall time: 901 ms


In [22]:
%%time
df['start'] = df.POLYLINE.apply(lambda x: ast.literal_eval(x)[0] if len(x)>2 else None)

Wall time: 462 ms


In [23]:
%%time
df['stop'] = df.POLYLINE.apply(lambda x: ast.literal_eval(x)[-1] if len(x)>2 else None)

Wall time: 439 ms


In [24]:
# Hour of day (0-23) taken from the TIMESTAMP datetime column.
df['hour'] = df['TIMESTAMP'].dt.hour

In [25]:
# Parse every POLYLINE string once and reuse the result for all four columns,
# instead of running ast.literal_eval on the whole column four times.
# Strings of length <= 2 (e.g. '[]') hold no point and map to None.
_polyline_points = df['POLYLINE'].apply(
    lambda x: ast.literal_eval(x) if len(x) > 2 else None
)


def _pick_coord(points, point_idx, coord_idx):
    """Return points[point_idx][coord_idx], or None when the trip has no points."""
    if isinstance(points, (list, tuple)):
        return points[point_idx][coord_idx]
    return None


# Points are stored as [longitude, latitude]: coordinate 0 is the longitude and
# the last coordinate is the latitude. Point 0 is the start, point -1 the stop.
df['start_lat'] = _polyline_points.apply(_pick_coord, args=(0, -1))
df['start_lon'] = _polyline_points.apply(_pick_coord, args=(0, 0))
df['stop_lat'] = _polyline_points.apply(_pick_coord, args=(-1, -1))
df['stop_lon'] = _polyline_points.apply(_pick_coord, args=(-1, 0))

In [28]:
# Persist the enriched frame without the index, writing 700 rows at a time.
df.to_csv(path_or_buf='test_trated.csv', chunksize=700, index=False)