In [1]:
import pandas as pd
import numpy as np
import ast
import datetime
from math import radians, cos, sin, asin, sqrt

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

In [2]:
usecols = ['CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'TIMESTAMP',
           'dist_perc', 'start_lat', 'start_lon', 'stop_lat', 'stop_lon']

df = pd.read_csv('train_tratado_outliers_2.csv', usecols=usecols, parse_dates=True, nrows=100000)

# cabeçalho minusculo
df.columns = df.columns.map(lambda x: x.lower())
#conversão da data de obj para datetime
df.timestamp = pd.to_datetime(df.timestamp)

In [3]:
def trata_dados(df):
    #--- Timestamp
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    #df['cat_hour'] = pd.cut(df.timestamp.dt.hour, [-1, 8, 16, 23], labels=[0,1,2])
    df['hour'] = df.timestamp.dt.hour
    del df['timestamp']

    #--- Call_type
    df['call_type'] = preprocessing.LabelEncoder().fit_transform(df['call_type'])
    enc = OneHotEncoder().fit_transform(df[['call_type']])
    del df['call_type']
    df['call_type_a'] = enc.toarray()[:,0]
    df['call_type_b'] = enc.toarray()[:,1]
    df['call_type_c'] = enc.toarray()[:,2]

    #--- Fill na
    df.fillna(-1, inplace=True)

    return df

In [4]:
df = trata_dados(df)

~~**Working with polyline (get first, last and penultimate)**~~

In [4]:
#def points(x):
#    try:
#        
#        global cont;
#        
#        line = np.array(ast.literal_eval(x))
#        
#        if line.size>2:
#            return line[0][1], line[0][0], line[-2][1], line[-2][0], line[-1][1], line[-1][0]
#        
#        return 0,0,0,0,0,0
#    except Exception as e:
#        #print(e)
#        None
#   #finally:
#   #    if cont%10000 ==0: print(datetime.datetime.now(), cont)

In [5]:
#cont=0
#df['sta_lat'],df['sta_lon'],df['pen_lat'],df['pen_lon'],df['sto_lat'],df['sto_lon'] = zip(*df['polyline'].apply(points))

In [6]:
#del df['polyline']

## Start ML process

In [5]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    #dlon = lon2 - lon1 
    #dlat = lat2 - lat1 
    a = sin((lat2 - lat1)/2)**2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1)/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [6]:
# Runs some algorithm for comparing

def algs_test(X1, X2, y1, y2):

    algs =[('linear', LinearRegression()),
           ('ridge', Ridge(random_state=99)),
           ('lasso', Lasso(random_state=99)), #lasso 142.7065081365015 561.2795889474457
           ('lsvr', LinearSVR(random_state=99)),
           ('KNN reg', KNeighborsRegressor()), #KNN reg 3.3861972620770264 3.5182287757352873
           ('gboost', GradientBoostingRegressor(random_state=99)),
           ('ada boost', AdaBoostRegressor(random_state=99)),
           ('rnd forest', RandomForestRegressor(random_state=99)),
           ('xgboost', XGBRegressor(random_state=99, n_jobs=-1))
           ]

    for name, alg in algs:
        try:
            alg.fit(X1, y1)
            preds = alg.predict(X2)

            result = y2.copy()
            result['pred_lat'] = preds[:,0]
            result['pred_lon'] = preds[:,1]
            hav_dist = result.apply(lambda x: haversine(x.stop_lon, x.stop_lat, x.pred_lon, x.pred_lat) ,axis=1) 

            print('normal',name, hav_dist.mean(), hav_dist.std())
        except: # se o algoritmo nao possui suporte nativo para multioutput
            mo = MultiOutputRegressor(alg, n_jobs=-1).fit(X1, y1)
            preds = mo.predict(X2)

            result = y2.copy()
            result['pred_lat'] = preds[:,0]
            result['pred_lon'] = preds[:,1]
            hav_dist = result.apply(lambda x: haversine(x.stop_lon, x.stop_lat, x.pred_lon, x.pred_lat) ,axis=1) 

            print('multi',name, hav_dist.mean(), hav_dist.std())

In [19]:
# http://colingorrie.github.io/outlier-detection.html

def outliers_iqr(ys):
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))

def outliers_z_score(ys):
    threshold = 3

    mean_y = np.mean(ys)
    print(mean_y)
    stdev_y = np.std(ys)
    z_scores = [(y - mean_y) / stdev_y for y in ys]
    return np.where(np.abs(z_scores) > threshold)


def outliers_modified_z_score(ys):
    threshold = 3.5

    median_y = np.median(ys)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ys])
    modified_z_scores = [0.6745 * (y - median_y) / median_absolute_deviation_y
                         for y in ys]
    return np.where(np.abs(modified_z_scores) > threshold)

#------------#
def train_test_mimax(df, remove_outliers=''):
    
    y = df[['stop_lat', 'stop_lon']]
    X = df.drop(['stop_lat', 'stop_lon'], axis=1)
    scaler = MinMaxScaler().fit(X)

    #-- split train/test #X1, X2, y1, y2
    X1, X2, y1, y2 = train_test_split(X, y, test_size=0.3, random_state=42)
    
    if(remove_outliers=='iqr'):
        idx = outliers_iqr(X1.dist_perc)[0].tolist() 
        X1 = X1[~X1.index.isin(idx)]
        y1 = y1[~y1.index.isin(idx)]
    
    if(remove_outliers=='zscore'):
        idx = outliers_z_score(X1.dist_perc)[0].tolist() 
        X1 = X1[~X1.index.isin(idx)]
        y1 = y1[~y1.index.isin(idx)]
    
    if(remove_outliers=='zscore_mod'):
        idx = outliers_modified_z_score(X1.dist_perc)[0].tolist() 
        X1 = X1[~X1.index.isin(idx)]
        y1 = y1[~y1.index.isin(idx)]
        
    #-- fit antes para treinar com todos dados e transform depois para pegar dados de 
    #-- treino com/sem outliers mas treinados com todos dados pra escala ficar igual
    X1 = scaler.transform(X1)
    X2 = scaler.transform(X2)
    
    return X1, X2, y1, y2

**First test**

In [8]:
algs_test(*train_test_mimax(df))

normal linear 3.242446434491333 4.572731319644844
normal ridge 5.035208578741895 7.074332186094088
normal lasso 36.32477907103046 303.9345416283136
multi lsvr 3.1863218127999615 4.762337661297881
normal KNN reg 3.475595038193555 6.4593803860315
multi gboost 2.7754355473903676 3.2098926284276637
multi ada boost 8.46685560011833 5.36460407247813
normal rnd forest 2.5492028646315354 3.4014832618236284
multi xgboost 2.791883626724204 3.2605747610577662


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000000839F8D908>>
Traceback (most recent call last):
  File "C:\Users\a46396\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


**Removing outliers by distance using IQR**

In [10]:
algs_test(*train_test_mimax(df, 'iqr'))

normal linear 3.242386329226894 4.575807076958077
normal ridge 5.065930261976436 7.113565077592066
normal lasso 36.46481219866562 303.91621155983614
multi lsvr 3.193235886865255 4.768011044606274
normal KNN reg 3.512267713886269 11.501344991222437
multi gboost 2.7736478427403704 3.209004814636749
multi ada boost 3.431128864111579 3.4512566221495034
normal rnd forest 2.5532163030487838 3.357841860165246
multi xgboost 2.799051014026038 3.254320443859125


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000000839F34470>>
Traceback (most recent call last):
  File "C:\Users\a46396\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


**Removing outliers by distance using Z-Score**

In [18]:
algs_test(*train_test_mimax(df, 'zscore'))

5.524522999999997
normal linear 3.2436848335907 4.574511975109914
normal ridge 5.0397501508541405 7.079543576556709
normal lasso 36.285558826890096 303.93967407595665
multi lsvr 3.1662636567340745 4.768261461226569
normal KNN reg 3.507742216388954 11.501676636252968
multi gboost 2.7787856041016905 3.220716502126241
multi ada boost 8.74874922900235 6.964924596533357
normal rnd forest 2.547612635309929 3.3392721066150717
multi xgboost 2.791280498177931 3.228035830172794


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000000839F49DD8>>
Traceback (most recent call last):
  File "C:\Users\a46396\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


**Removing outliers by distance using Z-Score modified**

In [20]:
algs_test(*train_test_mimax(df, 'zscore_mod'))

normal linear 3.2416689392046907 4.574244919310808
normal ridge 5.06199428500645 7.1079244262875525
normal lasso 36.48563984433071 303.91348752880737
multi lsvr 3.1965189978394926 4.778123022004583
normal KNN reg 3.5118850508358155 11.501247475085533
multi gboost 2.773009388176488 3.204280089755923
multi ada boost 6.996953951045134 5.5171882920350335
normal rnd forest 2.555691849542318 3.3879691115355257
multi xgboost 2.8103219355324827 3.273452329149767


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000000839F8D320>>
Traceback (most recent call last):
  File "C:\Users\a46396\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
