In [1]:
import pandas as pd
import numpy as np
import ast
import datetime
from math import radians, cos, sin, asin, sqrt

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [2]:
usecols = ['CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'TIMESTAMP',
           'dist_perc', 'start_lat', 'start_lon', 'stop_lat', 'stop_lon']

df = pd.read_csv('train_tratado_outliers_2.csv', usecols=usecols, parse_dates=True, nrows=300000)

# cabeçalho minusculo
df.columns = df.columns.map(lambda x: x.lower())
#conversão da data de obj para datetime
df.timestamp = pd.to_datetime(df.timestamp)

In [3]:
def trata_dados(df):
    #--- Timestamp
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    #df['cat_hour'] = pd.cut(df.timestamp.dt.hour, [-1, 8, 16, 23], labels=[0,1,2])
    df['hour'] = df.timestamp.dt.hour
    del df['timestamp']

    #--- Call_type
    df['call_type'] = preprocessing.LabelEncoder().fit_transform(df['call_type'])
    enc = OneHotEncoder().fit_transform(df[['call_type']])
    del df['call_type']
    df['call_type_a'] = enc.toarray()[:,0]
    df['call_type_b'] = enc.toarray()[:,1]
    df['call_type_c'] = enc.toarray()[:,2]

    #--- Fill na
    df.fillna(-1, inplace=True)

    return df

In [4]:
df = trata_dados(df)

~~**Working with polyline (get first, last and penultimate)**~~

In [4]:
#def points(x):
#    try:
#        
#        global cont;
#        
#        line = np.array(ast.literal_eval(x))
#        
#        if line.size>2:
#            return line[0][1], line[0][0], line[-2][1], line[-2][0], line[-1][1], line[-1][0]
#        
#        return 0,0,0,0,0,0
#    except Exception as e:
#        #print(e)
#        None
#   #finally:
#   #    if cont%10000 ==0: print(datetime.datetime.now(), cont)

In [5]:
#cont=0
#df['sta_lat'],df['sta_lon'],df['pen_lat'],df['pen_lon'],df['sto_lat'],df['sto_lon'] = zip(*df['polyline'].apply(points))

In [6]:
#del df['polyline']

## Start ML process

In [6]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    #dlon = lon2 - lon1 
    #dlat = lat2 - lat1 
    a = sin((lat2 - lat1)/2)**2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1)/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [7]:
# Runs some algorithm for comparing

def algs_test(X1, X2, y1, y2):

    algs =[('linear', LinearRegression()),
           ('ridge', Ridge(random_state=99)),
           ('lasso', Lasso(random_state=99)), #lasso 142.7065081365015 561.2795889474457
           ('lsvr', LinearSVR(random_state=99)),
           ('KNN reg', KNeighborsRegressor()), #KNN reg 3.3861972620770264 3.5182287757352873
           ('gboost', GradientBoostingRegressor(random_state=99)),
           ('ada boost', AdaBoostRegressor(random_state=99)),
           ('rnd forest', RandomForestRegressor(random_state=99)),
           ('xgboost', XGBRegressor(random_state=99, n_jobs=-1))
           ]

    for name, alg in algs:
        try:
            alg.fit(X1, y1)
            preds = alg.predict(X2)

            result = y2.copy()
            result['pred_lat'] = preds[:,0]
            result['pred_lon'] = preds[:,1]
            hav_dist = result.apply(lambda x: haversine(x.stop_lon, x.stop_lat, x.pred_lon, x.pred_lat) ,axis=1) 

            print('normal',name, hav_dist.mean(), hav_dist.std())
        except: # se o algoritmo nao possui suporte nativo para multioutput
            mo = MultiOutputRegressor(alg, n_jobs=-1).fit(X1, y1)
            preds = mo.predict(X2)

            result = y2.copy()
            result['pred_lat'] = preds[:,0]
            result['pred_lon'] = preds[:,1]
            hav_dist = result.apply(lambda x: haversine(x.stop_lon, x.stop_lat, x.pred_lon, x.pred_lat) ,axis=1) 

            print('multi',name, hav_dist.mean(), hav_dist.std())

In [8]:
# http://colingorrie.github.io/outlier-detection.html

def outliers_iqr(ys):
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))

def outliers_z_score(ys):
    threshold = 3

    mean_y = np.mean(ys)
    print(mean_y)
    stdev_y = np.std(ys)
    z_scores = [(y - mean_y) / stdev_y for y in ys]
    return np.where(np.abs(z_scores) > threshold)


def outliers_modified_z_score(ys):
    threshold = 3.5

    median_y = np.median(ys)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ys])
    modified_z_scores = [0.6745 * (y - median_y) / median_absolute_deviation_y
                         for y in ys]
    return np.where(np.abs(modified_z_scores) > threshold)

#------------#
def train_test_mimax(df, remove_outliers=''):
    
    y = df[['stop_lat', 'stop_lon']]
    X = df.drop(['stop_lat', 'stop_lon'], axis=1)
    scaler = MinMaxScaler().fit(X)

    #-- split train/test #X1, X2, y1, y2
    X1, X2, y1, y2 = train_test_split(X, y, test_size=0.3, random_state=42)
    
    if(remove_outliers=='iqr'):
        idx = outliers_iqr(X1.dist_perc)[0].tolist() 
        X1 = X1[~X1.index.isin(idx)]
        y1 = y1[~y1.index.isin(idx)]
    
    if(remove_outliers=='zscore'):
        idx = outliers_z_score(X1.dist_perc)[0].tolist() 
        X1 = X1[~X1.index.isin(idx)]
        y1 = y1[~y1.index.isin(idx)]
    
    if(remove_outliers=='zscore_mod'):
        idx = outliers_modified_z_score(X1.dist_perc)[0].tolist() 
        X1 = X1[~X1.index.isin(idx)]
        y1 = y1[~y1.index.isin(idx)]
        
    #-- fit antes para treinar com todos dados e transform depois para pegar dados de 
    #-- treino com/sem outliers mas treinados com todos dados pra escala ficar igual
    X1 = scaler.transform(X1)
    X2 = scaler.transform(X2)
    
    return X1, X2, y1, y2

**First test**

In [8]:
algs_test(*train_test_mimax(df))

normal linear 3.2638564456020216 4.60906383905595
normal ridge 4.388145964021234 6.801949589262064
normal lasso 33.33326825121277 274.1536739161117
multi lsvr 3.172587565614162 4.999598708420609
normal KNN reg 3.4925721814934696 4.876261908980907
multi gboost 2.7986075444261864 8.309698856389755
multi ada boost 3.3230221978902423 16.09500488109424
normal rnd forest 2.5058967783997264 4.675219338605338
multi xgboost 2.797081070316527 4.386391102434249


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000000463CF8AC8>>
Traceback (most recent call last):
  File "C:\Users\a46396\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


**Removing outliers by distance using IQR**

In [None]:
algs_test(*train_test_mimax(df, 'iqr'))

**Removing outliers by distance using Z-Score**

In [None]:
algs_test(*train_test_mimax(df, 'zscore'))

**Removing outliers by distance using Z-Score modified**

In [None]:
algs_test(*train_test_mimax(df, 'zscore_mod'))

Remover os outliers não fez grandes mudanças no modelo. Os melhores foram os ensemble. Vamos usar o Gradient boost e Random Forest como vencedores.


In [87]:
def haversine2(y_real, y_pred): # def haversine2(lon1, lat1, lon2, lat2):
    print( y_real, y_pred)
    
    
    print('y_real:')
    print(type(y_real))

    print('-----')
    print('y_pred:')
    print(type(y_pred))

    y_real['pred_lat'] = y_pred[:,0]
    y_real['pred_lon'] = y_pred[:,1]
    
    return y_real.apply(lambda x: haversine(x.stop_lon, x.stop_lat, x.pred_lon, x.pred_lat) ,axis=1).mean()

In [88]:
scoring = make_scorer(haversine2, greater_is_better=False)

In [83]:
scoring

make_scorer(haversine2, greater_is_better=False)

In [43]:


param_grid = {'max_depth':[10, 40],
              'min_sample_leaf':[50, 150],
              'max_features':[None, 8]}

gs = GridSearchCV( \
        RandomForestRegressor(n_estimators=100, random_state=99, n_jobs=-1),  \
            param_grid=param_grid,  \
            n_jobs=-1,  \
            cv=10, \
            scoring=scoring, \
            verbose=3)

In [12]:
y = df[['stop_lat', 'stop_lon']]
X = df.drop(['stop_lat', 'stop_lon'], axis=1)
X = MinMaxScaler().fit_transform(X)

In [None]:
gs.fit(X,y)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


#####################

In [14]:
X1, X2, y1, y2 = train_test_split(X, y, test_size=0.3, random_state=42)

reg = RandomForestRegressor(n_jobs=-1, random_state=99, verbose=2)
reg.fit(X1, y1)

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   20.0s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=99, verbose=2, warm_start=False)

In [15]:
preds = reg.predict(X2)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.8s finished


In [24]:
preds[:,0]

array([41.1828552 , 41.164437  , 41.1575046 , ..., 41.1635664 ,
       41.21931485, 41.16710235])

In [89]:
scoring(reg, X1, y1)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    1.0s finished


         stop_lat  stop_lon
186295  41.147118 -8.607591
127847  41.140926 -8.615259
274740  41.180877 -8.654238
74908   41.169924 -8.661240
11630   41.159187 -8.682390
196075  41.174523 -8.645301
73229   41.148054 -8.599050
254953  41.129937 -8.594577
84602   41.174532 -8.567487
210884  41.148990 -8.610651
278828  41.162238 -8.613450
148628  41.147271 -8.620371
224342  41.195610 -8.703765
194353  41.144157 -8.605746
203786  41.154696 -8.649693
183881  41.190363 -8.577747
228651  41.161725 -8.651151
281371  41.174964 -8.688888
12280   41.164515 -8.600553
221003  41.147532 -8.622279
29631   41.148144 -8.614989
203181  41.176341 -8.585649
292660  41.237577 -8.670303
231986  41.117328 -8.645409
17499   41.175414 -8.613882
131146  41.137317 -8.615493
87444   41.180913 -8.582661
206704  41.163165 -8.623161
229838  41.180976 -8.624979
7155    41.147100 -8.606871
...           ...       ...
123855  41.160078 -8.640954
2747    41.174064 -8.545923
130523  41.145732 -8.607348
149503  41.151474 -8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


-0.9639253228300669

In [84]:
y1.values

array([[41.147118, -8.607591],
       [41.140926, -8.615259],
       [41.180877, -8.654238],
       ...,
       [41.159034, -8.608329],
       [41.152221, -8.646804],
       [41.16132 , -8.571069]])