In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, make_scorer
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
#import lightgbm 

import datetime
import time

  from numpy.core.umath_tests import inner1d


In [2]:
random.seed(30)

In [476]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [477]:
train['due'] = pd.to_datetime(train['due'])
test['due'] = pd.to_datetime(test['due'])

## Feature Engineering

In [478]:
train['due'].dt.year.unique() # year does not matter

array([2014])

In [479]:
test['due'].dt.year.unique()

array([2014])

In [480]:
train['month'] = train['due'].dt.month
test['month'] = test['due'].dt.month

train['weekday'] = train['due'].dt.weekday # The day of the week with Monday=0, Sunday=6
test['weekday'] = test['due'].dt.weekday

train['day'] = train['due'].dt.day # The day of the week with Monday=0, Sunday=6
test['day'] = test['due'].dt.day

train['is year end'] = train['due'].dt.is_year_end # Ney Year's Eve
test['is year end'] = test['due'].dt.is_year_end

train['hour'] = train['due'].dt.hour
test['hour'] = test['due'].dt.hour

train['minute'] = train['due'].dt.minute
test['minute'] = test['due'].dt.minute

In [484]:
train['f_class'] = train['f_class'].fillna('NaN')
train['s_class'] = train['s_class'].fillna('NaN')
train['t_class'] = train['t_class'].fillna('NaN')

test['f_class'] = test['f_class'].fillna('NaN')
test['s_class'] = test['s_class'].fillna('NaN')
test['t_class'] = test['t_class'].fillna('NaN')

In [488]:
train_dummies = pd.get_dummies(train, columns=['f_class', 's_class', 't_class'])
test_dummies = pd.get_dummies(test, columns=['f_class', 's_class', 't_class'])

In [489]:
train.head()

Unnamed: 0,cancel_time,driver_found,due,f_class,lat,lon,s_class,t_class,burned,month,weekday,day,is year end,hour,minute
0,55,False,2014-01-01 00:09:32,econom,55.75013,37.823242,,,True,1,2,1,False,0,9
1,-1,True,2014-01-01 00:09:32,econom,55.75013,37.823242,,,False,1,2,1,False,0,9
2,-1,True,2014-01-01 00:10:00,econom,55.651582,37.340891,,,False,1,2,1,False,0,10
3,-1,True,2014-01-01 00:10:00,econom,55.633404,37.797595,,,False,1,2,1,False,0,10
4,-1,True,2014-01-01 00:10:00,econom,55.77033,37.519917,,,False,1,2,1,False,0,10


In [490]:
train = train.drop(['cancel_time', 'driver_found'], axis=1) # no such features in test

In [491]:
train_dummies = train_dummies.drop(['cancel_time', 'driver_found'], axis=1)

In [542]:
all_data = pd.concat([train_dummies, test_dummies])

all_data.tail()

Unnamed: 0,burned,day,due,f_class_NaN,f_class_business,f_class_econom,f_class_vip,hour,is year end,lat,...,month,s_class_NaN,s_class_business,s_class_econom,s_class_vip,t_class_NaN,t_class_business,t_class_econom,t_class_vip,weekday
1793285,,31,2014-03-31 23:55:00.000,0,0,1,0,23,False,55.736944,...,3,1,0,0,0,1,0,0,0,0
1793290,,31,2014-03-31 23:55:00.000,0,0,1,0,23,False,55.74147,...,3,1,0,0,0,1,0,0,0,0
1793299,,31,2014-03-31 23:55:00.000,0,0,1,0,23,False,55.682874,...,3,1,0,0,0,1,0,0,0,0
1793294,,31,2014-03-31 23:55:00.201,0,0,1,0,23,False,55.60369,...,3,0,1,0,0,1,0,0,0,0
1793288,,31,2014-03-31 23:55:00.464,0,0,1,0,23,False,55.620941,...,3,0,1,0,0,1,0,0,0,0


In [544]:
all_data['time_suspicious'] = (all_data['due'] == all_data['due'].shift(5)) # подсказка Артема
all_data['group_suspicious'] = (all_data['time_suspicious'] == all_data['time_suspicious'].shift(1)) # подск. Артема
all_data = all_data[6:]

In [547]:
all_data['duplicated'] = all_data.duplicated(['due', 'lon', 'lat']) # подсказка Артема

## Learning

In [548]:
train_delta = all_data[(all_data.burned.isna() == False)]
test_delta = all_data[all_data.burned.isna() == True].drop('burned' ,axis=1)

In [551]:
len(train_delta) + len(test_delta) == len(all_data)

True

In [553]:
#### SOLUTION

features_delta = train_delta.columns.drop(['due', 'burned', 'is year end']).tolist()
df_delta = np.array(train_delta[features_delta])

train_size = 0.7
train_X = np.array(train_delta[:int(len(train_delta)*train_size)][features_delta])
train_y = train_delta[:int(len(train_delta)*train_size)].burned.values

test_X = np.array(train_delta[int(len(train_delta)*train_size):][features_delta])
test_y = train_delta[int(len(train_delta)*train_size):].burned.values

# to deal with imbalanced sample
negative_to_positive_ratio = len(train_y[train_y == False]) / len(train_y[train_y == True])

XGB_delta = xgb.XGBClassifier(n_estimators=200, max_depth=5,
                             scale_pos_weight=negative_to_positive_ratio, 
                             n_jobs=-1, random_state=26)

XBG_delta_pred = XGB_delta.fit(train_X, train_y).predict_proba(test_X)[:, 1]

roc_auc_score(test_y.astype(int), XBG_delta_pred) 

0.6409867069143632

In [564]:
# did not help
train_delta.day = np.sin(np.pi*train_delta.day/15)
train_delta.month = np.sin(np.pi*train_delta.month/6)
train_delta.weekday = np.sin(np.pi*train_delta.weekday/3.5)
train_delta.hour = np.sin(np.pi*train_delta.hour/12)
train_delta.minute = np.sin(np.pi*train_delta.minute/30)

test_delta.day = np.sin(np.pi*test_delta.day/15)
test_delta.month = np.sin(np.pi*test_delta.month/6)
test_delta.weekday = np.sin(np.pi*test_delta.weekday/3.5)
test_delta.hour = np.sin(np.pi*test_delta.hour/12)
test_delta.minute = np.sin(np.pi*test_delta.minute/30)

features_delta = train_delta.columns.drop(['due', 'burned', 'is year end']).tolist()
df_delta = np.array(train_delta[features_delta])

train_size = 0.7
train_X = np.array(train_delta[:int(len(train_delta)*train_size)][features_delta])
train_y = train_delta[:int(len(train_delta)*train_size)].burned.values

test_X = np.array(train_delta[int(len(train_delta)*train_size):][features_delta])
test_y = train_delta[int(len(train_delta)*train_size):].burned.values

negative_to_positive_ratio = len(train_y[train_y == False]) / len(train_y[train_y == True])

XGB_delta = xgb.XGBClassifier(n_estimators=400, max_depth=5,
                             scale_pos_weight=negative_to_positive_ratio, 
                             n_jobs=-1, random_state=26)

XBG_delta_pred = XGB_delta.fit(train_X, train_y).predict_proba(test_X)[:, 1]

roc_auc_score(test_y.astype(int), XBG_delta_pred) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


0.6423115526903135

In [567]:
dict(zip(features_delta, XGB_delta.feature_importances_))

{'day': 0.082671605,
 'duplicated': 0.014999118,
 'f_class_NaN': 0.008999471,
 'f_class_business': 0.0074113286,
 'f_class_econom': 0.006705488,
 'f_class_vip': 0.005117346,
 'group_suspicious': 0.004235045,
 'hour': 0.08134816,
 'lat': 0.29327688,
 'lon': 0.29230633,
 'minute': 0.062025763,
 'month': 0.026821952,
 's_class_NaN': 0.012793365,
 's_class_business': 0.0069701783,
 's_class_econom': 0.0030880535,
 's_class_vip': 0.005293806,
 't_class_NaN': 0.005823187,
 't_class_business': 0.0013234515,
 't_class_econom': 0.0022057525,
 't_class_vip': 0.0014116816,
 'time_suspicious': 0.014910888,
 'weekday': 0.06026116}

## Predictions to submit

In [568]:
df_xgb = np.array(test_delta[features_delta])

In [569]:
probas = XGB_delta.predict_proba(df_xgb)[:, 1]
df = pd.DataFrame(probas, columns=['Prob'])
df.to_csv('submit.csv', index_label='Id')