In [1]:
import sys
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost.core import DMatrix
from xgboost.training import train, cv
from xgboost.plotting import plot_importance


In [103]:
# Load the training columns; row_id is used as the frame's index.
df_train = pd.read_csv(
    'dataset/train.csv',
    usecols=['row_id', 'x', 'y', 'time', 'place_id', 'accuracy'],
    index_col='row_id',
)

In [31]:
# Load the test columns (no place_id here); row_id is used as the index.
df_test = pd.read_csv(
    'dataset/test.csv',
    usecols=['row_id', 'x', 'y', 'time', 'accuracy'],
    index_col='row_id',
)

In [117]:
# Print the (min, max) of the time column for train, then for test.
print(df_train['time'].min(), df_train['time'].max())
print(df_test['time'].min(), df_test['time'].max())

1 786239
786242 1006589


In [124]:
# Preview the first ten rows of the test frame.
df_test.head(10)

Unnamed: 0_level_0,x,y,accuracy,time
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.1675,1.3608,107,930883
1,7.3909,2.5301,35,893017
2,8.0978,2.3473,62,976933
3,0.999,1.0591,62,907285
4,0.667,9.7254,40,914399
5,0.1771,0.0022,161,814077
6,5.5299,4.6581,2,930759
7,8.6021,3.1744,4,862115
8,4.225,6.3435,162,787391
9,0.6489,6.2611,39,793166


In [15]:
# Load the row_id / place_id columns of test0.csv under its own name.
# Binding this to `df_test` would replace the test features loaded earlier, and
# the later cells that read `df_test.time` (histogram, concat with df_train)
# would then fail when the notebook is run top to bottom.
# The path uses a forward slash and the same 'dataset' folder spelling as the
# other loads so it also resolves on non-Windows systems.
df_test0 = pd.read_csv('dataset/test0.csv', usecols=['row_id', 'place_id'])

In [63]:
# Plot the time column against the frame's index as blue dots.
# NOTE(review): df_train2 is not defined in any cell visible here - confirm
# where it comes from before relying on this plot.
plt.plot(df_train2.index, df_train2['time'], 'bo')
plt.show()

In [53]:
# Histogram of the test set's time column.
df_test['time'].hist()
plt.show()

In [None]:
# Show the first entry of the training index (the closing bracket was missing,
# which made this cell a SyntaxError).
df_train.index[:1]

In [125]:
# Number of rows in the first 80% of the training frame, truncated to an int.
train_len = int(len(df_train) * 0.8)
train_len

23294416

In [127]:
# Split df_train_sorted positionally at train_len and keep only the row ids of
# each part, re-sorted by row id.
# NOTE(review): df_train_sorted is not defined in the cells visible here.
train80 = df_train_sorted.iloc[:train_len].sort_index().index
train20 = df_train_sorted.iloc[train_len:].sort_index().index


In [128]:
# Display the row-id index of the first split (pandas abbreviates the repr).
train80

Int64Index([       0,        1,        2,        4,        5,        7,
                   8,        9,       10,       11,
            ...
            29118009, 29118010, 29118011, 29118012, 29118013, 29118014,
            29118015, 29118016, 29118017, 29118020],
           dtype='int64', name='row_id', length=23294416)

In [129]:
# Display the row-id index of the second split (pandas abbreviates the repr).
train20

Int64Index([       3,        6,       16,       20,       23,       27,
                  34,       35,       44,       52,
            ...
            29117963, 29117965, 29117967, 29117975, 29117978, 29118002,
            29118005, 29118008, 29118018, 29118019],
           dtype='int64', name='row_id', length=5823605)

In [119]:
# Materialise the two parts of df_train_sorted as frames: the first train_len
# rows and the remainder, each re-sorted by row id.
df_train_fold = df_train_sorted.iloc[:train_len].sort_index()
df_test_fold = df_train_sorted.iloc[train_len:].sort_index()

In [140]:
# Histogram of the y column in the held-out part.
df_test_fold['y'].hist()
plt.show()

In [99]:
# Smallest time value present in the held-out part.
df_test_fold['time'].min()

2

In [102]:
# Rows of the held-out part whose time equals 2.
df_test_fold.loc[df_test_fold['time'] == 2]

Unnamed: 0_level_0,x,y,accuracy,time,place_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
27743788,3.7725,6.4504,65,2,4115006237


In [141]:
# Stack train on top of test row-wise and renumber the index from 0.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [142]:
# Largest time value across the combined frame.
df_all['time'].max()

1006589

In [148]:
# Express the largest time value in units of 60*24*30, computed from the data
# instead of a literal hand-copied from a previous cell's output.
# NOTE(review): reads as minutes -> 30-day months; the time unit is assumed,
# not established by this notebook - confirm.
df_all.time.max() / 60 / 24 / 30

23.300671296296297

In [150]:
# Bucket each time value into blocks of 60*24*30 units.
# `//` and `*` have equal precedence and associate left to right, so the
# unparenthesised `time//60*24*30` evaluated as (time//60)*24*30 and scaled the
# values up instead of dividing them down into bucket numbers.
# NOTE(review): the bucket reads as a 30-day month if time is in minutes - confirm.
month = df_all.time // (60 * 24 * 30)

In [152]:
# Histogram of the month series using 23 bins.
month.hist(bins=23)
plt.show()

In [153]:
# Largest training time value divided by 60 and then by 24.
# NOTE(review): reads as minutes -> days; the time unit is assumed - confirm.
df_train['time'].max() / 60 / 24

545.99930555555557

In [173]:
# Two-column array pairing each row id (the index) with its place_id.
row_ids = df_train.index.values
np.column_stack([row_ids, df_train['place_id'].values])

array([[         0, 8523065625],
       [         1, 1757726713],
       [         2, 1137537235],
       ..., 
       [  29118018, 2838334300],
       [  29118019, 1007355847],
       [  29118020, 7028698129]], dtype=int64)

In [174]:
# Display the training frame's row_id index (pandas abbreviates the repr).
df_train.index

Int64Index([       0,        1,        2,        3,        4,        5,
                   6,        7,        8,        9,
            ...
            29118011, 29118012, 29118013, 29118014, 29118015, 29118016,
            29118017, 29118018, 29118019, 29118020],
           dtype='int64', name='row_id', length=29118021)

In [177]:
# Seed NumPy's global RNG before building the shuffled 2-fold split of 1000 items.
np.random.seed(2)
folds = KFold(1000, n_folds = 2, shuffle = True)
# Keep only the training indices of the first fold.
# next() replaces the `for train, test in folds: ... break` loop, whose loop
# variable rebound `train` and so overwrote the `train` function imported from
# xgboost.training at the top of the notebook.
# NOTE(review): `filter` shadows the Python builtin; the name is kept because
# cells outside this view may read it.
filter = next(iter(folds))[0]
filter

array([  0,   5,   6,   8,   9,  11,  14,  17,  19,  21,  23,  24,  25,
        26,  27,  28,  31,  33,  34,  35,  36,  39,  43,  44,  45,  46,
        47,  49,  50,  51,  52,  54,  56,  57,  59,  61,  63,  64,  66,
        69,  73,  75,  78,  79,  81,  82,  83,  85,  86,  87,  88,  92,
        93,  94,  95,  96,  97,  98, 100, 102, 103, 104, 105, 106, 110,
       112, 114, 115, 118, 121, 122, 124, 125, 127, 132, 133, 135, 138,
       140, 144, 145, 147, 148, 149, 150, 151, 153, 154, 159, 162, 166,
       167, 168, 170, 173, 184, 186, 187, 189, 190, 191, 195, 196, 201,
       207, 208, 210, 211, 215, 217, 218, 219, 220, 224, 228, 229, 233,
       234, 237, 238, 242, 243, 244, 245, 246, 247, 252, 253, 254, 255,
       256, 259, 260, 261, 262, 263, 264, 266, 269, 270, 271, 274, 276,
       277, 278, 281, 282, 283, 287, 290, 292, 296, 298, 299, 301, 302,
       303, 306, 307, 313, 316, 319, 320, 323, 324, 325, 326, 329, 331,
       332, 336, 337, 339, 341, 348, 349, 350, 351, 354, 357, 35