Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import os, time
# Wall-clock reference taken at the top of the notebook (not read again in the cells shown here).
start = time.time()

In [2]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Let pandas render up to 500 columns and 500 rows before truncating displayed output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    """Return the area under the precision-recall curve of ``pred`` against ``gt``.

    Args:
        gt: Ground-truth labels, forwarded to ``precision_recall_curve``.
        pred: Predicted scores (NumPy array or pandas Series); may contain NaN.
        nafill: When true, NaN scores are replaced by the mean of the
            non-NaN scores before the curve is computed.

    Returns:
        The value of ``sklearn.metrics.auc`` applied to the (recall,
        precision) points of the curve.

    The caller's ``pred`` is never modified: NaNs are filled on a copy, and
    that copy is only made when at least one NaN is actually present.
    """
    if nafill:
        missing = np.isnan(pred)
        if missing.any():
            fill_value = np.nanmean(pred)
            # Work on a copy so the caller keeps its original NaN positions.
            pred = pred.copy()
            pred[missing] = fill_value
    prec, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, prec)

@jit
def fast_auc(y_true, y_prob):
    """Rank-based AUC of ``y_prob`` against ``y_true``.

    Labels are reordered by ascending score; every label then contributes
    ``label * (number of negatives seen so far)`` and the total is divided by
    ``negatives * (n - negatives)``.

    NOTE(review): the arithmetic presumes 0/1 labels; tied scores are ordered
    by ``np.argsort`` rather than averaged -- confirm this is acceptable.
    """
    labels = np.asarray(y_true)
    ranked = labels[np.argsort(y_prob)]
    total = len(ranked)
    negatives = 0
    score = 0
    for pos in range(total):
        label = ranked[pos]
        # A negative bumps the running count; a positive adds that count.
        negatives += (1 - label)
        score += label * negatives
    score /= (negatives * (total - negatives))
    return score

@njit
def numba_log_loss(y,x):
    """Mean negative log-likelihood of probabilities ``x`` for labels ``y``.

    Rows with ``y <= 0`` are scored on ``1 - x``, all other rows on ``x``.
    A constant ``1e-15`` is added inside the logarithm to avoid ``log(0)``.
    """
    count = x.shape[0]
    total = 0.
    for k in prange(count):
        # Probability assigned to the observed class of row k.
        prob = 1-x[k] if y[k]<=0. else x[k]
        total += np.log(prob + 1e-15)
    return -total / count

def compute_rce(gt , pred, nafill=True, verbose=0):
    """Return the relative cross-entropy (in percent) of ``pred`` against ``gt``.

    The score is ``(1 - CE / CE_strawman) * 100`` where ``CE`` is the log loss
    of ``pred`` (via ``numba_log_loss``) and ``CE_strawman`` is the entropy of
    a constant prediction equal to the positive rate ``mean(gt > 0)``.

    Args:
        gt: Ground-truth labels; values ``> 0`` count as positives.
        pred: Predicted probabilities; may contain NaN.
        nafill: When true, NaN predictions are replaced by the mean of the
            non-NaN predictions before scoring.
        verbose: When truthy, print the two cross-entropies, their ratio and
            the mean prediction next to the positive rate.

    Returns:
        The relative cross-entropy as a float. The strawman entropy, and hence
        the result, is NaN when ``gt`` contains a single class only.

    The caller's ``pred`` is never modified: NaNs are filled on a copy, and
    that copy is only made when at least one NaN is actually present.
    """
    if nafill:
        missing = np.isnan(pred)
        if missing.any():
            fill_value = np.nanmean(pred)
            # Work on a copy so the caller keeps its original NaN positions.
            pred = pred.copy()
            pred[missing] = fill_value

    cross_entropy = numba_log_loss( gt, pred  )

    # Baseline: always predict the observed positive rate.
    yt = np.mean(gt>0)
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))

    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(cross_entropy, strawman_cross_entropy, cross_entropy/strawman_cross_entropy))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , yt  ) )

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0


In [4]:
def save_memory( df ):
    """Downcast the columns of ``df`` in place to smaller signed/float dtypes.

    Conversions applied per column dtype:
        uint8 -> int8, bool -> int8, uint32 -> int32, int64 -> int32,
        float64 -> float32.

    Integer columns are converted only when every value fits in the target
    type; a column whose values would wrap around is left untouched instead
    of being silently corrupted. Columns of any other dtype are not modified.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        None.
    """
    targets = {
        'uint8': np.int8,
        'bool': np.int8,
        'uint32': np.int32,
        'int64': np.int32,
        'float64': np.float32,
    }
    converted = False
    # Iterate (name, dtype) pairs rather than indexing df.dtypes by position,
    # which is deprecated for a label-indexed Series.
    for name, dtype in df.dtypes.items():
        target = targets.get(str(dtype))
        if target is None:
            continue
        column = df[name]
        if getattr(dtype, 'kind', '') in 'iu' and column.size:
            limits = np.iinfo(target)
            values = column.to_numpy()
            # Compare as Python ints so the bounds check itself cannot overflow.
            if int(values.min()) < int(limits.min) or int(values.max()) > int(limits.max):
                continue
        df[name] = column.astype(target)
        converted = True
    if converted:
        # One collection at the end is enough to reclaim the replaced columns.
        gc.collect()

# Load train, validation and test data

In [5]:
# Read the three parquet splits in the same order as before: train, valid, test.
train, test0, test1 = (
    pd.read_parquet( path )
    for path in ( 'train.parquet', 'valid.parquet', 'test.parquet' )
)
gc.collect()

# 'tr' tags the origin of each row: 0 for train.parquet, 1 for valid/test.
train['tr'] = 0
test0['tr'] = test1['tr'] = 1

train.shape, test0.shape, test1.shape

((106254462, 28), (9760684, 28), (9765321, 28))

In [6]:
# Stack the three splits into a single frame; the 'tr' column still tells them apart.
train = pd.concat( [train, test0, test1], sort=False )
gc.collect()
# The per-split frames are redundant once concatenated.
del test0
del test1
gc.collect()

train.shape

(125780467, 28)

In [7]:
%%time
# Put all rows (every split) in chronological order and rebuild a 0..N-1 index.
train = train.sort_values('timestamp').reset_index(drop=True) #TIME ORDER
gc.collect()

CPU times: user 1min 30s, sys: 24.6 s, total: 1min 54s
Wall time: 1min 54s


0

In [8]:
# Collapse each engagement target to a binary indicator: any value > 0 becomes 1.
for target in ( 'reply', 'retweet', 'retweet_comment', 'like' ):
    train.loc[ train[target] > 0, target ] = 1

# 'engage' is 1 when at least one of the four targets is positive, else 0.
train['engage'] = 0
train.loc[
    (train['reply'] > 0) | (train['retweet'] > 0) | (train['retweet_comment'] > 0) | (train['like'] > 0),
    'engage'
] = 1
gc.collect()

0

In [9]:
# Preview the first rows of the combined, time-ordered frame.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,len_hashtags,len_domains,len_links,engage
0,0,19329,5,0,0,2,54,1580947200,16906,9794149,90,True,1321282652,18916826,163,111,False,1543074641,False,0,0,0,0,31145094,0,0,0,0,0
1,2839,3268367,9,425779,12,2,54,1580947200,233502,1661107,125,True,1257040330,19649316,134,165,False,1351006912,False,0,0,0,0,38859618,0,3,1,1,0
2,594,736879,5,109298,542,2,54,1580947200,5604,12029492,14,True,1239741288,15045821,100,229,False,1348592309,False,0,0,0,0,96650689,0,1,1,1,0
3,0,3403570,9,0,0,2,54,1580947200,10859,4398172,825,True,1236008126,15883237,5,130,False,1404447043,False,0,0,0,0,7549091,0,0,0,0,0
4,732011,8323311,5,948558,12,2,54,1580947200,52763,246867,559,True,1219588461,4466763,1233,747,False,1314638570,False,0,0,0,0,43687561,0,1,1,1,0


In [10]:
# Downcast column dtypes of train in place (see save_memory), then reclaim freed memory.
save_memory(train)
gc.collect()

0

In [11]:
# Inspect the column dtypes after the downcast.
train.dtypes

hashtags              int32
tweet_id              int32
media                  int8
links                 int32
domains               int32
tweet_type             int8
language               int8
timestamp             int32
a_user_id             int32
a_follower_count      int32
a_following_count     int32
a_is_verified          int8
a_account_creation    int32
b_user_id             int32
b_follower_count      int32
b_following_count     int32
b_is_verified          int8
b_account_creation    int32
b_follows_a            int8
reply                 int32
retweet               int32
retweet_comment       int32
like                  int32
id                    int32
tr                    int32
len_hashtags          int32
len_domains           int32
len_links             int32
engage                int32
dtype: object

In [12]:
# Long table of per-user count observations: one row per appearance of a user,
# whether as tweet author (a_*) or as engaging user (b_*).
dt0 = train[['tr','a_user_id','a_follower_count','a_following_count','timestamp','id']].copy()
dt1 = train[['tr','b_user_id','b_follower_count','b_following_count','timestamp','id']].copy()
# Relabel the b-side columns so both halves share the a-side schema.
dt1.columns = dt0.columns
# b-side rows get the sentinel id -1 so they never match a train row by id.
dt1['id'] = -1
dt = pd.concat( [dt0, dt1], sort=False )
del dt0, dt1
_ = gc.collect()
dt.head()

Unnamed: 0,tr,a_user_id,a_follower_count,a_following_count,timestamp,id
0,0,16906,9794149,90,1580947200,31145094
1,0,233502,1661107,125,1580947200,38859618
2,0,5604,12029492,14,1580947200,96650689
3,0,10859,4398172,825,1580947200,7549091
4,0,52763,246867,559,1580947200,43687561


In [13]:
%%time
# Follower-count stability flag, computed separately for tr=0 and tr=1.
# Latest / earliest timestamp at which each (user, follower_count) pair was observed.
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_follower_count'])['timestamp'].transform('min'); _=gc.collect()

# Length of the time window over which that follower_count value was seen.
dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

# Longest such window among all follower_count values of the same user.
dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

# 1 -> the row carries the follower_count with the longest window, 0 -> it does not.
dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
# -1 overrides either value when the window has zero length (value seen at a single timestamp).
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

# Drop the scratch columns and keep only the flag.
del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 5min 33s, sys: 42.5 s, total: 6min 15s
Wall time: 6min 15s


 1    167646303
 0     54714969
-1     29199662
Name: a_fc_dif_flag, dtype: int64

In [14]:
# Attach the follower-count window flag to train by row id.
# Only the key and the flag take part in the merge, and the sentinel rows
# (id == -1, the b-side observations) are dropped first: they can never match
# a train row and would only inflate the join. validate='many_to_one' raises
# if an id were duplicated on the dt side, which would otherwise multiply rows
# and misalign the assignment.
train['a_user_fer_count_delta_time'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_dif_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_dif_flag'] ; _=gc.collect()

del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fer_count_delta_time'].value_counts()

 1    71610061
 0    41612270
-1    12558136
Name: a_user_fer_count_delta_time, dtype: int64

In [15]:
# Mean of each binary target per (tr, flag value) group: the engagement rate for each flag value in each partition.
train.groupby(['tr','a_user_fer_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fer_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.020097,0.044176,0.004059,0.20647
0,0,0.000692,0.002269,0.000176,0.008234
0,1,0.039583,0.176012,0.011349,0.70707
1,-1,0.024092,0.057007,0.004971,0.25342
1,0,0.00063,0.002395,0.000178,0.008361
1,1,0.03951,0.165661,0.011116,0.701947


In [16]:
%%time
# Following-count stability flag, computed separately for tr=0 and tr=1.
# Latest / earliest timestamp at which each (user, following_count) pair was observed.
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_following_count'])['timestamp'].transform('min'); _=gc.collect()

# Length of the time window over which that following_count value was seen.
dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

# Longest such window among all following_count values of the same user.
dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

# 1 -> the row carries the following_count with the longest window, 0 -> it does not.
dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
# -1 overrides either value when the window has zero length (value seen at a single timestamp).
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

# Drop the scratch columns and keep only the flag.
del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 5min 31s, sys: 42.8 s, total: 6min 14s
Wall time: 6min 14s


 1    181626111
 0     40782460
-1     29152363
Name: a_fc_dif_flag, dtype: int64

In [17]:
# Attach the following-count window flag to train by row id.
# Only the key and the flag take part in the merge, and the sentinel rows
# (id == -1, the b-side observations) are dropped first: they can never match
# a train row and would only inflate the join. validate='many_to_one' raises
# if an id were duplicated on the dt side, which would otherwise multiply rows
# and misalign the assignment.
train['a_user_fing_count_delta_time'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_dif_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_dif_flag'] ; _=gc.collect()

del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fing_count_delta_time'].value_counts()

 1    88946997
 0    26692736
-1    10140734
Name: a_user_fing_count_delta_time, dtype: int64

In [18]:
# Mean of each binary target per (tr, flag value) group: the engagement rate for each flag value in each partition.
train.groupby(['tr','a_user_fing_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fing_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.024614,0.053076,0.004891,0.249945
0,0,0.001274,0.004174,0.000332,0.015661
0,1,0.032066,0.142614,0.009195,0.572812
1,-1,0.027763,0.064502,0.005666,0.288032
1,0,0.002241,0.008575,0.000624,0.033851
1,1,0.030612,0.128305,0.008612,0.543375


In [19]:
%%time
# Joint follower/following stability flag, computed separately for tr=0 and tr=1.
# Latest / earliest timestamp at which each (user, follower_count, following_count) triple was observed.
dt['a_fc_max'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp'].transform('max'); _=gc.collect()
dt['a_fc_min'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['timestamp'].transform('min'); _=gc.collect()

# Length of the time window over which that pair of counts was seen.
dt['a_fc_dif'] = dt['a_fc_max'] - dt['a_fc_min']; _=gc.collect()

# Longest such window among all count pairs of the same user.
dt['a_fc_dif_max'] = dt.groupby(['tr','a_user_id'])['a_fc_dif'].transform('max'); _=gc.collect()

# 1 -> the row carries the count pair with the longest window, 0 -> it does not.
dt['a_fc_dif_flag'] = 1* ((dt['a_fc_dif'] == dt['a_fc_dif_max']) )  ; _=gc.collect()
# -1 overrides either value when the window has zero length (pair seen at a single timestamp).
dt.loc[ dt.a_fc_dif==0 ,'a_fc_dif_flag'] = -1

# Drop the scratch columns and keep only the flag.
del dt['a_fc_max'],dt['a_fc_min'],dt['a_fc_dif'],dt['a_fc_dif_max'] ; _=gc.collect()
dt['a_fc_dif_flag'].value_counts()

CPU times: user 6min, sys: 47.7 s, total: 6min 48s
Wall time: 6min 48s


 1    154700820
 0     57042946
-1     39817168
Name: a_fc_dif_flag, dtype: int64

In [20]:
# Attach the joint follower/following window flag to train by row id.
# Only the key and the flag take part in the merge, and the sentinel rows
# (id == -1, the b-side observations) are dropped first: they can never match
# a train row and would only inflate the join. validate='many_to_one' raises
# if an id were duplicated on the dt side, which would otherwise multiply rows
# and misalign the assignment.
train['a_user_fering_count_delta_time'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_dif_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_dif_flag'] ; _=gc.collect()

del dt['a_fc_dif_flag'] ; _=gc.collect()

train['a_user_fering_count_delta_time'].value_counts()

 1    69864760
 0    41814045
-1    14101662
Name: a_user_fering_count_delta_time, dtype: int64

In [21]:
# Mean of each binary target per (tr, flag value) group: the engagement rate for each flag value in each partition.
train.groupby(['tr','a_user_fering_count_delta_time'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fering_count_delta_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.018203,0.040254,0.00369,0.18658
0,0,0.000466,0.001572,0.000114,0.005641
0,1,0.0406,0.180575,0.011646,0.725342
1,-1,0.023155,0.055146,0.004798,0.243641
1,0,0.00038,0.001397,0.000102,0.00497
1,1,0.040343,0.169392,0.011362,0.717774


In [22]:
%%time
# Following-count frequency flag, computed separately for tr=0 and tr=1.
# Number of dt rows (non-null id) sharing the same (user, following_count) pair.
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_following_count'])['id'].transform('count'); _=gc.collect()
# Highest such row count among all following_count values of the same user.
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
# 1 -> the row carries the user's most frequent following_count, 0 -> it does not.
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
# -1 overrides either value when the pair occurs at most once.
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

# Drop the scratch columns and keep only the flag.
del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

CPU times: user 3min 22s, sys: 25.5 s, total: 3min 47s
Wall time: 3min 47s


 1    188304368
 0     35241293
-1     28015273
Name: a_fc_count_flag, dtype: int64

In [23]:
# Attach the following-count frequency flag to train by row id.
# Only the key and the flag take part in the merge, and the sentinel rows
# (id == -1, the b-side observations) are dropped first: they can never match
# a train row and would only inflate the join. validate='many_to_one' raises
# if an id were duplicated on the dt side, which would otherwise multiply rows
# and misalign the assignment.
train['a_user_fing_count_mode'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_count_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_count_flag'] ; _=gc.collect()
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fing_count_mode'].value_counts()

 1    91660777
 0    25115653
-1     9004037
Name: a_user_fing_count_mode, dtype: int64

In [24]:
# Mean of each binary target per (tr, flag value) group: the engagement rate for each flag value in each partition.
train.groupby(['tr','a_user_fing_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fing_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.025084,0.051289,0.004879,0.228335
0,0,0.002847,0.009445,0.000895,0.039377
0,1,0.030968,0.137799,0.008833,0.55428
1,-1,0.02802,0.064513,0.005692,0.282475
1,0,0.002427,0.007586,0.000738,0.030976
1,1,0.029961,0.125501,0.008397,0.532633


In [25]:
%%time
# Follower-count frequency flag, computed separately for tr=0 and tr=1.
# Number of dt rows (non-null id) sharing the same (user, follower_count) pair.
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_follower_count'])['id'].transform('count'); _=gc.collect()
# Highest such row count among all follower_count values of the same user.
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
# 1 -> the row carries the user's most frequent follower_count, 0 -> it does not.
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
# -1 overrides either value when the pair occurs at most once.
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

# Drop the scratch columns and keep only the flag.
del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

CPU times: user 3min 26s, sys: 25.6 s, total: 3min 52s
Wall time: 3min 52s


 1    173582226
 0     51104798
-1     26873910
Name: a_fc_count_flag, dtype: int64

In [26]:
# Attach the follower-count frequency flag to train by row id.
# Only the key and the flag take part in the merge, and the sentinel rows
# (id == -1, the b-side observations) are dropped first: they can never match
# a train row and would only inflate the join. validate='many_to_one' raises
# if an id were duplicated on the dt side, which would otherwise multiply rows
# and misalign the assignment.
train['a_user_fer_count_mode'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_count_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_count_flag'] ; _=gc.collect()
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fer_count_mode'].value_counts()

 1    74100800
 0    41446934
-1    10232733
Name: a_user_fer_count_mode, dtype: int64

In [27]:
# Mean of each binary target per (tr, flag value) group: the engagement rate for each flag value in each partition.
train.groupby(['tr','a_user_fer_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fer_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.022713,0.047402,0.004496,0.209543
0,0,0.001158,0.004002,0.000341,0.016586
0,1,0.038391,0.170392,0.01097,0.685652
1,-1,0.025234,0.059094,0.005179,0.257678
1,0,0.001188,0.004073,0.000372,0.017244
1,1,0.037998,0.159076,0.010649,0.67504


In [28]:
%%time
# Joint follower/following frequency flag, computed separately for tr=0 and tr=1.
# Number of dt rows (non-null id) sharing the same (user, follower_count, following_count) triple.
dt['a_fc_count'] = dt.groupby(['tr','a_user_id','a_follower_count','a_following_count'])['id'].transform('count'); _=gc.collect()
# Highest such row count among all count pairs of the same user.
dt['a_fc_max']   = dt.groupby(['tr','a_user_id'])['a_fc_count'].transform('max'); _=gc.collect()
# 1 -> the row carries the user's most frequent count pair, 0 -> it does not.
dt['a_fc_count_flag'] = 1* ((dt['a_fc_count'] == dt['a_fc_max']))  ; _=gc.collect()
# -1 overrides either value when the triple occurs at most once.
dt.loc[ dt.a_fc_count<=1,'a_fc_count_flag'] = -1

# Drop the scratch columns and keep only the flag.
del dt['a_fc_count'], dt['a_fc_max']
dt['a_fc_count_flag'].value_counts()

CPU times: user 3min 37s, sys: 27.5 s, total: 4min 5s
Wall time: 4min 5s


 1    161621635
 0     52522505
-1     37416794
Name: a_fc_count_flag, dtype: int64

In [29]:
# Attach the joint follower/following frequency flag to train by row id.
# Only the key and the flag take part in the merge, and the sentinel rows
# (id == -1, the b-side observations) are dropped first: they can never match
# a train row and would only inflate the join. validate='many_to_one' raises
# if an id were duplicated on the dt side, which would otherwise multiply rows
# and misalign the assignment.
train['a_user_fering_count_mode'] = train[['id']].merge(
    dt.loc[ dt['id'] != -1, ['id','a_fc_count_flag'] ],
    on='id', how='left', validate='many_to_one'
)['a_fc_count_flag'] ; _=gc.collect()
del dt['a_fc_count_flag'] ; _=gc.collect()
train['a_user_fering_count_mode'].value_counts()

 1    72344320
 0    41734346
-1    11701801
Name: a_user_fering_count_mode, dtype: int64

In [30]:
# Mean of each binary target per (tr, flag value) group: the engagement rate for each flag value in each partition.
train.groupby(['tr','a_user_fering_count_mode'])[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
tr,a_user_fering_count_mode,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,-1,0.020088,0.042321,0.003996,0.185588
0,0,0.001062,0.00368,0.00031,0.01542
0,1,0.039271,0.174453,0.01123,0.701993
1,-1,0.024167,0.056996,0.004983,0.247005
1,0,0.001115,0.003833,0.000353,0.016352
1,1,0.038695,0.162213,0.010851,0.688758


In [31]:
# The long observation table is no longer needed once all six flags live on train.
del dt; _=gc.collect()

In [32]:
# Pack the six flags into one categorical code: each flag (-1/0/1) is shifted
# to 0/1/2 and used as one base-3 digit; factorize then relabels the distinct
# packed values with consecutive integers in order of first appearance.
train['a_count_combined'] = pd.factorize(
    sum(
        (1+train[flag]) * 3**digit
        for digit, flag in enumerate((
            'a_user_fer_count_delta_time',
            'a_user_fing_count_delta_time',
            'a_user_fering_count_delta_time',
            'a_user_fing_count_mode',
            'a_user_fer_count_mode',
            'a_user_fering_count_mode',
        ))
    )
)[0]
train['a_count_combined'].value_counts()

4      66679103
3      20360928
0      15238222
7       7894164
2       2000523
11      1634665
5       1141579
9       1055293
8       1053085
1        789305
33       723815
22       698429
38       644191
6        609055
18       514046
14       431281
35       418266
10       364163
16       324954
31       288635
26       268953
12       240838
21       217728
49       200249
29       155603
27       133913
25       104587
56        94672
19        94353
28        79062
48        77674
45        70953
43        68126
30        62100
15        61673
50        58368
59        53834
51        44606
74        42061
24        40606
32        40119
52        39756
37        37004
40        36760
23        35430
42        35414
47        33143
53        32531
87        27367
20        25683
77        23459
70        22566
69        18235
61        18049
90        16478
57        16125
41        15875
76        14562
46        13689
63        13529
79        13461
62        11544
65      

In [33]:
# Mean of each binary target per combined code, restricted to the tr==0 rows.
train.loc[ train.tr==0 ].groupby('a_count_combined')[['reply','retweet','retweet_comment','like']].agg(['mean'])

Unnamed: 0_level_0,reply,retweet,retweet_comment,like
Unnamed: 0_level_1,mean,mean,mean,mean
a_count_combined,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.000127,0.000461,3.4e-05,0.001665
1,0.000363,0.00117,7.7e-05,0.005518
2,4.6e-05,0.00018,1.5e-05,0.000664
3,0.000116,0.000357,2.7e-05,0.001301
4,0.04077,0.183205,0.011665,0.735364
5,0.003396,0.013687,0.000915,0.049737
6,0.029276,0.109068,0.009193,0.473737
7,0.028324,0.057317,0.005479,0.257351
8,0.037854,0.127268,0.012772,0.542895
9,0.004761,0.015482,0.001237,0.056036


In [34]:
# Preview the first rows again, now including the engineered flag columns.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,len_hashtags,len_domains,len_links,engage,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,a_count_combined
0,0,19329,5,0,0,2,54,1580947200,16906,9794149,90,1,1321282652,18916826,163,111,0,1543074641,0,0,0,0,0,31145094,0,0,0,0,0,0,1,0,1,0,0,0
1,2839,3268367,9,425779,12,2,54,1580947200,233502,1661107,125,1,1257040330,19649316,134,165,0,1351006912,0,0,0,0,0,38859618,0,3,1,1,0,-1,1,-1,1,0,0,1
2,594,736879,5,109298,542,2,54,1580947200,5604,12029492,14,1,1239741288,15045821,100,229,0,1348592309,0,0,0,0,0,96650689,0,1,1,1,0,0,1,0,1,0,0,0
3,0,3403570,9,0,0,2,54,1580947200,10859,4398172,825,1,1236008126,15883237,5,130,0,1404447043,0,0,0,0,0,7549091,0,0,0,0,0,0,0,0,1,0,0,2
4,732011,8323311,5,948558,12,2,54,1580947200,52763,246867,559,1,1219588461,4466763,1233,747,0,1314638570,0,0,0,0,0,43687561,0,1,1,1,0,0,1,0,1,0,0,0


In [35]:
# Export the engineered features keyed by row id: select them, order by id
# with a fresh 0..N-1 index, downcast, and write to parquet.
dt = (
    train[[
        'id',
        'a_count_combined',
        'a_user_fer_count_delta_time',
        'a_user_fing_count_delta_time',
        'a_user_fering_count_delta_time',
        'a_user_fing_count_mode',
        'a_user_fer_count_mode',
        'a_user_fering_count_mode',
    ]]
    .sort_values( 'id' )
    .reset_index( drop=True )
)
save_memory( dt )
dt.to_parquet( 'a_count_combined-final.parquet' )
dt.tail(5)

Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
125780462,125780462,4,1,1,1,1,1,1
125780463,125780463,4,1,1,1,1,1,1
125780464,125780464,7,-1,-1,-1,-1,-1,-1
125780465,125780465,4,1,1,1,1,1,1
125780466,125780466,4,1,1,1,1,1,1
