Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [28]:
# Expose only GPU 0 to CUDA. This is set here, before cudf/cupy are imported in the next cell.
import os, time
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Wall-clock reference taken at start-up; the visible cells report elapsed time from startNB instead.
start = time.time()

In [29]:
# Host-side data stack; gc is used for the explicit collections sprinkled through the notebook.
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Allow up to 500 columns / rows to be rendered before pandas truncates the display.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# GPU libraries used by MultiTE_gpu.
import cudf, cupy, time
# Bare expression in the middle of the cell: evaluated, but not displayed (only a cell's last expression is).
cudf.__version__

# Reference time for the elapsed-time report printed in the last cell.
startNB = time.time()

In [30]:
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(gt, pred, nafill=True):
    """Area under the precision-recall curve.

    Parameters
    ----------
    gt : array-like
        Ground-truth labels, forwarded unchanged to ``precision_recall_curve``.
    pred : array-like
        Predicted scores. The caller's object is never modified: when
        ``nafill`` is true the NaN replacement happens on a private copy.
    nafill : bool, default True
        Replace NaN scores with the mean of the non-NaN scores before scoring.

    Returns
    -------
    float
        ``auc(recall, precision)`` computed from the curve points.
    """
    if nafill:
        # np.array() copies (and keeps the dtype), so the fill below cannot leak
        # back into the array the caller passed in.
        pred = np.array(pred)
        pred[np.isnan(pred)] = np.nanmean(pred)
    prec, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, prec)

@jit
def fast_auc(y_true, y_prob):
    """ROC AUC computed in one pass over the labels ordered by ascending score.

    Walking the labels in score order, every label adds ``1 - label`` to a
    running count and ``label * running_count`` to the accumulator; the
    accumulator is finally divided by ``count * (n - count)``.

    Labels are used arithmetically as given (assumes 0/1 values - not
    validated here). Equal scores are ordered by ``np.argsort`` with no
    special tie handling.
    """
    labels_by_score = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(labels_by_score)
    negatives = 0
    ordered_pairs = 0
    for label in labels_by_score:
        negatives += (1 - label)
        ordered_pairs += label * negatives
    ordered_pairs /= (negatives * (total - negatives))
    return ordered_pairs

@njit
def numba_log_loss(y,x):
    """Mean negative log-likelihood of the scores ``x`` against the labels ``y``.

    A row with ``y[i] <= 0`` contributes ``log(1 - x[i] + 1e-15)``, any other
    row contributes ``log(x[i] + 1e-15)``; the 1e-15 offset keeps the log
    finite at exactly 0 or 1. Returns the negated sum divided by
    ``x.shape[0]``.
    """
    count = x.shape[0]
    total = 0.
    for k in prange(count):
        if y[k]<=0.:
            term = np.log(1-x[k] + 1e-15 )
        else:
            term = np.log(x[k] + 1e-15)
        total += term
    return -total / count

def compute_rce(gt , pred, nafill=True, verbose=0):
    """Relative cross entropy of ``pred`` against a constant-rate baseline, in percent.

    The score is ``(1 - CE / CE_baseline) * 100`` where ``CE`` is
    ``numba_log_loss(gt, pred)`` and ``CE_baseline`` is the entropy of the
    positive rate ``mean(gt > 0)``.

    Parameters
    ----------
    gt : numpy array
        Ground-truth labels; values ``> 0`` count as positive.
    pred : array-like
        Predicted probabilities. The caller's object is never modified: when
        ``nafill`` is true the NaN replacement happens on a private copy.
    nafill : bool, default True
        Replace NaN predictions with the mean of the non-NaN predictions.
    verbose : int, default 0
        When truthy, print the two cross entropies, their ratio, and the mean
        prediction next to the positive rate.

    Returns
    -------
    float
        The relative cross entropy. If ``gt`` holds a single class the positive
        rate is 0 or 1, the baseline entropy evaluates to NaN, and so does the
        result.
    """
    if nafill:
        # Fill on a copy (dtype preserved) so the caller's array is not overwritten.
        pred = np.array(pred)
        pred[np.isnan(pred)] = np.nanmean(pred)

    cross_entropy = numba_log_loss( gt, pred  )

    yt = np.mean(gt>0)
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))

    if verbose:
        print( "logloss: {0:.5f} / {1:.5f} = {2:.5f}".format(cross_entropy, strawman_cross_entropy, cross_entropy/strawman_cross_entropy))
        print( 'mean:    {0:.5f} / {1:.5f}'.format( np.nanmean( pred ) , yt  ) )

    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [31]:
def save_memory( df ):
    """Downcast the columns of ``df`` in place to narrower signed/float dtypes.

    Conversion by source dtype:

    * ``uint8``   -> ``int8``   (``int16`` if a value is outside the int8 range)
    * ``bool``    -> ``int8``
    * ``uint32``  -> ``int32``  (``int64`` if a value is outside the int32 range)
    * ``int64``   -> ``int32``  (left as ``int64`` if a value does not fit)
    * ``float64`` -> ``float32``

    Columns of any other dtype are left untouched. Integer columns are range
    checked before the cast, so out-of-range values are never wrapped around.
    ``gc.collect()`` runs after every converted column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.

    Returns
    -------
    None
    """
    # source dtype name -> (preferred target, wider lossless fallback or None = keep column as is)
    plan = {
        'uint8':   (np.int8,    np.int16),
        'bool':    (np.int8,    None),
        'uint32':  (np.int32,   np.int64),
        'int64':   (np.int32,   None),
        'float64': (np.float32, None),
    }
    # Iterate (label, dtype) pairs rather than df.dtypes[i]: integer keys on a
    # label-indexed Series are no longer treated as positions by recent pandas.
    for name, dtype in df.dtypes.items():
        rule = plan.get(str(dtype))
        if rule is None:
            continue
        target, fallback = rule
        column = df[name]
        if np.issubdtype(dtype, np.integer):
            values = column.values
            if values.size:
                limits = np.iinfo(target)
                # Compare as Python ints so the check itself cannot overflow.
                lowest, highest = int(values.min()), int(values.max())
                if lowest < limits.min or highest > limits.max:
                    if fallback is None:
                        continue
                    target = fallback
        df[name] = column.astype( target )
        gc.collect()
    

# Load Train

In [32]:
%%time
# Read the three step-2 preprocessing outputs into host (pandas) memory.
train = pd.read_parquet( '../../01_Preprocess/GPU/dask_input/step2_output/train.parquet' )
test0 = pd.read_parquet( '../../01_Preprocess/GPU/dask_input/step2_output/valid.parquet' )
test1 = pd.read_parquet( '../../01_Preprocess/GPU/dask_input/step2_output/test.parquet' )
gc.collect()

# Tag every row with the file it came from: 0 = train.parquet, 1 = valid.parquet, 2 = test.parquet.
train['tr'] = 0
test0['tr'] = 1
test1['tr'] = 2

# Displayed as the cell result: (rows, columns) of each frame.
train.shape, test0.shape, test1.shape

CPU times: user 3.52 s, sys: 3.2 s, total: 6.72 s
Wall time: 762 ms


((21043825, 28), (0, 28), (0, 28))

In [33]:
# Stack the three frames into one; the 'tr' column still identifies each row's source file.
train = pd.concat( (train,test0,test1), sort=False )
gc.collect()
# The per-file frames are no longer needed once concatenated.
del test0, test1
gc.collect()
train.shape

(21043825, 28)

In [34]:
%%time
# Order rows by 'id' and rebuild a 0..n-1 index; later cells copy feature columns in by index.
train = train.sort_values('id').reset_index(drop=True) 
gc.collect()

CPU times: user 12.2 s, sys: 2.11 s, total: 14.3 s
Wall time: 14.3 s


20

In [35]:
# Turn each engagement column into a flag: any value above zero becomes 1,
# everything else is left as it is.
for _target in ('reply', 'retweet', 'retweet_comment', 'like'):
    train.loc[ train[_target] > 0, _target ] = 1
gc.collect()

20

In [36]:
# Preview the first rows after the targets were binarised.
train.head()

Unnamed: 0,timestamp,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,language,tweet_type,media,tweet_id,a_user_id,b_user_id,len_hashtags,len_domains,len_links,domains,links,hashtags,tr
0,1581258144,3517,590,False,1391702737,27428,600,False,1520948869,False,0,0,0,0,1,11,2,12,6454562,641962,1090376,0,0,0,0,0,0,0
1,1581018910,39786,32978,False,1322740272,17,77,False,1569692352,True,0,0,0,0,2,47,2,4,3914171,3102992,8107365,0,0,0,0,0,0,0
2,1581244866,141726,3232,False,1343181073,2,29,False,1568107028,False,0,0,0,1,3,11,2,12,12623118,6662439,9461066,0,1,1,84668,713126,0,0
3,1581250124,3044,99,False,1350921963,3,30,False,1568107028,False,0,0,0,0,4,11,2,12,10154963,2634886,9461066,2,1,1,119320,1138355,957923,0
4,1580958805,141370,3235,False,1343181073,3,30,False,1568107028,False,0,0,0,0,5,11,2,12,5388518,6662439,9461066,0,0,0,0,0,0,0


In [37]:
# Downcast the column dtypes of train in place (see save_memory), then collect garbage.
save_memory(train)
gc.collect()

0

In [38]:
# Check the dtypes that resulted from save_memory.
train.dtypes

timestamp             int32
a_follower_count      int32
a_following_count     int32
a_is_verified          int8
a_account_creation    int32
b_follower_count      int32
b_following_count     int32
b_is_verified          int8
b_account_creation    int32
b_follows_a            int8
reply                 int32
retweet               int32
retweet_comment       int32
like                  int32
id                    int32
language               int8
tweet_type             int8
media                  int8
tweet_id              int32
a_user_id             int32
b_user_id             int32
len_hashtags          int32
len_domains           int32
len_links             int32
domains               int32
links                 int32
hashtags              int32
tr                    int32
dtype: object

In [39]:
# Decode the epoch-second timestamps once and derive all three calendar features
# from that single result (previously the whole column was decoded three times).
# Each feature is stored as int8: day of month, day of week, hour of day.
_ts = pd.to_datetime( train['timestamp'] , unit='s' )
train['dt_day']  = _ts.dt.day.values.astype( np.int8 )
train['dt_dow']  = _ts.dt.dayofweek.values.astype( np.int8 )
train['dt_hour'] = _ts.dt.hour.values.astype( np.int8 )
del _ts
_=gc.collect()
# Row count per day of month; the target-encoding folds are defined on dt_day.
train.groupby('dt_day')['id'].agg('count')

dt_day
6      185665
7      197201
8      204395
9      196781
10     199532
11     194268
12     180104
13    2817994
14    2704192
15    2911887
16    2829524
17    3040898
18    2822688
19    2558696
Name: id, dtype: int64

In [40]:
# Precomputed author-side count features, with an 'id' column.
# NOTE(review): the next cell copies these columns into train by index alignment, not by 'id' -
# this assumes the file's row index lines up with train's (sorted by id, 0..n-1); confirm.
dt = pd.read_parquet( '../../preprocessings/a_count_combined-final.parquet' )
dt.head()

Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,1,0,0,0,0,0,0,0
1,2,5,0,1,0,0,0,0
2,3,1,-1,-1,-1,-1,-1,-1
3,4,1,-1,-1,-1,-1,-1,-1
4,5,1,-1,-1,-1,-1,-1,-1


In [41]:
# Author-side count features taken over from dt (assignment aligns on the row index).
_author_feats = [
    'a_count_combined',
    'a_user_fer_count_delta_time',
    'a_user_fing_count_delta_time',
    'a_user_fering_count_delta_time',
    'a_user_fing_count_mode',
    'a_user_fer_count_mode',
    'a_user_fering_count_mode',
]
for _feat in _author_feats:
    train[_feat] = dt[_feat]

# Rows from day 12 and day 18 get the constant -9 in every one of these columns.
train.loc[ train.dt_day.isin([12, 18]), _author_feats ] = -9

del dt; _=gc.collect()
train.head()

Unnamed: 0,timestamp,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,language,tweet_type,media,tweet_id,a_user_id,b_user_id,len_hashtags,len_domains,len_links,domains,links,hashtags,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,1581258144,3517,590,0,1391702737,27428,600,0,1520948869,0,0,0,0,0,1,11,2,12,6454562,641962,1090376,0,0,0,0,0,0,0,9,6,14,0,0,0,0,0,0,0
1,1581018910,39786,32978,0,1322740272,17,77,0,1569692352,1,0,0,0,0,2,47,2,4,3914171,3102992,8107365,0,0,0,0,0,0,0,6,3,19,5,0,1,0,0,0,0
2,1581244866,141726,3232,0,1343181073,2,29,0,1568107028,0,0,0,0,1,3,11,2,12,12623118,6662439,9461066,0,1,1,84668,713126,0,0,9,6,10,1,-1,-1,-1,-1,-1,-1
3,1581250124,3044,99,0,1350921963,3,30,0,1568107028,0,0,0,0,0,4,11,2,12,10154963,2634886,9461066,2,1,1,119320,1138355,957923,0,9,6,12,1,-1,-1,-1,-1,-1,-1
4,1580958805,141370,3235,0,1343181073,3,30,0,1568107028,0,0,0,0,0,5,11,2,12,5388518,6662439,9461066,0,0,0,0,0,0,0,6,3,3,1,-1,-1,-1,-1,-1,-1


In [42]:
# Check the dtypes again now that the author-side columns were added.
train.dtypes

timestamp                         int32
a_follower_count                  int32
a_following_count                 int32
a_is_verified                      int8
a_account_creation                int32
b_follower_count                  int32
b_following_count                 int32
b_is_verified                      int8
b_account_creation                int32
b_follows_a                        int8
reply                             int32
retweet                           int32
retweet_comment                   int32
like                              int32
id                                int32
language                           int8
tweet_type                         int8
media                              int8
tweet_id                          int32
a_user_id                         int32
b_user_id                         int32
len_hashtags                      int32
len_domains                       int32
len_links                         int32
domains                           int32


In [43]:
# Text-derived features, with an 'id' column.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, and the other feature
# file is read from '../../preprocessings/' - confirm which directory is correct.
dt = pd.read_parquet( '../preprocessings/text-processings-1.parquet' )
# Sort by id AND rebuild the index. The following cell copies columns with train[c] = dt[c],
# which aligns on the index; sorting alone leaves the old index attached to every row and
# therefore has no effect on where the values land. train itself is sorted by id with a
# fresh 0..n-1 index, so after this step row k of dt and row k of train are both the
# k-th smallest id.
dt = dt.sort_values('id').reset_index(drop=True)
dt.head()

FileNotFoundError: ../preprocessings/text-processings-1.parquet

In [17]:
# Text-derived columns taken over from dt one by one (assignment aligns on the row index).
_text_feats = [
    'count_ats',
    'count_char',
    'count_words',
    'tw_hash',
    'tw_freq_hash',
    'tw_first_word',
    'tw_second_word',
    'tw_last_word',
    'tw_llast_word',
    'tw_len',
    'tw_hash0',
    'tw_hash1',
    'tw_rt_uhash',
]
for _feat in _text_feats:
    train[_feat] = dt[_feat]
del dt
gc.collect()

0

In [18]:
# Inspect the last rows to confirm the text-derived columns were attached.
train.tail()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
146255999,0,73443134,0,0,0,1,59,1582081341,1362744,5390,833,0,1546658592,30848700,56,267,0,1428007228,0,0,0,0,0,146255999,2,19,2,3,2,1,1,1,1,1,1,0,76,15,39170347,34858164,28651,210319,2793,27536,6,0,0,7739
146256000,0,73443135,0,0,0,1,54,1582091134,179475,17747,1467,0,1172558525,4950585,1837,129,0,1272381192,0,0,0,0,0,146256000,2,19,2,5,0,0,1,0,1,0,0,0,159,29,38847520,1535005,300662,267542,11,867,13,0,0,10748
146256001,0,73443136,9,0,0,1,4,1582086464,366281,4386,80,0,1457176980,30312854,8,57,0,1235182992,0,0,0,0,0,146256001,2,19,2,4,2,1,1,1,1,1,1,0,72,10,49748666,44369960,7361311,466019,19261,202536,5,0,0,58462
146256002,845560,73443137,0,0,0,2,11,1581665518,1030299,4236,4119,0,1524226898,3141261,717,464,0,1501554925,1,0,0,0,0,146256002,2,14,4,7,2,1,1,1,1,1,1,0,161,42,50323632,44880036,7551233,985413,112,2712,14,0,0,0
146256003,6845,73443138,7,0,0,2,11,1581799075,6937005,4354,3629,0,1269050894,3141261,717,464,0,1501554925,1,0,0,0,0,146256003,2,15,5,20,2,1,1,1,1,1,1,0,216,71,50323633,44880037,2674107,2769099,646,2378,26,0,0,0


In [19]:
# Row count per source file ('tr': 0 = train, 1 = valid, 2 = test).
train.groupby('tr')['id'].agg('count')

tr
0    121386431
1     12434735
2     12434838
Name: id, dtype: int64

In [20]:
def _te_fit_apply( gf, col, tar, px, py, L, smooth_method, out ):
    """Fit smoothed per-group target statistics on rows ``px`` of ``gf`` and write them into ``out[py]``."""
    if len(py) == 0:
        # Nothing to encode for this split; skip the GPU work entirely.
        return
    if smooth_method==0:
        # Mean of the target over the fitting rows, used as the prior for smoothing.
        mn = np.float32( gf[tar].iloc[px].mean() )
        te = gf.iloc[px].groupby(col)[tar].agg(['mean','count'])
        te['smooth']  = (te['mean']*te['count'])
        te['smooth'] += (mn*L)
        te['smooth'] /= (te['count']+L)
    else:
        te = gf.iloc[px].groupby(col)[tar].agg(['sum','count'])
        te['smooth'] = (te['sum']+L) / (te['count']+1)
    # Keep only the encoded value; selecting the column does not depend on the
    # default axis of DataFrame.drop.
    te = te[['smooth']]

    gf2 = gf.iloc[py].copy()
    gf2 = gf2.set_index( col )
    # Remember the incoming row order: the join is not relied on to preserve it.
    gf2['id'] = cupy.arange( gf2.shape[0] )
    gf2 = gf2.join( te, how='left' )
    gf2 = gf2.sort_values( 'id' )
    # Groups absent from the fitting rows come back null; mark them with the
    # -999 sentinel that the caller turns into NaN.
    out[py] = gf2.smooth.fillna(-999).to_array()


def MultiTE_gpu( tra, col, tar, L=1, smooth_method=0  ):
    """Time-split target encoding of ``tar`` grouped by ``col``, computed with cudf.

    Rows are split on ``tra.dt_day``; each split is encoded with statistics
    fitted on a different set of rows:

    * day ``d`` in 7..12   <- rows with ``dt_day < d``
    * days ``>= 13``       <- rows with ``dt_day < 13``
    * day 6                <- rows with ``7 <= dt_day <= 11``

    Rows whose ``dt_day`` matches none of these splits keep the initial 0.0.

    Parameters
    ----------
    tra : pandas.DataFrame
        Must contain ``dt_day``, every column in ``col`` and the column ``tar``.
    col : list of str
        Grouping columns (a list: it is concatenated with ``[tar]``).
    tar : str
        Target column.
    L : int, default 1
        Smoothing constant (also passed to ``np.random.seed``, see below).
    smooth_method : {0, 1}, default 0
        0: ``(mean*count + prior*L) / (count + L)`` with ``prior`` the target
        mean over the fitting rows.
        1: ``(sum + L) / (count + 1)``.

    Returns
    -------
    numpy.ndarray of float32, one value per row of ``tra``
        NaN where the row's group does not occur in the fitting rows. Because
        -999 is used internally as the missing marker, any encoded value
        ``<= -999`` is reported as NaN as well.

    Raises
    ------
    ValueError
        If ``smooth_method`` is neither 0 nor 1.
    """
    if smooth_method not in (0, 1):
        raise ValueError( 'smooth_method must be 0 or 1, got %r' % (smooth_method,) )

    # Kept from the original implementation: reseeds NumPy's global RNG.
    # Nothing in this function draws random numbers.
    np.random.seed(L)

    cols = col+[tar]
    gf = cudf.from_pandas(tra[cols])

    predtrain = np.zeros( tra.shape[0] )
    day = tra.dt_day

    # (rows to fit on, rows to encode) expressed as predicates on dt_day. They are
    # evaluated one split at a time so only one pair of masks is alive at once.
    schedule = [
        ( (lambda d, f=fold: d < f), (lambda d, f=fold: d == f) )
        for fold in [7,8,9,10,11,12]
    ]
    schedule.append( ( (lambda d: d < 13), (lambda d: d >= 13) ) )
    schedule.append( ( (lambda d: (d >= 7)&(d <= 11)), (lambda d: d == 6) ) )

    for fit_rows, apply_rows in schedule:
        px = np.where( fit_rows(day) )[0]
        py = np.where( apply_rows(day) )[0]
        _te_fit_apply( gf, col, tar, px, py, L, smooth_method, predtrain )

    gc.collect()
    predtrain[predtrain <= -999 ] = np.nan
    return predtrain.astype(np.float32)

In [21]:
# Preview the frame before the target-encoding columns are added.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,0,0,0,2,11,1581262691,0,986,1201,0,1274269909,6237935,94,648,0,1478011810,0,0,0,0,0,0,0,9,6,15,18,-1,0,-1,0,0,0,0,68,17,0,0,0,0,0,0,7,0,0,0
1,3,1,0,0,0,1,11,1581497241,1,1225,677,0,1255778244,879586,1139,46,0,1540395738,1,0,1,0,1,1,0,12,2,8,-9,-9,-9,-9,-9,-9,-9,0,182,37,1,1,1,1,1,1,12,0,0,661291
2,0,2,0,1,1,2,11,1580978528,2,3016,1623,0,1313450503,647103,780,440,0,1432084055,1,0,0,0,1,2,0,6,3,8,2,1,1,1,1,1,1,0,105,24,2,2,2,2,2,2,6,0,0,0
3,0,3,0,0,0,1,54,1581321849,3,2121,16,0,1547717153,13774339,1,45,0,1534313747,0,0,0,0,1,3,0,10,0,8,2,1,1,1,1,1,1,2,103,22,3,3,3,3,3,3,10,198539,2048,616
4,0,4,5,0,0,2,11,1580956787,4,813505,200,1,1476348838,13774340,171,388,0,1490166885,0,0,0,0,1,4,0,6,3,2,2,1,1,1,1,1,1,0,237,63,4,4,4,4,4,4,16,0,0,0


In [22]:
%%time
# Column groups to target-encode: each entry yields one TE_<columns>_<target> feature.
_te_groups = [
    # text / user identifiers combined with tweet type and language
    ['b_user_id','tweet_type','language'],
    ['tw_first_word','tweet_type','language'],
    ['tw_last_word','tweet_type','language'],
    ['tw_hash0','tweet_type','language'],
    ['tw_hash1','tweet_type','language'],
    ['tw_rt_uhash','tweet_type','language'],
    # single-column encodings
    ['a_user_id'],
    ['b_user_id'],
    ['tw_hash'],
    ['tw_freq_hash'],
    # tweet / account attributes and the author-side count features
    ['media','tweet_type','language','a_is_verified','b_is_verified','b_follows_a'],
    ['a_count_combined','tweet_type','language'],
    ['a_user_fer_count_delta_time','media','language'],
    ['a_user_fing_count_delta_time','media','language'],
    ['a_user_fering_count_delta_time','tweet_type','language'],
    ['a_user_fing_count_mode','media','language'],
    ['a_user_fer_count_mode','media','language'],
    ['a_user_fering_count_mode','tweet_type','language'],
    # content identifiers
    ['domains','media','tweet_type','language'],
    ['links','media','tweet_type','language'],
    ['hashtags','media','tweet_type','language'],
]
for t in ['like']:
    for c in _te_groups:
        fname = 'TE_' + '_'.join(c + [t])
        print( fname )
        # L=20, smooth_method=0
        train[fname] = MultiTE_gpu( train, c, t, 20, 0 )

TE_b_user_id_tweet_type_language_like
TE_tw_first_word_tweet_type_language_like
TE_tw_last_word_tweet_type_language_like
TE_tw_hash0_tweet_type_language_like
TE_tw_hash1_tweet_type_language_like
TE_tw_rt_uhash_tweet_type_language_like
TE_a_user_id_like
TE_b_user_id_like
TE_tw_hash_like
TE_tw_freq_hash_like
TE_media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a_like
TE_a_count_combined_tweet_type_language_like
TE_a_user_fer_count_delta_time_media_language_like
TE_a_user_fing_count_delta_time_media_language_like
TE_a_user_fering_count_delta_time_tweet_type_language_like
TE_a_user_fing_count_mode_media_language_like
TE_a_user_fer_count_mode_media_language_like
TE_a_user_fering_count_mode_tweet_type_language_like
TE_domains_media_tweet_type_language_like
TE_links_media_tweet_type_language_like
TE_hashtags_media_tweet_type_language_like
CPU times: user 9min 20s, sys: 1min 55s, total: 11min 15s
Wall time: 11min 15s


In [23]:
# Preview the frame with the new TE_* columns; NaN marks groups unseen in the fitting rows.
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash,TE_b_user_id_tweet_type_language_like,TE_tw_first_word_tweet_type_language_like,TE_tw_last_word_tweet_type_language_like,TE_tw_hash0_tweet_type_language_like,TE_tw_hash1_tweet_type_language_like,TE_tw_rt_uhash_tweet_type_language_like,TE_a_user_id_like,TE_b_user_id_like,TE_tw_hash_like,TE_tw_freq_hash_like,TE_media_tweet_type_language_a_is_verified_b_is_verified_b_follows_a_like,TE_a_count_combined_tweet_type_language_like,TE_a_user_fer_count_delta_time_media_language_like,TE_a_user_fing_count_delta_time_media_language_like,TE_a_user_fering_count_delta_time_tweet_type_language_like,TE_a_user_fing_count_mode_media_language_like,TE_a_user_fer_count_mode_media_language_like,TE_a_user_fering_count_mode_tweet_type_language_like,TE_domains_media_tweet_type_language_like,TE_links_media_tweet_type_language_like,TE_hashtags_media_tweet_type_language_like
0,0,0,0,0,0,2,11,1581262691,0,986,1201,0,1274269909,6237935,94,648,0,1478011810,0,0,0,0,0,0,0,9,6,15,18,-1,0,-1,0,0,0,0,68,17,0,0,0,0,0,0,7,0,0,0,0.426688,0.426688,0.542687,0.555838,0.549265,0.555847,0.476541,0.426688,,,0.409217,0.00051,0.151209,0.009113,0.226961,0.025708,0.013805,0.018372,0.489716,0.489716,0.466438
1,3,1,0,0,0,1,11,1581497241,1,1225,677,0,1255778244,879586,1139,46,0,1540395738,1,0,1,0,1,1,0,12,2,8,-9,-9,-9,-9,-9,-9,-9,0,182,37,1,1,1,1,1,1,12,0,0,661291,,,0.296454,0.313692,0.306074,0.422544,0.613762,0.458258,,0.401419,0.265254,,,,,,,,0.255281,0.255281,0.380288
2,0,2,0,1,1,2,11,1580978528,2,3016,1623,0,1313450503,647103,780,440,0,1432084055,1,0,0,0,1,2,0,6,3,8,2,1,1,1,1,1,1,0,105,24,2,2,2,2,2,2,6,0,0,0,0.494715,0.518982,0.312858,0.547036,0.540292,0.547047,0.524431,0.54735,,,0.51053,0.881142,0.761485,0.574593,0.876912,0.547895,0.724042,0.850262,0.514926,,0.466379
3,0,3,0,0,0,1,54,1581321849,3,2121,16,0,1547717153,13774339,1,45,0,1534313747,0,0,0,0,1,3,0,10,0,8,2,1,1,1,1,1,1,2,103,22,3,3,3,3,3,3,10,198539,2048,616,,0.458217,0.338718,0.454735,0.403551,0.359587,0.441246,0.424459,0.454735,0.454735,0.31687,0.772099,0.808807,0.644922,0.765065,0.605339,0.765459,0.697559,0.309635,0.309635,0.313281
4,0,4,5,0,0,2,11,1580956787,4,813505,200,1,1476348838,13774340,171,388,0,1490166885,0,0,0,0,1,4,0,6,3,2,2,1,1,1,1,1,1,0,237,63,4,4,4,4,4,4,16,0,0,0,0.403325,,0.588863,0.547036,0.540292,0.547047,0.561589,0.341275,,,0.407744,0.881142,0.872921,0.720243,0.876912,0.702097,0.852511,0.850262,0.69748,0.69748,0.670244


In [24]:
# Write the full frame (input columns plus the TE_* features) to parquet.
train.to_parquet( './data/train-final-te-like-1.parquet' )
gc.collect()

0

In [25]:
# Minutes elapsed since startNB was taken in the import cell.
print('Elapsed Time is %f minutes'%((time.time()-startNB)/60))

Elapsed Time is 17.765332 minutes
