Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import os, time

In [2]:
import pandas as pd, numpy as np, gc
from datetime import datetime
import joblib

import matplotlib.pyplot as plt
# Raise both pandas display limits to 500 so wide and long frames render
# without being elided in the cell outputs.
for display_option in ( 'display.max_columns', 'display.max_rows' ):
    pd.set_option( display_option, 500 )

In [3]:
def save_memory( df ):
    """Downcast the columns of ``df`` in place to narrower dtypes.

    Conversions applied, chosen by each column's current dtype:

    * ``uint8``   -> ``int8``
    * ``bool``    -> ``int8``
    * ``uint32``  -> ``int32``
    * ``int64``   -> ``int32``
    * ``float64`` -> ``float32``

    Columns of any other dtype are left untouched. An integer column is only
    narrowed when every value fits the target type; otherwise it keeps its
    current dtype, because a NumPy integer cast wraps out-of-range values
    silently instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    None
    """
    # (current dtype, target dtype) pairs; a column matches at most one entry.
    casts = (
        ( 'uint8',   np.int8 ),
        ( 'bool',    np.int8 ),
        ( 'uint32',  np.int32 ),
        ( 'int64',   np.int32 ),
        ( 'float64', np.float32 ),
    )

    # Pair labels with dtypes by position instead of `df.dtypes[i]`: integer
    # keys passed to `[]` on a label-indexed Series are treated as labels by
    # newer pandas releases, so positional lookups through `[]` are unsafe.
    for name, dtype in zip( list( df.columns ), list( df.dtypes ) ):
        for source, target in casts:
            if dtype != source:
                continue
            column = df[name]
            values = column.values
            # bool -> int8 is always safe and float narrowing only loses
            # precision; integer narrowing needs an explicit range check.
            if source != 'bool' and np.issubdtype( target, np.integer ) and values.size > 0:
                bounds = np.iinfo( target )
                if int( values.min() ) < bounds.min or int( values.max() ) > bounds.max:
                    break  # would overflow: keep the wider dtype
            df[name] = column.astype( target )
            break

    # A single collection once the old column buffers have been replaced.
    gc.collect()
    

# Load train, validation, and test data

In [4]:
%%time
# Read the three parquet splits from the working directory.
train, test0, test1 = (
    pd.read_parquet( path )
    for path in ( 'train.parquet', 'valid.parquet', 'test.parquet' )
)
gc.collect()

# Tag every row with the split it came from (0 = train.parquet,
# 1 = valid.parquet, 2 = test.parquet) so the frames can be separated again
# after they are concatenated.
train['tr'], test0['tr'], test1['tr'] = 0, 1, 2

( train.shape, test0.shape, test1.shape )

CPU times: user 53.8 s, sys: 5min 40s, total: 6min 33s
Wall time: 1min 20s


((106254462, 28), (9760684, 28), (9765321, 28))

In [5]:
# Stack the three tagged splits into one frame; `sort=False` keeps the column
# order as-is. Each split keeps its own index labels here.
train = pd.concat( [ train, test0, test1 ], sort=False )
gc.collect()

# The per-split frames are no longer needed once they are part of `train`.
del test0, test1
gc.collect()

train.shape

(125780467, 28)

In [6]:
%%time
# Order the rows by `id`, then discard the old index labels in favour of a
# fresh 0..n-1 index.
train = train.sort_values( by='id' )
train = train.reset_index( drop=True )
gc.collect()

CPU times: user 23.5 s, sys: 24.8 s, total: 48.3 s
Wall time: 48.2 s


0

In [7]:
# Binarise the four columns: any positive value becomes 1, every other value
# is left as it was.
for target in ( 'reply', 'retweet', 'retweet_comment', 'like' ):
    train.loc[ train[target] > 0, target ] = 1
gc.collect()

0

In [8]:
# Preview the first ten rows of the combined, id-sorted frame.
train.head(10)

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,len_hashtags,len_domains,len_links
0,1,0,5,0,0,2,11,1581472519,0,11005,10795,False,1437797708,12074272,4,6,False,1568940527,True,0,0,0,0,0,0,2,0,0
1,0,1,0,0,0,0,59,1581124265,1,425,260,False,1496802112,2019671,225,185,False,1541013180,True,0,0,0,1,1,0,0,0,0
2,0,2,0,0,0,2,38,1581166895,2,298336,8,False,1327933644,12074273,4,72,False,1573996260,False,0,0,0,1,2,0,0,0,0
3,0,3,0,0,0,2,11,1581375781,3,1115,1210,False,1266471640,485172,780,440,False,1432084055,True,0,0,0,1,3,0,0,0,0
4,0,4,0,0,0,1,54,1580996019,4,63473,5424,False,1340804123,12074274,180,32,False,1490688761,False,0,0,0,0,4,0,0,0,0
5,0,5,7,0,0,2,59,1581004518,5,29358,18528,False,1539180042,1257240,929,928,False,1577292324,True,1,0,0,0,5,0,0,0,0
6,3,6,0,0,0,1,54,1581209689,6,562,568,False,1497201827,1257240,929,928,False,1577292324,True,0,0,0,1,6,0,2,0,0
7,0,7,0,0,0,2,54,1581511341,7,529009,221,True,1311451697,10926470,139,952,False,1335110299,False,0,0,0,1,7,0,0,0,0
8,5,8,9,0,0,2,59,1581193484,8,26730,1840,False,1237321687,12074275,286,522,False,1439811227,False,0,0,0,0,8,0,1,0,0
9,0,9,0,0,0,2,47,1581433245,9,2664,303,False,1452867918,10674902,48,140,False,1568691506,False,0,0,0,0,9,0,0,0,0


In [9]:
# Downcast the columns of `train` in place (see `save_memory`), then collect
# garbage once more.
save_memory(train)
gc.collect()

0

In [10]:
# List the dtype of every column after the `save_memory` call above.
train.dtypes

hashtags              int32
tweet_id              int32
media                  int8
links                 int32
domains               int32
tweet_type             int8
language               int8
timestamp             int32
a_user_id             int32
a_follower_count      int32
a_following_count     int32
a_is_verified          int8
a_account_creation    int32
b_user_id             int32
b_follower_count      int32
b_following_count     int32
b_is_verified          int8
b_account_creation    int32
b_follows_a            int8
reply                 int32
retweet               int32
retweet_comment       int32
like                  int32
id                    int32
tr                    int32
len_hashtags          int32
len_domains           int32
len_links             int32
dtype: object

In [11]:
# Parse the epoch-second `timestamp` column once and reuse the result for all
# three date parts, instead of re-parsing the whole column for each feature.
ts = pd.to_datetime( train['timestamp'], unit='s' )

# `.values` strips the index so the int8 arrays are assigned positionally.
train['dt_day']  = ts.dt.day.values.astype( np.int8 )        # day of month
train['dt_dow']  = ts.dt.dayofweek.values.astype( np.int8 )  # Monday=0 .. Sunday=6
train['dt_hour'] = ts.dt.hour.values.astype( np.int8 )       # hour of day, 0..23

# Drop the parsed column before collecting so its memory can be reclaimed.
del ts
_=gc.collect()

# Row count per day of month.
train.groupby('dt_day')['id'].agg('count')

dt_day
6     14568832
7     15437848
8     15922044
9     15411437
10    15555316
11    15248999
12    14109986
13     2794759
14     2684842
15     2889749
16     2806908
17     3016117
18     2799282
19     2534348
Name: id, dtype: int64

In [12]:
# Load a precomputed feature table and preview its first rows.
# NOTE(review): the next cell copies these columns into `train` with plain
# column assignment, which aligns rows on index labels rather than on the
# `id` column. Confirm this file's row index lines up with `train`'s index.
dt = pd.read_parquet( 'a_count_combined-final.parquet' )
dt.head()

Unnamed: 0,id,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,0,4,1,1,1,1,1,1
1,1,4,1,1,1,1,1,1
2,2,4,1,1,1,1,1,1
3,3,4,1,1,1,1,1,1
4,4,3,0,0,0,0,0,0


In [13]:
# Columns copied from `dt` into `train`. Plain column assignment aligns the
# two frames on their index labels, not on the `id` column.
dt_cols = [
    'a_count_combined',
    'a_user_fer_count_delta_time',
    'a_user_fing_count_delta_time',
    'a_user_fering_count_delta_time',
    'a_user_fing_count_mode',
    'a_user_fer_count_mode',
    'a_user_fering_count_mode',
]
for col in dt_cols:
    train[col] = dt[col]

# Overwrite the copied features with the sentinel -9 on rows whose day of
# month is 12 or 18.
# NOTE(review): the reason for masking exactly these two days is not visible
# in this notebook; confirm against the feature-generation code.
train.loc[ train.dt_day.isin( [12, 18] ), dt_cols ] = -9

del dt
_ = gc.collect()
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,len_hashtags,len_domains,len_links,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode
0,1,0,5,0,0,2,11,1581472519,0,11005,10795,0,1437797708,12074272,4,6,0,1568940527,1,0,0,0,0,0,0,2,0,0,12,2,1,-9,-9,-9,-9,-9,-9,-9
1,0,1,0,0,0,0,59,1581124265,1,425,260,0,1496802112,2019671,225,185,0,1541013180,1,0,0,0,1,1,0,0,0,0,8,5,1,4,1,1,1,1,1,1
2,0,2,0,0,0,2,38,1581166895,2,298336,8,0,1327933644,12074273,4,72,0,1573996260,0,0,0,0,1,2,0,0,0,0,8,5,13,4,1,1,1,1,1,1
3,0,3,0,0,0,2,11,1581375781,3,1115,1210,0,1266471640,485172,780,440,0,1432084055,1,0,0,0,1,3,0,0,0,0,10,0,23,4,1,1,1,1,1,1
4,0,4,0,0,0,1,54,1580996019,4,63473,5424,0,1340804123,12074274,180,32,0,1490688761,0,0,0,0,0,4,0,0,0,0,6,3,13,3,0,0,0,0,0,0


In [14]:
# List the dtype of every column, now including the newly added features.
train.dtypes

hashtags                          int32
tweet_id                          int32
media                              int8
links                             int32
domains                           int32
tweet_type                         int8
language                           int8
timestamp                         int32
a_user_id                         int32
a_follower_count                  int32
a_following_count                 int32
a_is_verified                      int8
a_account_creation                int32
b_user_id                         int32
b_follower_count                  int32
b_following_count                 int32
b_is_verified                      int8
b_account_creation                int32
b_follows_a                        int8
reply                             int32
retweet                           int32
retweet_comment                   int32
like                              int32
id                                int32
tr                                int32


In [15]:
# Load the precomputed text features and order them by `id`.
dt = pd.read_parquet( 'text-processings-1.parquet' )
dt = dt.sort_values( 'id' )

# Sorting alone reorders rows but keeps their index labels, so it has no
# effect on the column assignments in the next cell, which align on index
# labels. Use `id` as the index so those assignments are matched by id.
# NOTE(review): this assumes `train`'s index equals its `id` column (it was
# sorted by `id` and re-indexed earlier in the notebook) - confirm.
dt.index = dt['id'].values

dt.head()

Unnamed: 0,id,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,196,39,0,0,0,0,0,0,23,0,0,0
1,1,0,51,8,1,1,1,1,1,1,5,0,0,0
2,2,0,79,10,2,2,2,2,2,2,8,0,0,0
3,3,0,186,47,3,3,3,3,3,3,18,0,0,0
4,4,0,158,24,4,4,4,4,4,4,13,0,0,1870


In [16]:
# Copy every text feature from `dt` into `train`. Plain column assignment
# aligns the two frames on their index labels.
text_cols = [
    'count_ats',
    'count_char',
    'count_words',
    'tw_hash',
    'tw_freq_hash',
    'tw_first_word',
    'tw_second_word',
    'tw_last_word',
    'tw_llast_word',
    'tw_len',
    'tw_hash0',
    'tw_hash1',
    'tw_rt_uhash',
]
for col in text_cols:
    train[col] = dt[col]

# The feature table is no longer needed once its columns have been copied.
del dt
gc.collect()

0

In [17]:
# Preview the last rows of the frame with all features attached.
train.tail()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,tr,len_hashtags,len_domains,len_links,dt_day,dt_dow,dt_hour,a_count_combined,a_user_fer_count_delta_time,a_user_fing_count_delta_time,a_user_fering_count_delta_time,a_user_fing_count_mode,a_user_fer_count_mode,a_user_fering_count_mode,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
125780462,10450,61226760,5,0,0,2,11,1581719195,1007938,1472,2681,0,1295918678,6943825,717,464,0,1501554925,1,0,0,0,1,125780462,2,6,0,0,14,4,22,4,1,1,1,1,1,1,0,300,95,42948009,38527677,3043671,1958989,557,339,32,0,0,0
125780463,530272,61226761,0,0,0,2,11,1581665518,582771,4236,4119,0,1524226898,6943825,717,464,0,1501554925,1,0,0,0,1,125780463,2,1,0,0,14,4,7,4,1,1,1,1,1,1,0,161,42,42948010,38527678,6896228,1040991,655,1633,15,0,0,0
125780464,0,61226762,0,0,0,2,11,1581602829,7112774,1943,2881,0,1468355247,6943825,717,464,0,1501554925,1,0,0,0,1,125780464,2,0,0,0,13,3,14,7,-1,-1,-1,-1,-1,-1,0,189,52,42948011,38527679,5001556,1866905,648,3596,23,0,0,0
125780465,14428,61226763,0,0,0,2,11,1581722172,188957,4884,4428,0,1373131817,6943825,717,464,0,1501554925,1,0,0,0,1,125780465,2,4,0,0,14,4,23,4,1,1,1,1,1,1,0,84,28,42948012,38527680,3464427,1103839,23,2032,16,0,0,0
125780466,0,61226764,0,0,0,2,11,1581813247,914057,309,285,0,1575377853,6943825,717,464,0,1501554925,1,0,0,0,1,125780466,2,0,0,0,16,6,0,4,1,1,1,1,1,1,1,28,3,42948013,54,54,9,51,9,1,2401528,0,2401528


In [18]:
# Row count per split tag (`tr`: 0, 1 or 2).
train.groupby( 'tr' )['id'].count()

tr
0    106254462
1      9760684
2      9765321
Name: id, dtype: int64

In [19]:
# Keep the combined frame reachable as `data`, then rebind `train` and create
# `valid` / `test` as the rows carrying split tag 0, 1 and 2 respectively.
# Gotcha: this cell is not idempotent. Running it a second time makes `data`
# the train-only subset, so `valid` and `test` come out empty.
data = train
train = data[ data['tr'] == 0 ]
valid = data[ data['tr'] == 1 ]
test  = data[ data['tr'] == 2 ]

In [20]:
# Write a small leading slice of each split (first 1,000,000 train rows and
# first 100,000 valid / test rows) to its own `*_final_s.parquet` file.
for split, n_rows, path in (
    ( train, 1000000, 'train_final_s.parquet' ),
    ( valid,  100000, 'valid_final_s.parquet' ),
    ( test,   100000, 'test_final_s.parquet' ),
):
    split.iloc[:n_rows].to_parquet( path )

In [21]:
# Write each complete split to its own `*_final.parquet` file.
for split, path in (
    ( train, 'train_final.parquet' ),
    ( valid, 'valid_final.parquet' ),
    ( test,  'test_final.parquet' ),
):
    split.to_parquet( path )
