Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 
import glob
import gc
import os.path

import hashlib

In [2]:
def save_memory( df ):
    """Downcast the columns of ``df`` in place to narrower dtypes and print the result.

    Conversions applied, keyed on the column's current dtype:

    * ``bool``    -> ``int8``
    * ``uint8``   -> ``int8``    (only when every value fits)
    * ``uint32``  -> ``int32``   (only when every value fits)
    * ``int64``   -> ``int32``   (only when every value fits)
    * ``float64`` -> ``float32`` (precision may be reduced)

    Integer columns holding values outside the target range are left
    untouched, because ``astype`` would otherwise wrap them around silently.
    Columns of any other dtype are not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    None
        The resulting ``df.dtypes`` is printed.
    """
    def _fits( column, target ):
        # An empty column trivially fits; otherwise compare the value range
        # (as Python ints) against the limits of the target integer type.
        if len( column ) == 0:
            return True
        limits = np.iinfo( target )
        return limits.min <= int( column.min() ) and int( column.max() ) <= limits.max

    # Source dtype name -> (target dtype, whether a range check is required).
    plan = {
        'uint8'  : ( np.int8   , True  ),
        'bool'   : ( np.int8   , False ),
        'uint32' : ( np.int32  , True  ),
        'int64'  : ( np.int32  , True  ),
        'float64': ( np.float32, False ),
    }

    # Iterate over (column name, dtype) pairs instead of indexing the
    # label-indexed dtypes Series by integer position.
    for name, dtype in list( df.dtypes.items() ):
        step = plan.get( str( dtype ) )
        if step is None:
            continue
        target, needs_range_check = step
        if needs_range_check and not _fits( df[name], target ):
            continue
        df[name] = df[name].astype( target )
        gc.collect()
    print( df.dtypes )

In [3]:
def extract_hash(text, split_text='@', no=0):
    """Return the MD5 hex digest of a handle that follows an ``@`` in ``text``.

    The text is lower-cased and split on ``'@'``; the segment at position
    ``no + 1`` is then split on single spaces. The handle is the first token
    of that segment (cleaned with ``clean_text``), extended for as long as the
    next token is a stand-alone ``'_'`` by that underscore and the cleaned
    token after it.

    Parameters
    ----------
    text : str
        Text to scan.
    split_text : str, optional
        Accepted for backward compatibility but not consulted: the text is
        always split on ``'@'``.
    no : int, optional
        Zero-based index of the ``@`` occurrence whose handle is hashed.

    Returns
    -------
    str
        Hex MD5 digest of the extracted handle. When the text has too few
        ``@`` separators the digest is that of the empty string.
    """
    text = text.lower()
    uhash = ''
    # NOTE(review): `split_text` is ignored here. Honouring it would change
    # the digests produced by callers that pass a non-default value, so the
    # hard-coded separator is kept - confirm the intended behaviour.
    segments = text.split('@')
    if len(segments) > (no + 1):
        tokens = segments[no + 1].split(' ')
        uhash += clean_text(tokens[0])
        # A stand-alone '_' token glues the handle to the token that follows it.
        while len(tokens) > 1 and tokens[1] == '_':
            if len(tokens) == 2:
                # The underscore is the last token: keep it and stop, rather
                # than indexing past the end of the list.
                uhash += tokens[1]
                break
            uhash += clean_text(tokens[1]) + clean_text(tokens[2])
            tokens = tokens[2:]
    return hashlib.md5(uhash.encode('utf-8')).hexdigest()

In [4]:
def clean_text(text):
    """Strip one trailing punctuation mark from ``text``.

    Only a single final character is removed, and only when ``text`` is longer
    than one character and ends in one of ``! ? : ; . ,``. Anything else is
    returned unchanged.
    """
    trailing_marks = ('!', '?', ':', ';', '.', ',')
    if len(text) <= 1:
        return text
    if text[-1] not in trailing_marks:
        return text
    return text[:-1]

In [5]:
# Read the three tweet tables, in the fixed order train / test0 / test1,
# then display their shapes as the cell output.
train, test0, test1 = (
    pd.read_parquet( parquet_path )
    for parquet_path in (
        'train-tweet-1.parquet',
        'test0-tweet-1.parquet',
        'test1-tweet-1.parquet',
    )
)
train.shape, test0.shape, test1.shape

((121386431, 3), (12434735, 3), (12434838, 3))

In [11]:
# Optional debugging aid (disabled): uncomment to keep only the first
# 100000 rows of each frame and work on a small subset.
#train = train.head(100000)
#test0 = test0.head(100000)
#test1 = test1.head(100000)

In [6]:
%%time

def _count_tokens( frame, counts ):
    """Add the space-separated tokens of ``frame['tweet']`` to ``counts``.

    Each distinct tweet text of the frame is counted once. ``counts`` is
    updated in place; new tokens are appended in order of first appearance.
    """
    for tweet in frame['tweet'].unique():
        for token in tweet.split(' '):
            counts[token] = counts.get( token, 0 ) + 1
    gc.collect()


# Token -> number of occurrences over the distinct tweets of each dataset.
# The datasets are visited in the order train, test0, test1, which fixes the
# insertion order of the dictionary.
WORDS = {}
_count_tokens( train, WORDS )
_count_tokens( test0, WORDS )
_count_tokens( test1, WORDS )

len(WORDS)

CPU times: user 12min 35s, sys: 55.8 s, total: 13min 31s
Wall time: 13min 31s


In [7]:
%%time
# Turn every entry of WORDS from a bare frequency into [frequency, position],
# where position is the token's rank in the dictionary's iteration order.
# Only existing keys are reassigned, so the dictionary size never changes
# while it is being iterated.
count = 0
for word, frequency in WORDS.items():
    WORDS[word] = [ frequency, count ]
    count += 1
gc.collect()

CPU times: user 1min 11s, sys: 6.22 s, total: 1min 17s
Wall time: 1min 17s


In [8]:
# Spot-check one vocabulary entry; the value is [frequency, position].
WORDS['marvel']

[2293, 205667]

In [9]:
def freq_encode_words( vs, min_freq=2, max_freq=100000 ):
    """Encode a text as the ids of its tokens, ordered by ascending frequency.

    The text is split on single spaces. Tokens that are empty or one of
    ``[ ] . ! @ _ #`` are skipped. Every remaining token is looked up in the
    module-level ``WORDS`` mapping (token -> ``[frequency, id]``) and kept
    only when ``min_freq < frequency < max_freq``.

    Parameters
    ----------
    vs : str
        Space-separated text.
    min_freq : int, optional
        Exclusive lower bound on the token frequency.
    max_freq : int, optional
        Exclusive upper bound on the token frequency.

    Returns
    -------
    str
        The ids of the kept tokens joined by single spaces, sorted by the
        tokens' frequency (ascending). Empty string when no token is kept.

    Raises
    ------
    KeyError
        If a non-skipped token is absent from ``WORDS``.
    """
    skipped_tokens = ('', '[', ']', '.', '!', '@', '_', '#')
    ids = []
    freqs = []
    for token in vs.split(' '):
        if token in skipped_tokens:
            continue
        freq, idx = WORDS[token]
        if min_freq < freq < max_freq:
            ids.append( str(idx) )
            freqs.append( freq )
    # Reorder the ids by the frequency of their tokens, rarest first.
    return ' '.join( list((np.array(ids)[np.argsort(freqs)] )) )
    
#freq_encode_words( train['tweet'].values[191019] )

In [10]:
def ret_word( x, rw=0 ):
    """Pick one space-separated token of ``x`` by position.

    ``rw`` selects the token: ``0`` first, ``1`` second, ``-1`` last,
    ``-2`` second to last. The string ``'-1'`` is returned when ``x`` has too
    few tokens for the requested position or when ``rw`` is any other value.
    """
    tokens = x.split(' ')
    # Requested position -> (minimum number of tokens needed, list index).
    positions = {
        0 : ( 1,  0 ),
        1 : ( 2,  1 ),
        -1: ( 1, -1 ),
        -2: ( 2, -2 ),
    }
    rule = positions.get( rw )
    if rule is not None:
        required, index = rule
        if len(tokens) >= required:
            return tokens[index]
    return '-1'

In [11]:
%%time

# Columns kept from each dataset once its text features are computed.
FEATURE_COLUMNS = ['id','count_ats', 'count_char', 'count_words', 'hash0', 'hash1', 'tw_uhash','tw_hash','tw_freq_hash','tw_first_word','tw_second_word','tw_last_word','tw_llast_word','tw_len']


def _tweet_text_features( frame ):
    """Compute the text features of one dataset and return them as a new frame.

    ``frame`` must provide the columns ``id`` and ``tweet``. It is modified in
    place: helper columns are added and ``tweet`` is overwritten with its
    frequency-encoded form, so the raw text is no longer available afterwards.

    Returns
    -------
    pandas.DataFrame
        The ``FEATURE_COLUMNS`` of ``frame``.
    """
    # Literal (non-regex) removal of the retweet prefix; being explicit keeps
    # the result independent of the pandas default for `regex`.
    frame['tweet_nortsign'] = frame['tweet'].str.replace('[CLS] RT @', '', regex=False)
    frame['count_words']    = frame['tweet'].str.count(' ')
    frame['count_char']     = frame['tweet'].str.len()
    frame['count_ats']      = frame['tweet_nortsign'].str.count('@')
    frame['hash0']          = frame['tweet_nortsign'].apply(lambda x: extract_hash(x))
    frame['hash1']          = frame['tweet_nortsign'].apply(lambda x: extract_hash(x, no=1))
    frame['tw_uhash']       = frame['tweet'].apply(lambda x: extract_hash(x, split_text='RT @', no=0))
    # NOTE: the built-in hash() of a str is salted per interpreter process
    # unless PYTHONHASHSEED is fixed, so these values are only comparable
    # within a single run.
    frame['tw_hash']        = frame['tweet'].apply(lambda x: hash(x)%1000000000 )

    # From here on 'tweet' holds the frequency-encoded token ids.
    frame['tweet']          = frame['tweet'].apply(lambda x: freq_encode_words(x) )
    frame['tw_freq_hash']   = frame['tweet'].apply(lambda x: hash(x)%1000000000 )
    frame['tw_first_word']  = frame['tweet'].apply(lambda x: ret_word(x,0) )
    frame['tw_second_word'] = frame['tweet'].apply(lambda x: ret_word(x,1) )
    frame['tw_last_word']   = frame['tweet'].apply(lambda x: ret_word(x,-1) )
    frame['tw_llast_word']  = frame['tweet'].apply(lambda x: ret_word(x,-2) )
    frame['tw_len']         = frame['tweet'].apply(lambda x: len(x.split(' ')) )
    return frame[FEATURE_COLUMNS]


feature_parts = []

# Each source frame is dropped as soon as its features are extracted.
feature_parts.append( _tweet_text_features( train ) )
del train
gc.collect()

feature_parts.append( _tweet_text_features( test0 ) )
del test0
gc.collect()

feature_parts.append( _tweet_text_features( test1 ) )
del test1
gc.collect()

# Stack the three parts in the order train, test0, test1 and release the
# per-dataset frames so they are not kept alive next to the combined copy.
DF = pd.concat( feature_parts )
del feature_parts
gc.collect()

save_memory( DF )
DF = DF.reset_index( drop=True )
gc.collect()
DF.shape

id                 int32
count_ats          int32
count_char         int32
count_words        int32
hash0             object
hash1             object
tw_uhash          object
tw_hash            int32
tw_freq_hash       int32
tw_first_word     object
tw_second_word    object
tw_last_word      object
tw_llast_word     object
tw_len             int32
dtype: object
CPU times: user 2h 19min 29s, sys: 2h 13min 56s, total: 4h 33min 25s
Wall time: 4h 31min 43s


In [12]:
# Pool the three digest columns, count how often each digest occurs, and
# flatten the counts into a table (two reset_index calls, as many index
# columns). A sequential 'uid' is then attached to every row.
digest_columns = [ DF[name] for name in ('hash0', 'hash1', 'tw_uhash') ]
uhashes = pd.concat( digest_columns, axis=0 )
del digest_columns
gc.collect()
uhashes = uhashes.value_counts()
gc.collect()
uhashes = uhashes.reset_index()
uhashes = uhashes.reset_index()
gc.collect()
uhashes['uid'] = np.arange( 0, len(uhashes) )
print( uhashes.shape )
uhashes.head()

(6690786, 4)


Unnamed: 0,level_0,index,0,uid
0,0,d41d8cd98f00b204e9800998ecf8427e,347428021,0
1,1,9b9672de2cbe5ddd7ebd7538945a970a,1294764,1
2,2,b14a7b8059d9c055954c92674ce60032,976238,2
3,3,31cf2397511c7cfce33506bef80e25b7,660605,3
4,4,ba9bf05693b9fa202d922dd43a08f281,194430,4


In [13]:
# Translate every digest column into the integer uid assigned in `uhashes`.
# A single digest -> uid lookup is built once and applied with Series.map,
# which returns values aligned on DF's own index. Assigning the result of a
# merge instead would only line up when DF carries a default 0..n-1 index.
uid_by_digest = uhashes.set_index( 'index' )['uid']
for digest_col, uid_col in [ ('hash0', 'tw_hash0'), ('hash1', 'tw_hash1'), ('tw_uhash', 'tw_rt_uhash') ]:
    DF[uid_col] = DF[digest_col].map( uid_by_digest )
    gc.collect()
del uid_by_digest
gc.collect()
DF.head(20)

Unnamed: 0,id,count_ats,count_char,count_words,hash0,hash1,tw_uhash,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,68,17,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,417904705,14077494,9,16,15,8,7,0,0,0
1,1,0,182,37,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,0151f99e5c67748adcbb0c694d7e0b1d,254921337,594291702,24,19,36,21,12,0,0,661291
2,2,0,105,24,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,917286447,46081915,58,52,57,59,6,0,0,0
3,3,2,103,22,bd562f502b23a877c219369fc82e1dd2,73fa9240c65d0231cfca427bb6b9dc88,184b3f811e4e3f2eb9d763cd33dffdf6,456659357,983140245,65,64,76,75,10,198539,2048,616
4,4,0,237,63,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,559483437,636979676,129,101,89,117,16,0,0,0
5,5,0,124,20,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,457368886,984799237,138,132,131,135,13,0,0,0
6,6,0,79,11,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,564036868,94306893,154,148,152,150,4,0,0,0
7,7,0,304,52,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,565022653,780989501,172,168,192,169,29,0,0,0
8,8,0,94,12,d41d8cd98f00b204e9800998ecf8427e,d41d8cd98f00b204e9800998ecf8427e,46c2dda06911cbfb91b3291026e41e31,807354930,233871987,196,198,200,201,5,0,0,4983
9,9,1,98,11,c17c54808d0e99a9abfc83e3d10dad0b,d41d8cd98f00b204e9800998ecf8427e,c17c54808d0e99a9abfc83e3d10dad0b,963395355,711673564,211,207,208,206,5,1738,0,1738


In [14]:
# The digest strings have been replaced by integer uids, so remove them one
# by one and downcast whatever numeric columns remain.
for digest_col in ( 'hash0', 'hash1', 'tw_uhash' ):
    del DF[digest_col]
gc.collect()
save_memory( DF )

id                 int32
count_ats          int32
count_char         int32
count_words        int32
tw_hash            int32
tw_freq_hash       int32
tw_first_word     object
tw_second_word    object
tw_last_word      object
tw_llast_word     object
tw_len             int32
tw_hash0           int32
tw_hash1           int32
tw_rt_uhash        int32
dtype: object


In [15]:
# Replace the values of each listed column by integer codes numbered in
# order of first appearance.
for coded_col in [ 'tw_hash', 'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word' ]:
    codes, _uniques = pd.factorize( DF[coded_col] )
    DF[coded_col] = codes
    del codes, _uniques
gc.collect()

0

In [16]:
# Store the factorized codes as 32-bit integers, one column at a time.
for coded_col in ( 'tw_hash', 'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word' ):
    DF[coded_col] = DF[coded_col].astype( np.int32 )
gc.collect()

0

In [17]:
# Preview the first rows of the encoded feature table.
DF.head(10)

Unnamed: 0,id,count_ats,count_char,count_words,tw_hash,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,tw_hash0,tw_hash1,tw_rt_uhash
0,0,0,68,17,0,0,0,0,0,0,7,0,0,0
1,1,0,182,37,1,1,1,1,1,1,12,0,0,661291
2,2,0,105,24,2,2,2,2,2,2,6,0,0,0
3,3,2,103,22,3,3,3,3,3,3,10,198539,2048,616
4,4,0,237,63,4,4,4,4,4,4,16,0,0,0
5,5,0,124,20,5,5,5,5,5,5,13,0,0,0
6,6,0,79,11,6,6,6,6,6,6,4,0,0,0
7,7,0,304,52,7,7,7,7,7,7,29,0,0,0
8,8,0,94,12,8,8,8,8,8,8,5,0,0,4983
9,9,1,98,11,9,9,9,9,9,9,5,1738,0,1738


In [18]:
# Write the feature table to a parquet file in the working directory.
DF.to_parquet( 'text-processings-1.parquet' )
gc.collect()

0

In [19]:
# Display the dtype of every column of the saved feature table.
DF.dtypes

id                int32
count_ats         int32
count_char        int32
count_words       int32
tw_hash           int32
tw_freq_hash      int32
tw_first_word     int32
tw_second_word    int32
tw_last_word      int32
tw_llast_word     int32
tw_len            int32
tw_hash0          int32
tw_hash1          int32
tw_rt_uhash       int32
dtype: object