In [1]:
!nvidia-smi

Fri Jan 29 17:33:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:15:00.0 Off |                    0 |
| N/A   34C    P0    39W / 300W |      0MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from cudf import DataFrame
import cudf
import math
from math import log
import nvstrings, nvcategory
from numba import cuda, float32
import time
import rmm 
import numpy as np
from cudf.utils import cudautils

### Read file

In [3]:
%%time
# Load the tweet corpus (about 1.6M tweets) into a cuDF DataFrame.
# names=['note'] labels the text column `note`; skiprows=1 skips the first
# line of the file (presumably its header row — confirm against the CSV).
text_sents = cudf.read_csv('/data/tweet_data.csv', delimiter=',', names=['note'],skiprows=1)

CPU times: user 365 ms, sys: 412 ms, total: 777 ms
Wall time: 786 ms


In [4]:
# Preview the first rows of the loaded frame to sanity-check the `note` column.
text_sents.head()

Unnamed: 0,note
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


### Remove special characters

In [5]:
def rm_special_characters(text_sents):
    """Normalise the tweet text stored in the ``note`` column.

    The following steps are applied, in order, to ``text_sents.note.data``:

    1. missing entries are replaced by the placeholder ``'-9999'``;
    2. every character that is neither a word character nor whitespace is
       removed;
    3. underscores are removed (the word-character class keeps them, so they
       need their own pass);
    4. each run of whitespace is collapsed to a single space;
    5. leading/trailing whitespace is stripped and the text is lower-cased.

    NOTE(review): if the patterns are applied as regular expressions, step 2
    also removes the ``-`` of the placeholder, so missing entries end up as
    ``'9999'`` — confirm this is intended.

    Parameters
    ----------
    text_sents : DataFrame-like
        Object exposing the raw strings as ``text_sents.note.data``.

    Returns
    -------
    The strings object returned by the final ``lower()`` call.
    """
    cleaned = text_sents.note.data.fillna('-9999')
    # Raw string literals: the backslashes belong to the regex, not to Python.
    # The non-raw forms only worked because Python passes unknown escape
    # sequences through unchanged, which it reports as a warning.
    cleaned = cleaned.replace(r'[^\w\s]', '')
    cleaned = cleaned.replace('_', '')
    cleaned = cleaned.replace(r'\s+', ' ')
    return cleaned.lstrip().rstrip().lower()

In [6]:
%%time
# Clean every tweet, then record the number of documents; `docs_length` is
# passed to computeIDF later, where it is divided by the per-word document count.
nvtext_sents_clean = rm_special_characters(text_sents)
docs_length = len(nvtext_sents_clean)

CPU times: user 107 ms, sys: 376 ms, total: 483 ms
Wall time: 480 ms


### Compute the frequency of each word

In [7]:
# Kernel: for each word slot, record the owning message id, that message's
# length, and an initial count of 1.
@cuda.jit
def _initDictionary(length_array, id, doc_length, cnt, n, col, row):
    # One thread per slot; cuda.grid(1) is this thread's absolute 1-D index.
    pos = cuda.grid(1)
    # Threads past the end of the arrays have nothing to do.
    if pos >= n:
        return
    # pos % col selects which entry of length_array (i.e. which message)
    # this slot belongs to. `row` is accepted but not used by the kernel.
    msg = pos % col
    id[pos] = msg + 1                         # message ids are 1-based
    doc_length[pos] = length_array[msg] + 1   # stored length is one more than the input value
    cnt[pos] = 1

def initDictionary(length_array, col, row):
    """Allocate and fill the per-slot ``id``, ``doc_length`` and ``cnt`` arrays.

    Parameters
    ----------
    length_array :
        Per-message values copied to the device and read by the kernel as
        ``length_array[pos % col]``.
    col, row : int
        The output arrays hold ``col * row`` int32 entries each.

    Returns
    -------
    tuple
        ``(id, doc_length, cnt)`` device arrays filled by ``_initDictionary``.
    """
    from math import ceil

    threads_per_block = 256
    total = col * row

    lengths_dev = cuda.to_device(length_array)
    # Three output buffers with the same shape and dtype.
    ids_dev, doc_length_dev, cnt_dev = (
        cuda.device_array(total, dtype=np.int32) for _ in range(3)
    )

    # Enough blocks to give every slot its own thread.
    n_blocks = ceil(total / threads_per_block)
    _initDictionary[n_blocks, threads_per_block](
        lengths_dev, ids_dev, doc_length_dev, cnt_dev, total, col, row
    )

    return ids_dev, doc_length_dev, cnt_dev

In [None]:
def cat_to_series(cat,size):
    """Wrap the values of category ``cat`` in a cuDF Series.

    An int32 device buffer of ``size`` elements is allocated through RMM and
    its device pointer is handed to ``cat.values(devptr=...)`` (presumably so
    the values are written straight into it and never leave the GPU). The
    buffer is then returned as a ``cudf.Series``.
    """
    values_buf = rmm.device_array(size, dtype=np.int32)
    dev_ptr = values_buf.device_ctypes_pointer.value
    cat.values(devptr=dev_ptr)
    return cudf.Series(values_buf)

def create_freq_dict(nvtext_sents_clean):
    """Build the per-document and corpus-wide word tables.

    Parameters
    ----------
    nvtext_sents_clean :
        Strings object supporting ``split(' ')`` and ``count(pat=' ')``
        (in this notebook, the result of ``rm_special_characters``).

    Returns
    -------
    tuple
        ``(freqDict_list_local, freqDict_list_global)``:

        * local  - the vocabulary table (``str``, ``cat``) merged on ``cat``
          with the row count of every ``(cat, id, doc_length)`` group;
        * global - the vocabulary table merged with the number of such groups
          per ``cat``, exposed as ``global_cnt``. The ``cat`` column is
          dropped; the ``cnt`` column is kept.
    """
    # Tokenise on single spaces.
    token_columns = nvtext_sents_clean.split(' ')
    # Spaces per message; the kernel adds 1 to turn this into a word count.
    space_counts = nvtext_sents_clean.count(pat=' ')

    # Device-side bookkeeping arrays for every token slot.
    slots_per_column = len(token_columns[0])
    n_columns = len(token_columns)
    doc_ids, doc_lengths, ones = initDictionary(space_counts, slots_per_column, n_columns)

    # Encode every token as an integer category.
    cat = nvcategory.from_strings(*token_columns)

    # Keep the category values on the GPU as cuDF Series.
    token_codes = cat_to_series(cat, len(doc_ids))
    vocab_codes = cat_to_series(nvcategory.from_strings(cat.keys()), len(cat.keys()))

    # Vocabulary table (word <-> code) and the per-token fact table.
    vocab = DataFrame({'str': cat.keys(), 'cat': vocab_codes})
    tokens = DataFrame({
        'cat': token_codes,
        'cnt': ones,
        'id': doc_ids,
        'doc_length': doc_lengths,
    })

    # Local dictionary: occurrences of each term within each document.
    per_doc = tokens.groupby(['cat', 'id', 'doc_length'], method='hash', as_index=False).count()
    freqDict_list_local = vocab.merge(per_doc, left_on=['cat'], right_on=['cat'])

    # Global dictionary: number of documents with term t in it.
    per_term = per_doc[['cat', 'cnt']]
    per_term['cnt'] = 1
    per_term = per_term.groupby(['cat'], method='hash', as_index=False).count()
    freqDict_list_global = vocab.merge(per_term, left_on=['cat'], right_on=['cat'])
    freqDict_list_global['global_cnt'] = freqDict_list_global['cnt']
    freqDict_list_global.drop_column('cat')

    return freqDict_list_local, freqDict_list_global

In [None]:
%%time
# Build the per-tweet (local) and corpus-wide (global) word-frequency tables.
freqDict_list_local,freqDict_list_global = create_freq_dict(nvtext_sents_clean)

### Compute TF, IDF and TF-IDF

In [None]:
@cuda.jit
def logFunction(array,n):
    # Kernel: overwrite each of the first n elements of `array` with its
    # natural logarithm. One thread handles one element.
    pos = cuda.grid(1)
    if pos >= n:
        return
    array[pos] = log(array[pos])

def log_function(df, incol, outcol):
    """Store the natural log of column ``incol`` in column ``outcol``.

    The input column is fetched with ``to_gpu_array()``, transformed in place
    by the ``logFunction`` kernel, and assigned to ``df[outcol]``. The same
    ``df`` object is returned.
    """
    from math import ceil

    threads_per_block = 256
    values = df[incol].to_gpu_array()
    count = values.size

    # Enough blocks to cover every element.
    n_blocks = ceil(count / threads_per_block)
    logFunction[n_blocks, threads_per_block](values, count)

    df[outcol] = values
    return df

In [None]:
# TF: (number of times that word w occurs in tweet t) ÷ (number of words in t)
def computeTF(freqDict_list):
    """Return a frame with a ``TF_scores`` column computed as ``cnt / doc_length``.

    Parameters
    ----------
    freqDict_list : DataFrame
        Must contain the columns ``cnt`` and ``doc_length``. It is left
        unmodified.

    Returns
    -------
    DataFrame
        A copy of the input with ``TF_scores`` added and the ``cnt`` and
        ``doc_length`` columns removed.
    """
    # Work on a copy so the caller's frame keeps its `cnt` / `doc_length`
    # columns; otherwise re-running the calling cell fails on the second run.
    TF_scores = freqDict_list.copy()
    TF_scores['TF_scores'] = TF_scores['cnt'] / TF_scores['doc_length']
    TF_scores.drop_column('doc_length')
    TF_scores.drop_column('cnt')
    return TF_scores

# IDF: log ((total number of tweets) ÷ ( number of tweets where the word w appears ))
def computeIDF(TF_scores,freqDict_list_global,docs_length):
    """Join the TF table with the global counts and add an ``IDF_scores`` column.

    The two frames are merged on ``str``; ``IDF_scores`` is first set to
    ``docs_length / global_cnt`` and then replaced by its natural log via
    ``log_function``. The merged frame is returned.
    """
    merged = TF_scores.merge(freqDict_list_global, left_on=['str'], right_on=['str'])
    merged['IDF_scores'] = docs_length / merged['global_cnt']
    # log_function writes the log back into the same column and returns the frame.
    return log_function(merged, 'IDF_scores', 'IDF_scores')

# TF-IDF = TF * IDF
def computeTFIDF(TFIDF_scores):
    """Return a frame with a ``TFIDF_score`` column (``IDF_scores * TF_scores``).

    Parameters
    ----------
    TFIDF_scores : DataFrame
        Must contain the columns ``TF_scores``, ``IDF_scores``, ``cat``,
        ``cnt`` and ``global_cnt``. It is left unmodified.

    Returns
    -------
    DataFrame
        A copy of the input with ``TFIDF_score`` added and the five
        intermediate columns listed above removed.
    """
    # Work on a copy so the frame passed in (the IDF table in this notebook)
    # still has its columns after the call.
    scores = TFIDF_scores.copy()
    scores['TFIDF_score'] = scores['IDF_scores'] * scores['TF_scores']
    # Drop the intermediates in the same order as before.
    for column in ('TF_scores', 'IDF_scores', 'cat', 'cnt', 'global_cnt'):
        scores.drop_column(column)
    return scores

In [None]:
%%time
# Step 4: term frequency of each word within its tweet.
TF_scores = computeTF(freqDict_list_local)
# Step 5: inverse document frequency, using the corpus-wide counts and the number of tweets.
IDF_scores = computeIDF(TF_scores,freqDict_list_global,docs_length)
# Step 6: combine both into the TF-IDF score.
TFIDF_scores = computeTFIDF(IDF_scores)
# Keep only the rows selected by isalpha() on the `str` column.
TFIDF_scores = TFIDF_scores[TFIDF_scores.str.data.isalpha()] 

### Show the top 5 words with the highest TF-IDF score

In [None]:
%%time
# Sort by TF-IDF score, then by `id`, both in descending order.
TFIDF_scores = TFIDF_scores.sort_values(by=['TFIDF_score', 'id'] ,ascending = False)

In [None]:
# Display the first five rows of the sorted frame, i.e. the highest scores.
TFIDF_scores[:5]