# RAPIDS NLP:

### Credits:
https://medium.com/rapids-ai/show-me-the-word-count-3146e1173801


### Objective: Showcase the NLP capabilities of nvstrings + cudf

### Import libraries

In [1]:
import cudf
import nvcategory
import os
import numpy as np
import nvtext
import cuml
import nvstrings
try:
    import nltk
except ModuleNotFoundError:
    os.system('pip install nltk')
    import nltk
from numba import cuda

### Set Data File 

In [2]:
# Path of the input corpus; passed to get_non_empty_lines() by the driver loop,
# which also writes cleaned chunks to '<data_file>.clean.<chunk index>'.
data_file = 'data/enwiki-latest.preprocess.txt.gz'
# NOTE(review): output_dir is not referenced by the code visible here
# (outputs are written next to data_file) -- confirm whether it is still needed.
output_dir = 'data'

## Read Text Frame

#### Read helper functions

In [3]:
import gensim.utils
import tqdm
import sys

# Chunk-size threshold in bytes used by get_non_empty_lines().
# NOTE(review): despite the name, 10 ** 8 is 100 MB (1e8 bytes), not 1 GB.
# It is compared against sys.getsizeof() of the accumulating list of lines;
# sys.getsizeof is shallow, so this bounds the list's own storage rather than
# the total size of the text the list refers to.
ONE_GB = 10 ** 8

def get_non_empty_lines(path):
    """
        Yield the lines of the file at `path` in chunks (lists of lines).

        Lines come from get_txt_lines(path) and are passed through unchanged;
        despite the function's name, blank lines are NOT filtered out here.

        A chunk is emitted as soon as sys.getsizeof() of the accumulating
        list exceeds ONE_GB. sys.getsizeof is shallow, so the threshold
        bounds the list's own storage (i.e. roughly the number of lines),
        not the total size of the text.

        Whatever is left after the last full chunk is yielded as a final,
        smaller chunk. Nothing is yielded for an empty remainder, so every
        chunk handed to the caller holds at least one line.
    """
    chunked_output = []
    for line in get_txt_lines(path):
        chunked_output.append(line)
        # Measure once and reuse the value for both the check and the log.
        chunk_size = sys.getsizeof(chunked_output)
        if chunk_size > ONE_GB:
            print('Chunk Size:', chunk_size, 'bytes')
            print('yielding')
            yield chunked_output
            chunked_output = []
    # Avoid handing an empty list downstream (empty input, or the input
    # ended exactly on a chunk boundary): it would only produce an empty
    # output chunk.
    if chunked_output:
        yield chunked_output

def get_txt_lines(path):
    """
        Lazily yield the lines of the file at `path`, one at a time.

        The file is opened through gensim.utils.open with mode 'rb' and
        encoding='utf-8', and iteration is wrapped in tqdm so a running
        line counter is displayed. Lines are yielded exactly as they are
        read; nothing is stripped or filtered.

        NOTE(review): presumably the opener decompresses '.gz' inputs
        transparently and returns decoded text -- confirm against the
        gensim / smart_open version in use.
    """
    with gensim.utils.open(path, 'rb', encoding='utf-8') as infile:
        progress = tqdm.tqdm(enumerate(infile))
        # The running index only feeds the progress bar; callers get the line.
        for _, text_line in progress:
            yield text_line

### Read text lines into a cudf dataframe

#### Full Preprocessing Pipeline

In [4]:
# Substrings that preprocess_text() replaces with a single space.
# Notes on the less obvious entries:
#   '\\~' is a literal backslash followed by '~' (spelled with an escaped
#         backslash so Python does not warn about an invalid escape sequence);
#   '\\n' is a literal backslash followed by 'n', NOT a newline character;
#   '\t'  is a real tab character.
filters = [
    '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\',
    ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', '{', '|',
    '}', '\\~', '\t', '\\n', "'", ",", '~', '—',
]

def preprocess_text(input_strs, filters=filters):
    """
        Clean a column of strings and return the cleaned column.

        Steps, in order:
        * replace every entry of `filters` with a space (literal match,
          not regex)
        * convert to lower case
        * collapse each run of whitespace into a single space
        * strip leading/trailing spaces

        Stop-word removal is not performed by this function.

        input_strs : object exposing a `.str` accessor that provides
                     replace_multi / lower / replace / strip
                     (presumably a cudf string column -- confirm)
        filters    : list of substrings to blank out; defaults to the
                     module-level `filters` list
    """
    # Punctuation -> space, then normalise case.
    without_punct = input_strs.str.replace_multi(filters, ' ', regex=False)
    lowered = without_punct.str.lower()

    # Squeeze whitespace runs, then trim the spaces left at either end.
    single_spaced = lowered.str.replace(r"\s+", ' ', regex=True)
    return single_spaced.str.strip(' ')

def preprocess_text_df(df, text_cols=('text',), **kwargs):
    """
        Run preprocess_text over the selected columns of `df`.

        df        : dataframe-like object supporting `df[col]` get and set
        text_cols : iterable of column names to clean (default: only 'text');
                    the default is an immutable tuple so it cannot be
                    modified by accident between calls
        **kwargs  : forwarded unchanged to preprocess_text

        Each listed column is overwritten in place with its cleaned version,
        and the same `df` object is returned.
    """
    for col in text_cols:
        df[col] = preprocess_text(df[col], **kwargs)
    return df

With our function defined, we can execute it to preprocess the entire dataset.

In [5]:
# Driver: process the corpus one chunk at a time so that only a single chunk
# of lines is held at once. For each chunk: load it into a one-column frame,
# clean the 'text' column, and write it to its own output file.
for i, txt_ls in enumerate(get_non_empty_lines(data_file)):
    # Fresh frame per chunk; the previous chunk's frame is rebound here.
    df = cudf.DataFrame()

    # Build the 'text' column from the chunk's list of lines.
    df['text'] = nvstrings.to_device(txt_ls)
    print('chunk',i,': read into GPU')

    # Cleans the default column list, i.e. just 'text'.
    df = preprocess_text_df(df)
    print('chunk',i,': cleaned')

    # One output file per chunk: '<data_file>.clean.<i>', text column only,
    # without header row or index column.
    df.to_csv(path=data_file+'.clean.'+str(i), columns=['text'], header=False, index=False)
    print('chunk',i,': written to file')


11463627it [00:51, 216029.61it/s]

Chunk Size: 103184048 bytes
yielding
chunk 0 : read into GPU


11463627it [01:10, 216029.61it/s]

chunk 0 : cleaned


11497620it [03:12, 60.60it/s]    

chunk 0 : written to file


22921547it [04:05, 216389.33it/s]

Chunk Size: 103184048 bytes
yielding
chunk 1 : read into GPU


22921547it [04:20, 216389.33it/s]

chunk 1 : cleaned


22972078it [11:00, 134.75it/s]   

chunk 1 : written to file


34373928it [11:52, 220394.76it/s]

Chunk Size: 103184048 bytes
yielding
chunk 2 : read into GPU


34373928it [12:10, 220394.76it/s]

chunk 2 : cleaned


34436000it [16:05, 556.84it/s]   

chunk 2 : written to file


45848669it [16:57, 222182.59it/s]

Chunk Size: 103184048 bytes
yielding
chunk 3 : read into GPU


45848669it [17:10, 222182.59it/s]

chunk 3 : cleaned


45902061it [21:45, 256.03it/s]   

chunk 3 : written to file


57311390it [22:37, 225728.60it/s]

Chunk Size: 103184048 bytes
yielding
chunk 4 : read into GPU


57311390it [22:50, 225728.60it/s]

chunk 4 : cleaned


57370255it [34:13, 127.22it/s]   

chunk 4 : written to file


68771002it [35:04, 221624.14it/s]

Chunk Size: 103184048 bytes
yielding
chunk 5 : read into GPU


68771002it [35:20, 221624.14it/s]

chunk 5 : cleaned


68833389it [39:22, 481.84it/s]   

chunk 5 : written to file


80232391it [40:13, 222966.02it/s]

Chunk Size: 103184048 bytes
yielding
chunk 6 : read into GPU


80232391it [40:30, 222966.02it/s]

chunk 6 : cleaned


80296891it [44:45, 543.15it/s]   

chunk 6 : written to file


91703873it [45:37, 213736.55it/s]

Chunk Size: 103184048 bytes
yielding
chunk 7 : read into GPU


91703873it [45:50, 213736.55it/s]

chunk 7 : cleaned


91762305it [51:31, 291.52it/s]   

chunk 7 : written to file


103172123it [52:22, 220283.52it/s]

Chunk Size: 103184048 bytes
yielding
chunk 8 : read into GPU


103172123it [52:40, 220283.52it/s]

chunk 8 : cleaned


103226511it [56:32, 321.48it/s]   

chunk 8 : written to file


114628358it [57:24, 218358.25it/s]

Chunk Size: 103184048 bytes
yielding
chunk 9 : read into GPU


114628358it [57:40, 218358.25it/s]

chunk 9 : cleaned


114691982it [1:05:28, 287.07it/s] 

chunk 9 : written to file


121169182it [1:05:58, 30613.34it/s] 


chunk 10 : read into GPU
chunk 10 : cleaned
chunk 10 : written to file
