#### Imports

In [None]:
import time
import pandas as pd

from preprocess_nlp import preprocess_nlp
from preprocess_nlp import async_call_preprocess

#### Read Data

In [None]:
# Load at most the first 30000 rows of the workbook
df = pd.read_excel('data.xlsx', nrows=30000)

# Discard rows holding any Null value, then coerce the text column to <str>
df.dropna(inplace=True)
df['body'] = df['body'].astype(str)

# One string per remaining row, so the row count is the string count
print("Total strings", len(df))
df.head()

#### Define Stages

In [None]:
# Default Stages
# Boolean entries switch a stage on or off; 'min_word_len' carries an integer.
stages = dict(
    remove_tags_nonascii=True,
    lower_case=True,
    expand_contractions=False,
    remove_escape_chars=True,
    remove_punctuation=True,
    remove_stopwords=False,
    remove_numbers=True,
    lemmatize=False,
    stemming=False,
    min_word_len=2,
)

#### Sequential Processing

<font color='red'>Note: Press &lt;Shift+Tab&gt; to access the docstring of <B><I>preprocess_nlp</I></B>, which lists the default parameters for each stage of processing.</font>

In [None]:
# Time a sequential run in the current process (no worker processes are created).
# Call shape: preprocess_nlp(strings_to_be_processed, dict_of_stages)
start_time = time.time()
processed_text_seq = preprocess_nlp(
    df['body'].tolist(),
    stages,
)
print("Time Elapsed:", time.time() - start_time)

#### Parallel Processing

<font color='red'>Note: Press &lt;Shift+Tab&gt; to access the docstring of <B><I>async_call_preprocess</I></B>, which lists the default parameters for each stage of processing.</font>

In [None]:
# Time a parallel run spread over worker processes.
# Call shape: async_call_preprocess(strings_to_be_processed, dict_of_stages, number_of_processes)
# Caveat: the order of the results is not maintained.
start_time = time.time()
processed_text_par = async_call_preprocess(
    df['body'].tolist(),
    stages,
    2,
)
print("Time Elapsed:", time.time() - start_time)

#### Write to Disk

In [None]:
# Pair every id with its processed text and save the result as a new workbook.
# The sequential output is used on purpose: the parallel cell is marked
# "Order is not maintained", so its results may not line up row-for-row with
# df['id'] (presumably preprocess_nlp keeps input order -- confirm).
# The previous name `processed_text` was never assigned anywhere in this notebook.
df_new = pd.DataFrame({'id': df['id'].tolist(), 'processed_text': processed_text_seq})
df_new.to_excel('processed.xlsx', index=False)
df_new.head()

<hr>
<B><I>IGNORE - Trials for Multi-Thread</I></B><br>
<font color='purple'>Turns out processes are faster because they run simultaneously, allowing true parallel processing. Threads are better suited to I/O-bound work.</font>

In [None]:
from multiprocessing.pool import ThreadPool

def async_call_preprocess(strings, stages, n_processes=3):
    """
    Function to create async threads for faster processing. Splits the input into consecutive slices and hands each slice to its own thread-pool task

    :param strings: A list of strings to be preprocessed
    :param stages: A dictionary with keys as stages and values as Boolean/Integer. Can be used to customize the stages in preprocessing
    :param n_processes: Integer value of number of threads to be created
    (Default parameters for stages)
    {'remove_tags_nonascii': True, 'lower_case': True,'expand_contractions': False, 'remove_punctuation': True, 'remove_escape_chars': True, 'remove_stopwords': False, 'remove_numbers': True, 'lemmatize': False, 'stemming': False, 'min_word_len': 2}

    <Returns a flat list of preprocessed strings, aggregated from the threads in slice order>
    """
    # Nothing to distribute, so skip pool creation and range calculation entirely
    if len(strings) == 0:
        return []

    # Note the start time
    start_time = time.time()

    # Calculate the indices of strings to be passed to the worker threads.
    # NOTE(review): calculate_ranges is neither defined nor imported in the
    # visible part of this file -- confirm it is in scope before running.
    # From its use below it is expected to return slice boundaries b0..bk.
    ranges = calculate_ranges(len(strings), n_processes)
    print(ranges)

    # The context manager tears the pool down on exit, so worker threads are
    # not leaked; every result must therefore be fetched inside the block.
    with ThreadPool(processes=n_processes) as pool:
        pending = [
            pool.apply_async(preprocess_nlp, (strings[lo:hi], stages))
            for lo, hi in zip(ranges, ranges[1:])
        ]
        # get() blocks until the task finishes and re-raises any worker error
        chunks = [task.get() for task in pending]

    # Join the results: extend (not append) so the caller receives one flat
    # list of strings rather than a list of per-thread lists
    processed_strings = list()
    for chunk in chunks:
        processed_strings.extend(chunk)

    # Report how many items each thread produced
    for chunk in chunks:
        print(len(chunk))

    print("Time Elapsed:", time.time()-start_time)
    return processed_strings