#### Imports

In [None]:
import time
import pandas as pd

from feature_extraction import get_features
from feature_extraction import async_call_get_features

#### Read Data

In [None]:
df = pd.read_excel("data.xlsx")

# Make sure there are no Null values and the data type is <str>.
# dropna() returns a new frame instead of modifying `df`, so the result must
# be assigned back; otherwise the null rows survive and astype('str') turns
# them into the literal string 'nan'. reset_index gives the filtered frame a
# fresh 0..n-1 index so row positions line up with the lists built from it.
df = df.dropna(subset=['text']).reset_index(drop=True)
df['text'] = df['text'].astype('str')

# Strings from which features are to be extracted
docs = df['text'].tolist()
len(docs)

<font color='red'>Note: Press <kbd>Shift</kbd>+<kbd>Tab</kbd> to access the docstring of the <B><I>various functions</I></B>; it lists the default parameters for each stage of processing</font>

#### Define Stages

In [None]:
# Stage name -> whether that stage should be run by the feature extractors.
# Flip a flag to enable or disable the corresponding stage.
stages = dict(
    nouns=True,
    verbs=True,
    adjs=True,
    noun_phrases=False,
    keywords=False,
    ner=False,
    numbers=False,
)

#### Sequential Processing

In [None]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the cell is running.
start_time = time.perf_counter()

# Processes the data sequentially, without creating processes.
# Params: (strings_to_be_processed, dict_of_stages)
(nouns_list, verbs_list, adjs_list, ners_list,
 noun_chunks, yake_keywords, numbers_list) = get_features(docs, stages)

print("Time Elapsed:", time.perf_counter() - start_time)

#### Parallel Processing

In [None]:
# Input order is maintained: the output follows the order of the input.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the cell is running.
start_time = time.perf_counter()

# Processes the data simultaneously by creating processes.
# Params: (strings_to_be_processed, dict_of_stages, number_of_processes)
(nouns_list2, verbs_list2, adjs_list2, ners_list2,
 noun_chunks2, yake_keywords2, numbers_list2) = async_call_get_features(docs, stages, n_processes=2)

print("Time Elapsed:", time.perf_counter() - start_time)

#### Segregate the NERS into ORG, PER, LOC

In [None]:
# Split each document's NER mapping into one list per entity label.
# A document with no entry for a label contributes an empty string, so the
# three lists keep one element per item of ners_list.
per_list = [ner_set.get('PER', '') for ner_set in ners_list]
loc_list = [ner_set.get('LOC', '') for ner_set in ners_list]
org_list = [ner_set.get('ORG', '') for ner_set in ners_list]

#### Write to Disk

In [None]:
# Output column name -> per-document values.
# Delete the entries for any features that were not extracted above.
feature_columns = {
    'id': df['id'].tolist(),
    'text': docs,
    'nouns': nouns_list,
    'verbs': verbs_list,
    'adjs': adjs_list,
    'noun_phrases': noun_chunks,
    'keywords': yake_keywords,
    'numbers': numbers_list,
    'Person': per_list,
    'Organization': org_list,
    'Location': loc_list,
}
df_features = pd.DataFrame(feature_columns)

# Save the table without the index column, then preview the first rows.
df_features.to_excel('trail.xlsx', index=False)
df_features.head()

<hr>
<B><I>IGNORE - Trials for Manager instead of Pipe</I></B><br>
<font color='purple'>It turns out that processes sharing a Manager do not return their values in any particular order, so the input order is not maintained.</font>

In [None]:
def async_call_get_features_manager(strings, stages=None, n_processes=3):
    """
    Extract features from `strings` with several processes that share a Manager dictionary.

    Experimental variant that collects the results of the worker processes through a
    ``multiprocessing.Manager`` dictionary instead of a Pipe. The Manager dictionary is
    filled in whatever order the processes finish, so the chunks are merged in ascending
    key order. This restores the input order only if ``get_features`` stores each chunk
    under the process index it is given - TODO confirm against ``get_features``.

    :param strings: A list of strings to be processed or extracted features from
    :param stages: Dictionary that contains stages to be executed; ``None`` (the default)
                   is treated as an empty dictionary
    :param n_processes: Integer value of number of processes to be created

    :return: A list of 7 lists; list ``i`` is the concatenation of item ``i`` of every
             process result

    (default_stages = {
        'nouns': True,
        'verbs': True,
        'adjs': False,
        'noun_phrases': False,
        'keywords': False,
        'ner': False,
        'numbers': False,
        })
    """
    # Imported here because the notebook's import cell does not import it.
    import multiprocessing

    # Avoid a mutable default argument; an omitted `stages` still means "empty dict".
    if stages is None:
        stages = {}

    # Calculate the indices of strings to be passed to multiple processes.
    # NOTE(review): calculate_ranges is neither defined nor imported in the visible
    # part of this notebook - it has to be brought into scope before calling this.
    ranges = calculate_ranges(len(strings), n_processes)

    # Create a Manager-backed dictionary that the processes can all write their results to
    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    # Start one process per consecutive pair of boundaries in `ranges`
    jobs = []
    for i, (start, stop) in enumerate(zip(ranges, ranges[1:])):
        p = multiprocessing.Process(
            target=get_features,
            args=(strings[start:stop], stages, i, return_dict),
        )
        jobs.append(p)
        p.start()

    # Wait for the result of each process
    for proc in jobs:
        proc.join()

    # Merge the per-process results feature by feature. Keys are visited in sorted
    # order rather than in the order the processes happened to finish
    # (presumably the keys are the process indices `i` - verify in get_features).
    all_list = [[] for _ in range(7)]
    for k in sorted(return_dict.keys()):
        for i, j in enumerate(return_dict[k]):
            all_list[i].extend(j)

    return all_list