In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
# Create test dataframe
lst = [['index 1',  'test text 1'],
       ['index 2',  'test text 2'],
       ['index 3',  'test text 3'],
       ['index 4',  'test text 4'],
       ['index 5',  'test text 5'],
       ['index 6',  'test text 6'],
       ['index 7',  'test text 7'],
       ['index 8',  'test text 8'],
       ['index 9',  'test text 9'],
       ['index 10', 'test text 10']
      ]

df = pd.DataFrame(lst, columns = ['Index_Name', 'Notes'])
print(df.head(10))

  Index_Name         Notes
0    index 1   test text 1
1    index 2   test text 2
2    index 3   test text 3
3    index 4   test text 4
4    index 5   test text 5
5    index 6   test text 6
6    index 7   test text 7
7    index 8   test text 8
8    index 9   test text 9
9   index 10  test text 10


In [3]:
def append_text(dataFrame,
                data_column,
                iteration_text,
                lower=None,
                upper=None):
  
  print(f'Processing {iteration_text}')
   
  column_index = dataFrame.columns.get_loc(data_column)
   
  dataFrame.iloc[:,column_index] = dataFrame.iloc[:,column_index].apply(lambda x: x + ' ' + iteration_text)
  #return f'Finish: {iteration_text}'
  return dataFrame

In [4]:
append_text(dataFrame=df,
            data_column='Notes',
            iteration_text='456')
print(df.head(10))

Processing 456
  Index_Name             Notes
0    index 1   test text 1 456
1    index 2   test text 2 456
2    index 3   test text 3 456
3    index 4   test text 4 456
4    index 5   test text 5 456
5    index 6   test text 6 456
6    index 7   test text 7 456
7    index 8   test text 8 456
8    index 9   test text 9 456
9   index 10  test text 10 456


In [5]:
from multiprocessing import Pool
from functools import partial

starmap_input = [[df, 'Notes', 'single1']
                ]

with Pool(processes=1) as pool:  
    results = pool.starmap(append_text, starmap_input)

print(df.head(10))


Processing single1
  Index_Name             Notes
0    index 1   test text 1 456
1    index 2   test text 2 456
2    index 3   test text 3 456
3    index 4   test text 4 456
4    index 5   test text 5 456
5    index 6   test text 6 456
6    index 7   test text 7 456
7    index 8   test text 8 456
8    index 9   test text 9 456
9   index 10  test text 10 456


In [6]:
# Test creation of df breakdown and building input to functions

func_input = []

df_split = np.array_split(df, 4)
for count, df_part in enumerate(df_split, start=1):
    func_input.append([df_part, 'Notes', f'counter_{count}'])

#print (func_input)

[[  Index_Name            Notes
0    index 1  test text 1 456
1    index 2  test text 2 456
2    index 3  test text 3 456, 'Notes', 'counter_1'], [  Index_Name            Notes
3    index 4  test text 4 456
4    index 5  test text 5 456
5    index 6  test text 6 456, 'Notes', 'counter_2'], [  Index_Name            Notes
6    index 7  test text 7 456
7    index 8  test text 8 456, 'Notes', 'counter_3'], [  Index_Name             Notes
8    index 9   test text 9 456
9   index 10  test text 10 456, 'Notes', 'counter_4']]


In [9]:
def parallelize_dataframe(func_input, n_cores=4):
    pool = Pool(n_cores)
    final_df = pd.concat(pool.starmap(append_text, func_input))
    pool.close()
    pool.join()
    return final_df

In [10]:
newDF = parallelize_dataframe(func_input, n_cores=4)
print(newDF.head(10))

Processing counter_2Processing counter_3Processing counter_1Processing counter_4



  Index_Name                       Notes
0    index 1   test text 1 456 counter_1
1    index 2   test text 2 456 counter_1
2    index 3   test text 3 456 counter_1
3    index 4   test text 4 456 counter_2
4    index 5   test text 5 456 counter_2
5    index 6   test text 6 456 counter_2
6    index 7   test text 7 456 counter_3
7    index 8   test text 8 456 counter_3
8    index 9   test text 9 456 counter_4
9   index 10  test text 10 456 counter_4


In [22]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [63]:
text = "Nick likes to play football, however he is not too fond of tennis. "

text_tokens = word_tokenize(text)

cleaned_text = ' '.join([word for word in text_tokens if not word in stopwords.words()])
print(f'Original: {text}')
print(f'Cleaned:  {cleaned_text}')

Original: Nick likes to play football, however he is not too fond of tennis. 
Cleaned:  Nick likes play football , fond tennis .


In [59]:
import re
test_string = 'b big dog A a c D b123 b456'
print(f'Original string:           {test_string}')

# remove all single characters
no_single = re.sub(r'\s+[a-zA-Z]\s+', ' ', test_string)
print(f'All singles removed:       {no_single}')

# Remove single characters from the start
no_single2 = re.sub(r'\^[a-zA-Z]\s+', ' ', test_string) 
print(f'Remove singles from start: {no_single2}')

# Removing prefixed 'b'
no_prefixed_b = re.sub(r'^b\s+', '', test_string)
print(f'No prefixed b:             {no_prefixed_b}')

Original string:           b big dog A a c D b123 b456
All singles removed:       b big dog a D b123 b456
Remove singles from start: b big dog A a c D b123 b456
No prefixed b:             big dog A a c D b123 b456
