In [31]:
# Jon Dinu video 3.7
import string
import json 
import pickle as pkl
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pyspark as ps
from collections import Counter
import numpy as np

In [None]:
# Reuse the SparkContext if this kernel already has one; calling the
# constructor again (e.g. when re-running this cell) raises an error because
# only one context may be active at a time.
sc = ps.SparkContext.getOrCreate()

In [33]:
# Four tiny documents used as the toy corpus for the rest of the notebook.
test_strings = [
    'the quick brown fox jumps over the brown fence.',
    'the boy paints a tall fence brown!',
    'basketball players are tall.',
    'quick basketball players jump high',
]

In [34]:
# Create the tokenize function - split the text into words
import nltk, string

def word_tokenizer(text):
    """Split ``text`` into word tokens, dropping stopwords and punctuation.

    Args:
        text: The document to tokenize, as a single string.

    Returns:
        A list of the tokens produced by ``nltk.word_tokenize``, in their
        original order, excluding English stopwords, punctuation tokens and
        the '``' token.
    """
    # Fetch the stopword list once per call and keep it in a set; evaluating
    # it inside the loop would re-fetch the whole list for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    return [
        word
        for word in nltk.word_tokenize(text)
        if word not in stop_words
        # string.punctuation is a str, so this is a substring test: it rejects
        # single punctuation characters (and the empty string).
        and word not in string.punctuation
        # '``' is not a substring of string.punctuation, so it has to be
        # filtered explicitly (needed for the document originally processed).
        and word != '``'
    ]
    

In [35]:
# Each entry of test_strings is one document (row). Distribute the documents
# as an RDD so that map() applies the tokenizer to every document for us.
test_tokens = (
    sc.parallelize(test_strings)
    .map(word_tokenizer)
)
test_tokens.collect()

[['quick', 'brown', 'fox', 'jumps', 'brown', 'fence'],
 ['boy', 'paints', 'tall', 'fence', 'brown'],
 ['basketball', 'players', 'tall'],
 ['quick', 'basketball', 'players', 'jump', 'high']]

In [36]:
# To vectorise the documents we count words per document, which first requires
# the vocabulary: every distinct token across all documents.
vocab = (
    test_tokens
    .flatMap(lambda row_of_tokens: row_of_tokens)
    .distinct()
)

In [37]:
# Bring the distinct tokens back to the driver as a list to inspect the vocabulary.
vocab.collect()

['quick',
 'jump',
 'high',
 'brown',
 'players',
 'tall',
 'paints',
 'boy',
 'jumps',
 'basketball',
 'fox',
 'fence']

In [38]:
# We need a function that can be applied to every row and returns a count vector for that row.
# Two things to note below:
#  - the collected vocabulary list is broadcast, and the function reads it through `.value`;
#  - the count vector is built in a single list comprehension over that vocabulary.

from collections import Counter
import numpy as np

# Collect the vocabulary to the driver as a list and broadcast it; bow_vectorize
# reads it back through `.value`. The order of this list fixes the column order
# of every vector produced below.
broadcastVocab = sc.broadcast(vocab.collect())
def bow_vectorize(row_of_tokens):
    """Turn one tokenized document into a bag-of-words count vector.

    Args:
        row_of_tokens: Iterable of tokens for a single document.

    Returns:
        A NumPy array with one entry per word in ``broadcastVocab.value``, in
        that order, holding how many times the word occurs in the document.
    """
    occurrences = Counter(row_of_tokens)
    # A Counter reports 0 for keys it has never seen, so words absent from the
    # document need no special casing.
    return np.array([occurrences[term] for term in broadcastVocab.value])

In [39]:
# Map every tokenized document to its bag-of-words count vector.
vector = test_tokens.map(bow_vectorize)

In [40]:
# One array per document; the columns follow the order of broadcastVocab.value.
vector.collect()

[array([1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 1]),
 array([0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]),
 array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0])]

In [43]:

# Experiment: wrap each document's token list in a NumPy array.
temp = test_tokens.map(lambda tokens: np.array(tokens))

In [44]:
# Collect the per-document token arrays back to the driver to inspect them.
temp.collect()

[array(['quick', 'brown', 'fox', 'jumps', 'brown', 'fence'], 
       dtype='|S5'), array(['boy', 'paints', 'tall', 'fence', 'brown'], 
       dtype='|S6'), array(['basketball', 'players', 'tall'], 
       dtype='|S10'), array(['quick', 'basketball', 'players', 'jump', 'high'], 
       dtype='|S10')]

In [45]:
# Evaluating the RDD itself (without collect()) only displays its repr, not its contents.
temp

PythonRDD[21] at collect at <ipython-input-44-92b1853a940d>:1