In [1]:
def tokenize(document):
    """Split ``document`` into whitespace-separated tokens.

    Commas and periods are deleted before splitting; no other
    normalisation is applied (case and hyphens are kept as-is).

    Returns a list of token strings.
    """
    stripped = document
    for mark in (',', '.'):
        stripped = stripped.replace(mark, '')
    return stripped.split()

In [2]:
# Sample corpus: eight text passages about kernel PCA applied to a stock
# trading model. Used below as input for tokenization and word counting.
# NOTE(review): "map- ping" in the fourth entry looks like a line-break
# hyphenation artifact; it is kept verbatim and tokenizes as 'map-' and 'ping'.
documents = [
    "This paper presents a kernel-based principal component analysis, kernel PCA, to extract critical features for improving the performance of a stock trading model. ",
    "The feature extraction method is one of the techniques to solve dimensionality reduction problems.",
    "The kernel PCA is a feature extraction approach which has been applied to data transformation from known variables to capture critical information.",
    "The kernel PCA is a kernel-based data map- ping tool that has characteristics of both principal component analysis and non-linear mapping.",
    "The feature selection method is another DRP technique that selects only a small set of features from known variables, but these features still indicate possible collinearity problems that fail to reflect clear information.",
    "However, most feature extraction methods use a variable mapping application to eliminate noisy and collinear variables. In this research, we use the kernel-PCA method in a stock trading model to transform stock technical indices which allows features of smaller dimension to be formed.",
    "The kernel-PCA method has been applied to various stocks and sliding window testing methods using both half-year and 1-year testing strategies. The experimental results show that the proposed method generates more profits than other DRP methods on the America stock market.",
    "This stock trading model is very practical for real-world application, and it can be implemented in a real-time environment."
]

In [3]:
# Tokenize every document in the corpus; one token list per document.
tokenized_documents = list(map(tokenize, documents))

In [11]:
# Peek at the first five tokens of the first document.
tokenized_documents[0][:5]

['This', 'paper', 'presents', 'a', 'kernel-based']

In [13]:
def wc_mapper(document):
    """Map step of the word count.

    Lazily yields one ``(word, 1)`` pair for each token that
    ``tokenize(document)`` produces, in token order.
    """
    yield from ((token, 1) for token in tokenize(document))

In [14]:
def wc_reducer(word, counts):
    """Reduce step of the word count.

    Yields a single ``(word, total)`` pair, where ``total`` is the sum of
    the values in ``counts``.
    """
    total = sum(counts)
    yield word, total

In [30]:
from collections import defaultdict
def word_count(documents):
    """Count word occurrences across ``documents`` with a map/reduce flow.

    Each document is passed through ``wc_mapper``; the emitted counts are
    grouped by word, and every group is then collapsed by ``wc_reducer``.

    Returns a list of the pairs produced by ``wc_reducer``, one group at a
    time, in the iteration order of the grouping dict.
    """
    counts_by_word = defaultdict(list)

    # Map + group: gather every emitted count under its word.
    for text in documents:
        for token, emitted in wc_mapper(text):
            counts_by_word[token].append(emitted)

    # Reduce: collapse each word's list of counts into its output(s).
    results = []
    for token, emitted_counts in counts_by_word.items():
        results.extend(wc_reducer(token, emitted_counts))
    return results

In [31]:
# Tiny three-document corpus; small enough to check the word counts by hand.
documents_test = ["data science", "big data", "science fiction"]


In [32]:
# Sanity-check word_count on the small test corpus.
word_count(documents_test)

[('data', 2), ('science', 2), ('big', 1), ('fiction', 1)]

In [34]:
# Call the mapper on the first test document. wc_mapper uses `yield`, so this
# only creates a generator; no pairs are produced until it is iterated.
map_0 = wc_mapper(documents_test[0])

In [36]:
# Confirm that calling wc_mapper returned a generator object.
type(map_0)

generator

In [37]:
# Pull the first (word, 1) pair out of the generator.
next(map_0)

('data', 1)

In [38]:
# Each next() call advances the generator by one token; this gives the second pair.
next(map_0)

('science', 1)

In [39]:
from collections import defaultdict

In [40]:
# Grouping step done by hand: run the mapper over each test document and
# append every emitted count to the list stored under its word.
collector = defaultdict(list)
for document in documents_test:
    for word, count in wc_mapper(document):
        collector[word].append(count)

In [41]:
# Inspect the grouped result: each word maps to the list of counts emitted for it.
collector

defaultdict(list,
            {'big': [1], 'data': [1, 1], 'fiction': [1], 'science': [1, 1]})

In [42]:
# Reduce step done by hand: feed each (word, counts) group to wc_reducer and
# flatten everything it yields into one list.
[output for word, counts in collector.items()
            for output in wc_reducer(word, counts)]

[('data', 2), ('science', 2), ('big', 1), ('fiction', 1)]

In [43]:
def map_reduce(inputs, mapper, reducer):
    """Run a generic, in-memory map/reduce over ``inputs``.

    Parameters
    ----------
    inputs : iterable
        Items handed one at a time to ``mapper``.
    mapper : callable
        Called as ``mapper(item)``; must return an iterable of
        ``(key, value)`` pairs.
    reducer : callable
        Called as ``reducer(key, values)`` with the list of values grouped
        under ``key``; must return an iterable of outputs.

    Returns
    -------
    list
        All outputs produced by ``reducer``, one key at a time, in the
        iteration order of the grouping dict.
    """
    grouped = defaultdict(list)

    # Map phase: group every emitted value under its key.
    # (This loop could be rewritten as a parallel for.)
    for item in inputs:
        for key, value in mapper(item):
            grouped[key].append(value)

    # Reduce phase: collapse each key's values into its output(s).
    # (This loop could also be rewritten as a parallel for.)
    results = []
    for key, values in grouped.items():
        results.extend(reducer(key, values))
    return results

In [44]:
# Run the generic driver on the full corpus with the word-count mapper and reducer.
map_reduce(documents, wc_mapper, wc_reducer)

[('This', 2),
 ('paper', 1),
 ('presents', 1),
 ('a', 8),
 ('kernel-based', 2),
 ('principal', 2),
 ('component', 2),
 ('analysis', 2),
 ('kernel', 3),
 ('PCA', 3),
 ('to', 9),
 ('extract', 1),
 ('critical', 2),
 ('features', 4),
 ('for', 2),
 ('improving', 1),
 ('the', 5),
 ('performance', 1),
 ('of', 5),
 ('stock', 5),
 ('trading', 3),
 ('model', 3),
 ('The', 6),
 ('feature', 4),
 ('extraction', 3),
 ('method', 5),
 ('is', 5),
 ('one', 1),
 ('techniques', 1),
 ('solve', 1),
 ('dimensionality', 1),
 ('reduction', 1),
 ('problems', 2),
 ('approach', 1),
 ('which', 2),
 ('has', 3),
 ('been', 2),
 ('applied', 2),
 ('data', 2),
 ('transformation', 1),
 ('from', 2),
 ('known', 2),
 ('variables', 3),
 ('capture', 1),
 ('information', 2),
 ('map-', 1),
 ('ping', 1),
 ('tool', 1),
 ('that', 4),
 ('characteristics', 1),
 ('both', 2),
 ('and', 5),
 ('non-linear', 1),
 ('mapping', 2),
 ('selection', 1),
 ('another', 1),
 ('DRP', 2),
 ('technique', 1),
 ('selects', 1),
 ('only', 1),
 ('small', 1)