In [None]:
import dask.bag as db
import string
import csv
import graphviz
import itertools 
import collections

def create_word_count_dictionary(text):
    """Creates a word/count dictionary for all of the words in a set of text.

    The text is split on whitespace and each resulting token is counted
    exactly as it appears (no case folding, no punctuation stripping).

    Args:
        text: The string whose words should be counted.

    Returns:
        A plain ``dict`` mapping each distinct word to the number of times
        it occurs, keyed in order of first appearance.

    >>> create_word_count_dictionary("Hello world")
    {'Hello': 1, 'world': 1}

    """
    # Named `counts` rather than `dict` so the built-in type is not shadowed
    # inside this function.
    counts = {}
    for word in text.split():
        # get() supplies 0 for a word seen for the first time.
        counts[word] = counts.get(word, 0) + 1
    return counts

def remove_punctuation(text):
    """Strips punctuation characters out of text.

    Every character found in ``string.punctuation`` is dropped; all other
    characters, whitespace included, are kept in their original order.

    >>> remove_punctuation("Hello, world")
    'Hello world'

    """
    kept_chars = [char for char in text if char not in string.punctuation]
    return ''.join(kept_chars)

# Common English words that remove_stop_words() filters out; they are
# compared against the lower-cased form of each token.
stop_words = [
    "a", "an", "the", "and", "but", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "can", "will", "just",
]

def remove_stop_words(text):
    """Drops stop words from text and re-joins the rest with single spaces.

    The text is split on whitespace. A token is discarded when its
    lower-cased form appears in ``stop_words``; tokens are otherwise kept
    unchanged, so a stop word with punctuation attached (e.g. "the.") is
    not removed.

    >>> remove_stop_words("The quick brown fox jumps over the lazy dog.")
    'quick brown fox jumps lazy dog.'

    """
    kept_words = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)


In [None]:
def extract_reason(csvfile):
    """Extracts the reason text minus punctuation from csv files.

    Reads ``csvfile`` as pipe-delimited CSV with a header row, strips
    punctuation from the "Reason" field of every data row and joins the
    cleaned values with single spaces.

    Args:
        csvfile: An open text file (or other iterable of lines) accepted by
            ``csv.DictReader``. The caller remains responsible for closing it.

    Returns:
        A string consisting of two leading spaces followed by the cleaned
        reasons separated by single spaces, or a single space when the file
        has no data rows.

    Raises:
        KeyError: If a data row has no "Reason" column.

    >>> extract_reason(open("test.csv", newline=''))
    '  Undeclared Nnitrosodiethylamine NDEA Undeclared colloidal silver and lack of sterility assurance'

    """
    reader = csv.DictReader(csvfile, delimiter="|")
    # Collect the pieces and join once: re-joining the growing string for
    # every row copies everything accumulated so far each time.
    # The initial " " element reproduces the existing output format, which
    # begins with two spaces (see the doctest above).
    pieces = [" "]
    pieces.extend(remove_punctuation(row["Reason"]) for row in reader)
    return " ".join(pieces)

In [None]:
def load_from_year(year):
    """Loads the cleaned recall-reason text for a single year.

    Opens ``recalls<year>.csv`` (path relative to the current working
    directory) and passes the open file to ``extract_reason``.

    Args:
        year: Value substituted into the file name, e.g. ``2018``.

    Returns:
        The string returned by ``extract_reason`` for that file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    filename = "recalls{}.csv".format(year)
    # The context manager closes the handle as soon as the text has been
    # extracted, even if parsing raises; previously it was never closed.
    with open(filename, newline='') as csvfile:
        return extract_reason(csvfile)

In [None]:
def add_dictionaries(dict1, dict2):
    """Merges two dictionaries by summing the values of matching keys.

    Neither input is modified. Keys appear in the result in the order they
    are first encountered: all of ``dict1``'s keys, then any new keys from
    ``dict2``. The result is a ``collections.defaultdict(int)``.

    >>> add_dictionaries({"a": 1, "b": 2}, {"a":3, "c":3})
    defaultdict(<class 'int'>, {'a': 4, 'b': 2, 'c': 3})

    """
    totals = collections.defaultdict(int)

    # Walk the first mapping, then the second, accumulating per key.
    for mapping in (dict1, dict2):
        for key, amount in mapping.items():
            totals[key] += amount
    return totals
    

In [None]:
# Per-year pipeline: load the text, drop stop words, count words, then merge
# the per-year dictionaries pairwise with add_dictionaries via fold().
bag = (
    db.from_sequence([2018, 2019, 2020, 2021, 2022, 2023, 2024])
    .map(load_from_year)
    .map(remove_stop_words)
    .map(create_word_count_dictionary)
    .fold(add_dictionaries)
)
bag.visualize()

In [None]:
# Execute the pipeline currently bound to `bag` and keep its computed value;
# the bare `result` on the last line displays it as the cell output.
result = bag.compute()
result

In [None]:
# `result` comes from a pipeline ending in fold(), so it is the merged
# word -> count dictionary itself, not a list. Indexing it with [0] looked up
# the key 0 (yielding the int 0 from the defaultdict) and then failed on
# `.items()`. Sort the dictionary's own items instead, highest count first.
val_based_rev = dict(sorted(result.items(), key=lambda item: item[1], reverse=True))
print(val_based_rev)

In [None]:
# Alternative word count: split each year's text into words, flatten them
# into one bag and let frequencies() do the counting.
bag = (
    db.from_sequence([2018, 2019, 2020, 2021, 2022, 2023, 2024])
    .map(load_from_year)
    .map(remove_stop_words)
    .str.split()
    .flatten()
    .frequencies()
)

bag.visualize()

# Rebinds `result` to the output of this second pipeline and displays it.
result = bag.compute()
result