In [None]:
import string
from functools import partial

import dask
import dask.dataframe as dd
import pandas as pd

def remove_punctuation(text):
    """Removes ASCII punctuation (the characters in ``string.punctuation``) from text.

    Non-string values are returned unchanged. When this function is mapped
    over a pandas column, missing cells arrive as float NaN (or None), and
    iterating over them would raise ``TypeError``.

    >>> remove_punctuation("Hello, world")
    'Hello world'
    >>> remove_punctuation(None) is None
    True

    """
    if not isinstance(text, str):
        # Missing value or other non-text cell: nothing to strip.
        return text
    return ''.join(char for char in text if char not in string.punctuation)

# Lower-case words dropped by remove_stop_words (matching is case-insensitive).
stop_words = [
    "a", "an", "the", "and", "but", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down", "in",
    "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own",
    "same", "so", "than", "too", "very", "can", "will", "just",
]

def remove_stop_words(text):
    """Removes stop words from text.

    The text is split on whitespace, every token whose lower-cased form is in
    ``stop_words`` is dropped, and the remaining tokens are re-joined with
    single spaces. Tokens are compared as-is, so a word with punctuation
    attached (``"dog."``) is kept even if the bare word is a stop word.

    Non-string values (e.g. the float NaN pandas uses for missing cells, or
    None) are returned unchanged instead of raising ``AttributeError``.

    >>> remove_stop_words("The quick brown fox jumps over the lazy dog.")
    'quick brown fox jumps lazy dog.'
    >>> remove_stop_words(None) is None
    True

    """
    if not isinstance(text, str):
        # Missing value or other non-text cell: nothing to filter.
        return text
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# Apply a function to every value of one named column, provided that column holds strings
def apply_to_strings(df, func, col_name):
    """Map ``func`` over the text column ``col_name`` and return the resulting frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform (typically one partition passed by ``map_partitions``).
    func : callable
        Called once per cell of the column; its return value replaces the cell.
    col_name : hashable
        Label of the column to transform.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``col_name`` is absent or is not a text column;
        otherwise a shallow copy whose ``col_name`` column has been replaced.
        The input frame is never modified, so the function is safe to run
        repeatedly on the same partition.
    """
    if col_name not in df.columns:
        return df

    # Inspect the dtype rather than the values: text may be stored as plain
    # ``object`` or as pandas' dedicated string dtype, and a comparison against
    # 'object' alone would silently skip the latter.
    if not pd.api.types.is_string_dtype(df[col_name].dtype):
        return df

    # A shallow copy shares the untouched columns; assigning a whole column
    # rebinds it on the copy only and leaves the caller's frame as it was.
    result = df.copy(deep=False)
    result[col_name] = df[col_name].map(func)
    return result

In [None]:
def read_csv(filename):
    """Lazily load one pipe-delimited CSV as a Dask DataFrame, without its ``Url`` column.

    The column types are handed to the reader through ``dtype=`` so they are
    applied while parsing. A later ``astype`` call only takes effect if its
    return value is kept, and converting to ``str`` after the fact would turn
    missing cells into the literal text ``"nan"``.

    Parameters
    ----------
    filename : str
        Path (or glob) accepted by ``dask.dataframe.read_csv``.

    Returns
    -------
    dask.dataframe.DataFrame
        The parsed frame with the ``Url`` column dropped.

    Notes
    -----
    ``Year`` is read as a plain integer, so a file with a blank ``Year`` cell
    will raise when the data is actually read.
    """
    column_types = {
        'Year': int,
        'Brand': str,
        'Company': str,
        'Date': str,
        'ProductDescription': str,
        'Url': object,
    }
    df = dd.read_csv(filename, delimiter="|", sample=10000, dtype=column_types)
    return df.drop(columns=["Url"])

In [None]:
# Load one lazy frame per year (2018 through 2024) and concatenate them in a
# single call; concatenating pairwise inside a loop nests one concat per file
# in the task graph for no benefit.
yearly_frames = [read_csv("recalls{}.csv".format(year)) for year in range(2018, 2025)]
df = dd.concat(yearly_frames)

# Word-frequency pipeline over the "Reason" column:
#   1. strip punctuation, then stop words, partition by partition;
#   2. split each cleaned reason into a list of words and explode it so every
#      word gets its own row;
#   3. group by word; count() gives the number of non-null values per column;
#   4. order the words by their "Year" count, most frequent first.
ddd = df.map_partitions(partial(apply_to_strings, func=remove_punctuation, col_name="Reason")) \
    .map_partitions(partial(apply_to_strings, func=remove_stop_words, col_name="Reason")) \
    .assign(reason_list = lambda x: x["Reason"].str.split()) \
    .explode("reason_list") \
    .groupby("reason_list") \
    .count() \
    .sort_values("Year", ascending=False)

# Render the (still lazy) task graph.
ddd.visualize()

In [None]:
# Run the pipeline once and keep the result. Calling compute() without storing
# it and then head() on the lazy frame would execute the whole graph twice.
word_counts = ddd.compute()
word_counts.head(10)