# Exercise 2: Text Processing and Classification using Spark

In [19]:
from pyspark.sql import SparkSession
from operator import add
import re

# Reuse the already-running Spark session if there is one, otherwise create it.
spark: SparkSession = SparkSession.builder.getOrCreate()

In [20]:
# Load the stopword list (one word per line) into a set for O(1) lookups.
# The encoding is pinned so the result does not depend on the machine's locale,
# and entries are stripped because tokens never contain whitespace: a stopword
# with stray whitespace could otherwise never match. Blank lines are dropped.
with open('../data/stopwords.txt', encoding='utf-8') as f:
    stopwords = {word for word in map(str.strip, f.read().splitlines()) if word}

In [21]:
%%time
# Maximum number of top-ranked terms kept per category (slice limit used in chi_squared).
K = 75
# Location of the review data that is read with Spark's JSON reader below.
FILE_PATH = 'hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json' # devset
# FILE_PATH = 'hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json' # full dataset

# Read the reviews as a DataFrame and expose them as an RDD of rows.
rdd = spark.read.json(FILE_PATH).rdd

CPU times: user 2.11 ms, sys: 2.1 ms, total: 4.2 ms
Wall time: 788 ms


                                                                                

In [22]:
%%time
def preprocessing(row, stopword_set=None):
    """Emit the document-frequency contributions of a single review.

    Args:
        row: Mapping-like record with the fields ``'category'`` and
            ``'reviewText'`` (indexed with ``row[...]``). A ``reviewText`` of
            ``None`` is treated as an empty text.
        stopword_set: Container of tokens to discard. Defaults to the
            notebook-level ``stopwords`` set when omitted.

    Yields:
        ``((category, None), 1)`` once per review (counts the documents of the
        category), followed by ``((category, token), 1)`` for every distinct
        remaining token of the review.
    """
    if stopword_set is None:
        stopword_set = stopwords

    category = row['category']

    # A missing/null review text would otherwise crash on `.lower()`; such a
    # review still counts as a document of its category but has no tokens.
    text = row['reviewText'] or ''

    # Case-fold, then split on every run of characters that is not an ASCII
    # letter or one of < > ^ |. For ASCII text this is the delimiter set
    # whitespace, tabs, digits and ()[]{}.!?,;:+=-_"'`~#@&*%$\/ ; note that
    # every non-ASCII character (including € and §) acts as a delimiter too.
    raw_tokens = re.split(r'[^a-zA-Z<>^|]+', text.lower())

    # Drop empty/one-character tokens and stopwords; the set removes duplicates
    # so that each token is counted at most once per document.
    tokens = {
        token
        for token in raw_tokens
        if len(token) > 1 and token not in stopword_set
    }

    # count all documents in category
    yield (category, None), 1

    # count all documents in category containing token
    for token in tokens:
        yield (category, token), 1


def token_to_key(row):
    """Re-key a ``((category, token), count)`` record by its token.

    Returns:
        ``(token, (category, count))``.
    """
    key, count = row
    category, token = key
    return (token, (category, count))


def token_sum(row):
    """Attach the overall count of a token to each of its per-category counts.

    Args:
        row: ``(token, values)`` where ``values`` iterates over
            ``(category, count)`` pairs.

    Yields:
        ``(category, (token, count, n_t))`` for every category, where ``n_t``
        is the sum of the counts over all categories of this token.
    """
    token, pairs = row
    per_category = dict(pairs)
    overall = sum(per_category.values())

    yield from (
        (category, (token, count, overall))
        for category, count in per_category.items()
    )


def chi_squared(row, k=None):
    """Rank the tokens of one category by their chi-squared statistic.

    Args:
        row: ``(category, values)`` where ``values`` iterates over
            ``(token, count, n_t)`` triples. The triple whose token is
            ``None`` carries ``(n_c, n)``: the number of documents in the
            category and in the whole dataset.
        k: Maximum number of tokens to keep. Defaults to the notebook-level
            constant ``K`` when omitted.

    Returns:
        ``(category, ranked)`` where ``ranked`` is a list of
        ``(chi_squared, token)`` pairs sorted by descending score (ties broken
        by ascending token), truncated to ``k`` entries.

    Raises:
        KeyError: If ``values`` has no entry for the ``None`` token.
    """
    if k is None:
        k = K

    category, values = row

    # dictionary of tokens with their counts and total number of documents
    counts = {token: (count, n_t) for token, count, n_t in values}

    # total number of documents in category and dataset
    n_c, n = counts.pop(None)

    result = []

    for token, (a, n_t) in counts.items():
        # 2x2 contingency table of "document contains token" vs "document in category"
        b = n_t - a          # contains token, other categories
        c = n_c - a          # in category, without token
        d = n - a - b - c    # neither

        denominator = (a + b) * (a + c) * (b + d) * (c + d)
        if denominator == 0:
            # A zero marginal (token in every document, or only one category)
            # means the token cannot discriminate; score it 0 instead of
            # failing with ZeroDivisionError.
            score = 0.0
        else:
            score = n * ((a * d - b * c) ** 2) / denominator
        result.append((score, token))

    result.sort(key=lambda item: (-item[0], item[1]))
    return category, result[:k]

CPU times: user 23 µs, sys: 0 ns, total: 23 µs
Wall time: 26.7 µs


In [23]:
%%time
# Lazy pipeline producing (category, top terms) records ordered by category.
topk = (
    rdd.flatMap(preprocessing)   # ((category, token | None), 1) per document
    .reduceByKey(add)            # document counts per (category, token)
    .map(token_to_key)           # re-key by token
    .groupByKey()
    .flatMap(token_sum)          # add the per-token total, re-key by category
    .groupByKey()
    .map(chi_squared)            # rank the tokens within each category
    .sortByKey()
)

                                                                                

CPU times: user 33.1 ms, sys: 13.8 ms, total: 46.9 ms
Wall time: 10.4 s


In [24]:
%%time
# One line per category ("<category> term:score ...") followed by a final line
# with the merged vocabulary of all categories in sorted order.
with open('../output_rdd.txt', 'w') as f:
    tokens = set()

    for category, values in topk.toLocalIterator():
        line_parts = [f'<{category}>']
        for score, token in values:
            tokens.add(token)
            line_parts.append(f'{token}:{score}')
        f.write(' '.join(line_parts) + '\n')

    f.write(' '.join(sorted(tokens)) + '\n')

CPU times: user 11.2 ms, sys: 1.62 ms, total: 12.8 ms
Wall time: 740 ms
