# Exercise 2: Text Processing and Classification using Spark

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window

# Attach to the already-running SparkSession if there is one; otherwise start a
# new session using the builder's defaults (no app name or config is set here).
spark: SparkSession = SparkSession.builder.getOrCreate()

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
24/05/13 17:07:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/13 17:07:44 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [2]:
# Number of top-ranked terms (by chi-squared value) kept per category.
K = 75
# Input reviews in JSON format on HDFS. Exactly one FILE_PATH line should be active.
FILE_PATH = 'hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json' # devset
# FILE_PATH = 'hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json' # full dataset

# No schema is passed to the reader, so Spark infers the columns from the data.
# NOTE(review): presumably one JSON object per line -- confirm against the input.
df = spark.read.json(FILE_PATH)
# df.head()

                                                                                

In [3]:
# Read the stopword file in one go; every line of the file becomes one entry
# of the set (duplicates collapse).
with open('../data/stopwords.txt') as f:
    stopwords = {line for line in f.read().splitlines()}

In [4]:
# Stopwords as a literal array column so they can be subtracted inside Spark.
stopwords_col = F.array([F.lit(stopword) for stopword in stopwords])

# Token pipeline, read from the innermost call outwards:
#   lower-case the review text -> split on runs of characters outside
#   [a-zA-Z<>^|] -> keep tokens longer than one character -> remove stopwords
#   -> de-duplicate within the review -> emit one row per remaining token.
tokens = F.explode(
    F.array_distinct(
        F.array_except(
            F.filter(
                F.split(F.lower(F.col('reviewText')), r'[^a-zA-Z<>^|]+'),
                lambda candidate: F.length(candidate) > 1,
            ),
            stopwords_col,
        )
    )
)

# Keep only the review identifiers, the category and the exploded token.
tokenized = df.withColumn('token', tokens).select('asin', 'reviewerID', 'category', 'token')

In [5]:
# tokenized[['asin', 'reviewerID']].dropDuplicates().count()

In [6]:
# Document counts per (category, token) that feed the chi-squared statistic:
#   n_c_t  rows of `tokenized` for this token within this category
#   n_c    number of reviews in the category
#   n_t    rows of `tokenized` for this token over all categories
#   n      number of reviews overall
#
# `tokenized` has one row per distinct token of a review, and a review is keyed
# by (category, asin, reviewerID). Weighting each row by 1 / (rows of its review)
# makes the weights of every review sum to 1, so summing weights counts reviews.
# Reviews that produced no token have no rows here and are therefore not counted.
counts = tokenized.withColumn('t_c_d', 1 / F.count(F.expr('*')).over(Window.partitionBy('category', 'asin', 'reviewerID')))
counts = counts.groupBy(['category', 'token']).agg(F.count(F.expr('*')).alias('n_c_t'), F.sum('t_c_d').alias('t_c_d'))
# The weights are fractional floats, hence round() before the integer cast.
counts = counts.withColumn('n_c', F.round(F.sum('t_c_d').over(Window.partitionBy('category'))).cast('integer'))
counts = counts.withColumn('n_t', F.sum('n_c_t').over(Window.partitionBy('token')))
# A constant partition key yields a single global window: every row receives
# the grand total of the review weights.
counts = counts.withColumn('n', F.round(F.sum('t_c_d').over(Window.partitionBy(F.lit(0)))).cast('integer'))
counts = counts[['category', 'token', 'n_c_t', 'n_c', 'n_t', 'n']]

In [7]:
# Chi-squared score of every (category, token) pair from its 2x2 contingency table:
#   a = n_c_t              rows with the token, in the category
#   b = n_c - a            rest of the category
#   c = n_t - a            token occurrences outside the category
#   d = n - a - b - c      everything else
#   chi^2 = n * (a*d - b*c)^2 / ((a+b) * (c+d) * (a+c) * (b+d))
#
# `a` is cast to double so that b, c, d and every product below are evaluated in
# floating point. With the integral count columns the four-factor denominator is
# computed as a 64-bit integer and exceeds its range once the counts reach the
# size of the full dataset.
# A zero denominator yields NULL under Spark's default (non-ANSI) division.
chisq = counts.withColumn('a', F.col('n_c_t').cast('double'))
chisq = chisq.withColumn('b', F.col('n_c') - F.col('a'))
chisq = chisq.withColumn('c', F.col('n_t') - F.col('a'))
chisq = chisq.withColumn('d', F.col('n') - F.col('a') - F.col('b') - F.col('c'))
chisq = chisq.withColumn('chi_squared', F.col('n') * ((F.col('a') * F.col('d') - F.col('b') * F.col('c')) ** 2) / ((F.col('a') + F.col('b')) * (F.col('c') + F.col('d')) * (F.col('a') + F.col('c')) * (F.col('b') + F.col('d'))))
chisq = chisq[['category', 'token', 'chi_squared']]

In [8]:
# Keep the K highest-scoring tokens per category and gather them into one row
# per category: column `topk` is an array of [token, chi_squared] pairs in
# rank order, and the rows are sorted by category name.
# Ties in chi_squared are broken alphabetically by token so ranks are stable.
topk = chisq.withColumn('rank', F.row_number().over(Window.partitionBy('category').orderBy(F.desc('chi_squared'), F.asc('token'))))
topk = topk.filter(F.col('rank') <= K)
# Mixed string/double array: same construction as before, the pair is read
# back positionally ([0] token, [1] score) when the output is written.
topk = topk.withColumn('token_chisq', F.array('token', 'chi_squared'))
# collect_list gives no ordering guarantee after the groupBy shuffle, so the
# rank is collected alongside each pair and the list is sorted on it explicitly
# (structs compare field by field, and rank is unique within a category).
topk = topk.groupBy('category').agg(F.sort_array(F.collect_list(F.struct('rank', 'token_chisq'))).alias('ranked'))
# Project the pair back out of each struct; the rank was only needed for sorting.
topk = topk.select('category', F.col('ranked.token_chisq').alias('topk'))
topk = topk.sort('category')

In [10]:
# Output format: one line per category, "<category> token:score token:score ...",
# followed by a final line with all selected tokens merged and sorted.
with open('../output_rdd.txt', 'w') as f:
    tokens = set()

    # Stream the (already category-sorted) rows to the driver one at a time.
    for row in topk.toLocalIterator():
        # Each element of row['topk'] is a [token, score] pair.
        tokens.update(pair[0] for pair in row['topk'])
        line_parts = [f'<{row["category"]}>']
        line_parts.extend(f'{pair[0]}:{pair[1]}' for pair in row['topk'])
        f.write(' '.join(line_parts) + '\n')

    f.write(' '.join(sorted(tokens)) + '\n')