# Setup
Before starting with the tasks of the assignment, initialize spark.


In [1]:
import sys
from pathlib import Path

from pyspark.sql.types import FloatType

p = Path().resolve()
BASE_PATH =  p.parent if p.name == "src" else p

# To be sure that the src path is in PYTHONPATH
# TODO: Check if this requirement exists
sys.path.append(str(BASE_PATH / "src"))

import operator
import json
import numpy as np
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark import RDD, SparkContext
from pyspark.ml.feature import (
    IDF,
    ChiSqSelector,
    ChiSqSelectorModel,
    CountVectorizer,
    RegexTokenizer,
    HashingTF,
    StopWordsRemover,
    StringIndexer,
    VectorIndexer,
)

from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from exercise2.model.review import Review
from exercise2.split_text import split_text
from exercise2.task1.util import calculate_chi_squares, merge_dicts, printable_category, calculate_chi_square_per_token

LOCAL = True

if LOCAL:
    spark: SparkSession = SparkSession.builder \
        .appName("local") \
        .config("spark.driver.host", "localhost") \
        .config("spark.driver.bindAddress", "localhost") \
        .getOrCreate()
    sc: SparkContext = spark.sparkContext
else:
    spark: SparkSession = SparkSession.builder \
        .appName("cluster") \
        .config("spark.executor.instances", 435) \
        .getOrCreate()
    sc: SparkContext = spark.sparkContext
    sc.addPyFile(str(BASE_PATH / "src" / "exercise2.zip"))

24/05/27 15:38:30 WARN Utils: Your hostname, Julians-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.0.178 instead (on interface en0)
24/05/27 15:38:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/27 15:38:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Example 1: RDD
Redo the first assignment, this time utilizing RDDs. 

Start by loading the reviews dataset:

In [2]:
if LOCAL:
    reviews_location = BASE_PATH / "resource" / "reviews_devset_first1000.json" #"reviews_devset.json"
else:
    # reviews_location = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
    reviews_location = "hdfs:///user/dic24_shared/amazon-reviews/full/reviewscombined.json"
reviews: RDD = sc.textFile(str(reviews_location)).map(json.loads)
reviews_cnt = reviews.count()

Once the reviews are available, count the documents per category.

In [3]:
category_counts = reviews.map(lambda r: (r["category"], 1)) \
        .reduceByKey(operator.add) \
        .collectAsMap()



Split the reviews text into separate tokens, filter them using the stopwords (loaded from disk)
and reduce the produced counts into a map of maps with token on the top level, each token is 
assigned to a map, which contains the review counts for each category.

In [4]:
with open(BASE_PATH / "resource" / "stopwords.txt", "r") as file:
    stopwords = set([line.strip() for line in file.readlines()])

# <term>: {<cat>: cnt, <cat>: cnt, ...}
category_counts_per_token = reviews \
        .flatMap(lambda r: ([(token, r["category"]) for token in set(split_text(r["reviewText"]))])) \
        .filter(lambda r: r[1] not in stopwords) \
        .mapValues(lambda category: {category: 1}) \
        .reduceByKey(lambda dict1, dict2: merge_dicts(dict1, dict2))

Now that we have the counts per token and category, we calculate the chi square values and sort
the values to filter for the top 75 tokens per category.

In [5]:
top75_tokens_per_category: RDD = category_counts_per_token \
        .flatMap(lambda cur_category_counts: calculate_chi_square_per_token(cur_category_counts, category_counts, reviews_cnt)) \
        .groupByKey().mapValues(list) \
        .mapValues(lambda val: sorted(val, key=lambda val: val[1])[:-75:-1]) \
        .sortByKey()

Prepare the job result by concenating all tokens to a list of all top tokens 
and convert the lists to strings for printing.

In [6]:
top75_tokens_str = "\n".join(
        top75_tokens_per_category.map(lambda el: printable_category(el[0], el[1])).collect())

top_tokens: list[str] = top75_tokens_per_category \
        .flatMap(lambda el: [tup[0] for tup in el[1]]) \
        .distinct() \
        .sortBy(lambda el: el).collect()
top_tokens_str = " ".join(top_tokens)

result = top_tokens_str + '\n' + top75_tokens_str
result

'always and android app apps available bar besides bookmarks books bottom call can channel chapter choice delight device doing downloading dr especially exactly excellent fast figured fire five fix free friendly gripe hd helps highlights home hopefully in installation installed instant instead involved k kindle laptop listen listings lock masih menu much neat net notes now often operate panel pay pc phone picked police possible pre prime process provides read reading remember scanner screen scripture select setup slide something spend start status synchronized tablet tech their think thisscanner thought till to tremendously user verse very what whisper with worthwhile you\n<Apps_for_Android> app:331.99732798931194 kindle:220.21822229487478 synchronized:165.8324991658325 books:165.8324991658325 bookmarks:165.8324991658325 pc:165.8324991658325 dr:165.8324991658325 android:165.8324991658325 downloading:165.8324991658325 tech:165.8324991658325 whisper:165.8324991658325 police:165.832499165

Write the result to a file.

In [7]:
with open(BASE_PATH / "output_rdd.txt", "w") as file:
    file.writelines(result)

# Example 2: DataFrames: Spark ML & Pipelines


In [8]:
reviews_df = spark.read.json(str(reviews_location))
reviews_df.head()

Row(asin='0981850006', category='Patio_Lawn_and_Garde', helpful=[6, 7], overall=5.0, reviewText="This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!", reviewTime='12 3, 2009', reviewerID='A2VNYWOPJ13AFP', reviewerName='Amazon Customer "carringt0n"', summary='Delish', unixReviewTime=1259798400)

In [9]:
tokenizer = RegexTokenizer(
    minTokenLength=2,
    pattern="[\s\d\(\)\[\]{}\.!\?,;:\+=\-_\"'`~#@&\*%€\$§\\\/]",
    inputCol="reviewText",
    outputCol="tokens",
)
stopwords_remover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(), outputCol="stopWordsFiltered"
)
indexer = StringIndexer(
    inputCol="category",
    outputCol="indexedCategory"
)
vectorizer = CountVectorizer(inputCol=stopwords_remover.getOutputCol(), outputCol="TF")
idf = IDF(inputCol=vectorizer.getOutputCol(), outputCol="TFIDF")
chiSqSelector = ChiSqSelector(
    numTopFeatures=2000,
    featuresCol=idf.getOutputCol(),
    labelCol=indexer.getOutputCol(),
    outputCol="chiSq",
)

  pattern="[\s\d\(\)\[\]{}\.!\?,;:\+=\-_\"'`~#@&\*%€\$§\\\/]",


In [10]:
pipeline = Pipeline(
    stages=[
        tokenizer,
        stopwords_remover,
        indexer,
        vectorizer,
        idf,
        chiSqSelector
    ]
)


model = pipeline.fit(reviews_df)
tokenized = model.transform(reviews_df)
tokenized.head(n=10)

24/05/27 15:38:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/05/27 15:38:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/05/27 15:38:36 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/05/27 15:38:41 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/05/27 15:38:42 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


[Row(asin='0981850006', category='Patio_Lawn_and_Garde', helpful=[6, 7], overall=5.0, reviewText="This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!", reviewTime='12 3, 2009', reviewerID='A2VNYWOPJ13AFP', reviewerName='Amazon Customer "carringt0n"', summary='Delish', unixReviewTime=1259798400, tokens=['this', 'was', 'gift', 'for', 'my', 'other', 'husband', 'he', 'making', 'us', 'things', 'from', 'it', 'all', 'the', 'time', 'and', 'we', 'love', 'the', 'food', 'directions', 'are', 'simple', 'easy', 'to', 'read', 'and', 'interpret', 'and', 'fun', 'to', 'make', 'we', 

In [None]:
vocabulary: list[str] = model.stages[3].vocabulary
chiSqSelectorModel: ChiSqSelectorModel = model.stages[5]
selected_tokens = [vocabulary[i] for i in chiSqSelectorModel.selectedFeatures]
len(selected_tokens)

with open(BASE_PATH / "output_ds.txt", "w") as file:
    file.writelines(" ".join(selected_tokens))

First we split the data in a train and a test set.

In [None]:
train_df, test_df = tokenized.randomSplit([8.0, 2.0], seed=1)

The chunk below sets up the cross validation pipeline.
We first set the Support Vector Machine and to use it with a multi classification problem, One vs Rest is used.
For the Cross validation we use the F1 Score to optimize,
The final step of setup is creating the grid according to the task description.
Finally we can create the cross validator.

In [None]:
svc = LinearSVC()
ovr = OneVsRest(classifier=svc, featuresCol="chiSq", labelCol="indexedCategory")
f1_score = MulticlassClassificationEvaluator(metricName="f1", labelCol="indexedCategory")

param_grid = ParamGridBuilder().addGrid(svc.maxIter, [10, 100]).addGrid(svc.regParam, [0, 0.01, 0.1]).addGrid(svc.standardization, [False, True]).build()
cv = CrossValidator(estimator=ovr, estimatorParamMaps=param_grid, evaluator=f1_score, numFolds=2)

Below the cross validator is fit to the training data.

In [12]:
cv_model = cv.fit(train_df)

24/05/27 15:38:42 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:42 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/27 15:38:43 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 15:38:43 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
24/05/27 

Once the classifier is trained we can retrieve the best one from cross validation

In [32]:
best_classifier = cv_model.bestModel
best_maxIter = best_classifier.getClassifier()._java_obj.getMaxIter()
best_regParam = best_classifier.getClassifier()._java_obj.getRegParam()
best_standardization = best_classifier.getClassifier()._java_obj.getStandardization()

print(f"The best model has the parameters: maximum itterations = {best_maxIter}, regularisation = {best_regParam}, standarization = {best_standardization}")

The best model has the parameters: maximum itterations = 10, regularisation = 0.01, standarization = False


In [14]:
train_res = best_classifier.transform(train_df)

In [15]:
train_res.head(1)

24/05/27 15:42:10 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB


[Row(asin='0981850006', category='Patio_Lawn_and_Garde', helpful=[6, 7], overall=5.0, reviewText="This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!", reviewTime='12 3, 2009', reviewerID='A2VNYWOPJ13AFP', reviewerName='Amazon Customer "carringt0n"', summary='Delish', unixReviewTime=1259798400, tokens=['this', 'was', 'gift', 'for', 'my', 'other', 'husband', 'he', 'making', 'us', 'things', 'from', 'it', 'all', 'the', 'time', 'and', 'we', 'love', 'the', 'food', 'directions', 'are', 'simple', 'easy', 'to', 'read', 'and', 'interpret', 'and', 'fun', 'to', 'make', 'we', 

In [16]:
test_res = best_classifier.transform(test_df)

In [17]:
test_res.head(1)

24/05/27 16:04:49 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
                                                                                

[Row(asin='B00002N8K3', category='Patio_Lawn_and_Garde', helpful=[4, 5], overall=1.0, reviewText="This hose is supposed to be flexible.  Its hard, heavy and unwieldy.  I don't know if it kinks or not, because I could tell, as soon as I removed it from the box, that I would have to return it.  If you want a lightweight, soft hose (and I know they exist because I have one), do not buy this thing.", reviewTime='07 13, 2013', reviewerID='A2SX9YPPGEUADI', reviewerName='HappyCamper "Happy Housewife"', summary='The worst', unixReviewTime=1373673600, tokens=['this', 'hose', 'is', 'supposed', 'to', 'be', 'flexible', 'its', 'hard', 'heavy', 'and', 'unwieldy', 'don', 'know', 'if', 'it', 'kinks', 'or', 'not', 'because', 'could', 'tell', 'as', 'soon', 'as', 'removed', 'it', 'from', 'the', 'box', 'that', 'would', 'have', 'to', 'return', 'it', 'if', 'you', 'want', 'lightweight', 'soft', 'hose', 'and', 'know', 'they', 'exist', 'because', 'have', 'one', 'do', 'not', 'buy', 'this', 'thing'], stopWordsFi

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
import pyspark.sql.functions as F
from pyspark.sql import Fl

preds_and_labels = train_res.select(['prediction','indexedCategory']).withColumn('label', F.col('d').cast(FloatType())).orderBy('prediction')

#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','label'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())