In [1]:
import pyspark.pandas as ps
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes, MultilayerPerceptronClassifier

spark = SparkSession.builder.master('local[*]').config("spark.driver.memory", "6g").config("spark.executor.memory", "6g").config("spark.memory.offHeap.enabled","true").config("spark.memory.offHeap.size","12g").config("spark.driver.maxResultSize", "6g").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/22 23:08:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/22 23:08:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Calculate tf-idf

In [2]:
df = ps.read_parquet('/data/data.parquet', index_col=['reviewerID', 'asin'])
df['text'] = df['summary'] + ' ' + df['reviewText']
df = df[['category', 'overall', 'text']]
df: DataFrame = df.reset_index().to_spark()
df = df.dropna(subset=['text'])
df = df.sample(fraction=1e-3) # remove this line to run on whole dataset
train, test = df.randomSplit([0.9, 0.1], seed=42)



In [3]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=32)
idf = IDF(inputCol="rawFeatures", outputCol="features")

tfidf_pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])

In [4]:
tfidf_train = tfidf_pipeline.fit(train).transform(train)
tfidf_train.write.parquet('/data/tfidf_train.parquet')

24/01/22 21:12:47 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [5]:
tfidf_test = tfidf_pipeline.fit(test).transform(test)
tfidf_test.write.parquet('/data/tfidf_test.parquet')

                                                                                

# Models

In [2]:
train = spark.read.parquet('/data/tfidf_train.parquet')
test = spark.read.parquet('/data/tfidf_test.parquet')

                                                                                

In [3]:
train = train.withColumn('overall', F.col('overall') - 1)
test = test.withColumn('overall', F.col('overall') - 1)

In [4]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

def evaluate(predictions, model_name):
    evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="weightedPrecision")
    precision = evaluator.evaluate(predictions)

    evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="weightedRecall")
    recall = evaluator.evaluate(predictions)

    evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="f1")
    f1 = evaluator.evaluate(predictions)

    return pd.DataFrame({'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1': [f1]}, index=[model_name])


# Baseline

In [16]:
baseline_prediction = test.withColumn('prediction', F.lit(4.0))

baseline_eval = evaluate(baseline_prediction, 'Baseline')
baseline_eval

                                                                                

Unnamed: 0,Accuracy,Precision,Recall,F1
Baseline,0.637356,0.406222,0.637356,0.496193


## Logistic Regression

In [5]:
lr = LogisticRegression(featuresCol="features", labelCol="overall", maxIter=10)
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)

lr_eval = evaluate(lr_predictions, 'Logistic Regression')
lr_eval

24/01/22 23:08:42 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/01/22 23:08:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/01/22 23:08:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.634888,0.467366,0.634888,0.505608


## Random Forest

In [6]:
rf = RandomForestClassifier(labelCol="overall", featuresCol="features", numTrees=10)
rf_model = rf.fit(train)
rf_predictions = rf_model.transform(test)

rf_eval = evaluate(rf_predictions, 'Random Forest')
rf_eval

                                                                                

Unnamed: 0,Accuracy,Precision,Recall,F1
Random Forest,0.637788,0.567718,0.637788,0.497188


## Naive Bayes

In [7]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="overall", featuresCol="features")
nb_model = nb.fit(train)
nb_predictions = nb_model.transform(test)

nb_eval = evaluate(nb_predictions, 'Naive Bayes')
nb_eval

                                                                                

Unnamed: 0,Accuracy,Precision,Recall,F1
Naive Bayes,0.634585,0.47329,0.634585,0.501893


## MLP

In [8]:
mlp = MultilayerPerceptronClassifier(labelCol="overall", featuresCol="features", maxIter=100, layers=[32, 16, 5])
mlp_model = mlp.fit(train)
mlp_predictions = mlp_model.transform(test)

mlp_eval = evaluate(mlp_predictions, 'Multi Layer Perceptron')
mlp_eval

                                                                                

Unnamed: 0,Accuracy,Precision,Recall,F1
MLP,0.672512,0.690259,0.672512,0.583876


# Results

In [18]:
pd.concat([baseline_eval, lr_eval, rf_eval, nb_eval, mlp_eval])

                                                                                

Unnamed: 0,Accuracy,Precision,Recall,F1
Baseline,0.637356,0.406222,0.637356,0.496193
Logistic Regression,0.634888,0.467366,0.634888,0.505608
Random Forest,0.637788,0.567718,0.637788,0.497188
Naive Bayes,0.634585,0.47329,0.634585,0.501893
Multi Layer Perceptron,0.672512,0.690259,0.672512,0.583876
