#### `a.` classification model to predict product category

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
# Memory settings that are handed to both the SparkContext and the SparkSession builder.
conf = SparkConf().setAll([
    ('spark.executor.memory', '30g'),
    ('spark.driver.memory', '30g'),
])
# conf.set("spark.executor.instances", 4)

# Create the context explicitly, then obtain the session through the builder.
sc = SparkContext(conf=conf)
spark = (
    SparkSession.builder
    .master("local")
    .appName('Classification')
    .config(conf=conf)
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark


In [3]:
# Path of the English training split; a later cell reads the same file again.
file_location = './data/train_test/dataset_en_train.json'

# Read the JSON records and discard rows that contain null values.
df = spark.read.json(file_location).na.drop()

# Show the inferred schema, then per-column summary statistics.
df.printSchema()
df.describe().show()

root
 |-- language: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- reviewer_id: string (nullable = true)
 |-- stars: string (nullable = true)

+-------+--------+----------------+------------------+--------------------+----------+------------------+-------------------+------------------+
|summary|language|product_category|        product_id|         review_body| review_id|      review_title|        reviewer_id|             stars|
+-------+--------+----------------+------------------+--------------------+----------+------------------+-------------------+------------------+
|  count|  200000|          200000|            200000|              200000|    200000|            200000|             200000|            200000|
|   mean|    null|            null|              null|                null| 

In [10]:
# testing different lr approach
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer, VectorAssembler, StopWordsRemover
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 75/25 train/test split with a fixed seed so the split is repeatable.
training_data, test_data = df.randomSplit([0.75, 0.25], seed=123)

# --- Target: product category -> numeric "label" column ---
indexer = StringIndexer(inputCol="product_category", outputCol="label", handleInvalid='keep')

# --- Review body: tokenize -> remove stop words -> hashed term frequencies ---
tokenizer = Tokenizer(inputCol="review_body", outputCol="review_words")
stopwordremoverbody = StopWordsRemover(inputCol='review_words', outputCol='filtered_review_words')
hashingTF = HashingTF(inputCol="filtered_review_words", outputCol="review_features", numFeatures=262144)

# --- Review title: same three steps as the body ---
titleTokenizer = Tokenizer(inputCol="review_title", outputCol="title_words")
stopwordremovertitle = StopWordsRemover(inputCol='title_words', outputCol='filtered_title_words')
titleHashingTF = HashingTF(inputCol="filtered_title_words", outputCol="title_features", numFeatures=262144)

# --- Product id: string -> numeric index ---
# NOTE(review): id_feature enters the model as a single numeric column (the raw
# index value), not as a one-hot vector -- confirm this is intended.
idIndexer = StringIndexer(inputCol="product_id", outputCol="id_feature", handleInvalid='keep')

# Concatenate the three feature sources into the single vector the classifier reads.
assembler = VectorAssembler(
    inputCols=["id_feature", "review_features", "title_features"],
    outputCol="features",
)
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Accuracy evaluator; used by the next cell on the held-out predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Assemble the stages (same order as before) and fit on the training split.
pipeline = Pipeline(stages=[
    tokenizer,
    stopwordremoverbody,
    titleTokenizer,
    stopwordremovertitle,
    indexer,
    idIndexer,
    hashingTF,
    titleHashingTF,
    assembler,
    lr,
])
model = pipeline.fit(training_data)


In [11]:
# Score the held-out split with the fitted pipeline.
predictions = model.transform(test_data)

# Compute the evaluator's metric (configured as accuracy) and report it.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy,))


Accuracy = 0.349406


Played around with a few iterations. Considered a logistic regression model in which product id, review title, and review body are combined into a single column called 'words'. Considered an IDF with stop words, but decided against that, as perhaps some of the stop words would be useful.

The highest accuracy I got came from combining review body, review title, and product id into a 'words' column, applying a stop-words remover, and then predicting product_category. That only reached 40% accuracy, however.

Attempted a lean model with just the review body and product category and got 30% accuracy.

Would probably need to spend time tuning parameters.

#### `b.` predict customer ratings
Note: thought about removing stop words, but realised that words like "not" may provide valuable data.

In [None]:
# HashingTF is imported here as well so this cell no longer depends on an
# earlier cell having imported it.
from pyspark.ml.feature import HashingTF, StringIndexer, VectorAssembler, Tokenizer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.ml import Pipeline

# Keep only the columns this model needs; stars is read as a string, so cast it
# to double to use it as the regression label.
data = df.select(col("review_id"), col("product_id"), col("reviewer_id"), col("stars").cast("double"), col("review_body"))

# Split the data into training and testing sets
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

# Text features: tokenize the review body and hash it into term-frequency vectors.
# Stop words are kept on purpose (no StopWordsRemover stage).
tokenizer = Tokenizer(inputCol="review_body", outputCol="review_words")
hashingTF = HashingTF(inputCol="review_words", outputCol="review_features", numFeatures=1000)

# Convert the string id columns to numeric indices.
# NOTE(review): these indices enter the model as plain numeric columns, not as
# one-hot vectors -- confirm this is intended.
productIdIndexer = StringIndexer(inputCol="product_id", outputCol="product_id_index").setHandleInvalid('keep')
reviewerIdIndexer = StringIndexer(inputCol="reviewer_id", outputCol="reviewer_id_index").setHandleInvalid('keep')

# Combine all features into a single feature vector
assembler = VectorAssembler(inputCols=["product_id_index", "reviewer_id_index", "review_features"], outputCol="features")
lr = LinearRegression(featuresCol="features", labelCol="stars", maxIter=10, regParam=0.3, elasticNetParam=0.8)

#pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, productIdIndexer, reviewerIdIndexer, assembler, lr])


# Fit the model 
model = pipeline.fit(trainingData)

# Make predictions on the testing set
predictions = model.transform(testData)

# Evaluate the performance of the model
evaluator = RegressionEvaluator(labelCol="stars", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

evaluator_r2 = RegressionEvaluator(predictionCol='prediction', labelCol='stars', metricName='r2')
r2 = evaluator_r2.evaluate(predictions)

print('R-Squared (R2):', r2)
print("Root Mean Squared Error (RMSE) = %g" % rmse)




ALS model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer

# Load the train and test splits, dropping rows that contain null values.
training_reviews = spark.read.json(file_location).na.drop()
test_reviews = spark.read.json('./data/train_test/dataset_en_test.json').na.drop()

# stars is read from JSON as a string; ALS needs a numeric rating column.
training_reviews = training_reviews.withColumn('stars', training_reviews['stars'].cast('float'))
test_reviews = test_reviews.withColumn('stars', test_reviews['stars'].cast('float'))

# Transform reviewer_id and product_id columns into indices
reviewer_id_indexer = StringIndexer(inputCol="reviewer_id", outputCol="reviewer_id_index").setHandleInvalid('keep')
product_id_indexer = StringIndexer(inputCol="product_id", outputCol="product_id_index").setHandleInvalid('keep')

# Fit each indexer ONCE, on the training split, and reuse the fitted model for
# the test split. Re-fitting on the test split would give the same reviewer or
# product a different index in train and test, so the ALS factors learned for an
# index would be applied to an unrelated id at prediction time.
reviewer_id_model = reviewer_id_indexer.fit(training_reviews)
product_id_model = product_id_indexer.fit(training_reviews)
training_reviews = product_id_model.transform(reviewer_id_model.transform(training_reviews))
test_reviews = product_id_model.transform(reviewer_id_model.transform(test_reviews))

# Train a collaborative filtering model using ALS algorithm.
# coldStartStrategy="drop" removes rows whose reviewer or product was not seen
# during training (handleInvalid='keep' maps those to an index ALS never saw);
# without it their predictions are NaN and the RMSE becomes NaN as well.
als = ALS(rank=10, maxIter=5, regParam=0.01, userCol="reviewer_id_index", itemCol="product_id_index", ratingCol="stars", coldStartStrategy="drop")
model = als.fit(training_reviews)

# Evaluate the model on the test data
predictions = model.transform(test_reviews)

# Report how many test reviews could actually be scored, so the RMSE below is
# read in context.
n_test = test_reviews.count()
n_scored = predictions.count()
print('Scored', n_scored, 'of', n_test, 'test reviews (reviewer and product both seen in training)')

evaluator = RegressionEvaluator(metricName='rmse', labelCol='stars', predictionCol='prediction')
# Nothing to evaluate when no test row survived the cold-start filter.
rmse = evaluator.evaluate(predictions) if n_scored else float('nan')
print('Root Mean Squared Error =', rmse)


In [None]:
# Ask the fitted model for the 5 highest-scoring products per user.
userRecs = model.recommendForAllUsers(numItems=5)

# Preview the recommendations.
userRecs.show()

In [None]:
# Imported locally so this cell does not depend on `col` having been imported
# by an earlier cell.
from pyspark.sql import functions as F

# 1. Identify the most popular products based on the number of reviews and ratings.
# The DataFrame is kept in a variable and shown separately: show() returns
# None, so assigning its result would leave the variable empty.
most_popular_products = (
    test_reviews
    .groupBy("product_id", "product_category")
    .agg(
        F.avg("stars").alias("avg_rating"),
        F.count("review_id").alias("num_reviews"),
    )
    .orderBy(F.col("num_reviews").desc())
)
most_popular_products.show(25)

# 2. Explore the performance of the model on different subsets of the data:
# average actual vs. predicted rating, and review count, per language.
perf_by_language = (
    predictions
    .groupBy("language")
    .agg(
        F.avg("stars").alias("avg_actual_rating"),
        F.avg("prediction").alias("avg_predicted_rating"),
        F.count("review_id").alias("num_reviews"),
    )
    .orderBy(F.col("num_reviews").desc())
)
perf_by_language.show()
