In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

spark=SparkSession \
    .builder \
    .appName("ML") \
    .master("local") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/") \
    .config("spark.driver.memory","12g")\
    .config("spark.executor.memory", "12g") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .getOrCreate()


df = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("pipeline", "[{'$sample': {'size': 1000000} }]")\
    .option("uri", "mongodb://localhost:27017/"+"amazon"+"."+"data")\
    .option("partitioner", "MongoSinglePartitioner") \
    .option("partitionkey", "asin")\
    .load()

In [2]:
df = df.drop('_id','reviewText','reviewerName','summary','unixReviewTime','verified')
df.show()

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|0912006684|    4.0|A2SAI3ADSWEBYQ|
|1441766073|    4.0|A3KUEM2NAPZF61|
|B015AUYDS4|    5.0| AAG4HS8SPN8B4|
|B00JKOMISY|    3.0|A137UV8Z648ZM4|
|1613757409|    5.0|A3QG1VOMYJTO6T|
|B00M7QSHCS|    5.0| AIWZZYQMVMF6Z|
|B00WDXQO66|    5.0|A2GTSFEAHPY4EL|
|0385344422|    5.0|A20Z4OSXY7JATM|
|B002AB7X6Q|    5.0|A1QFXFZM9Z4V8N|
|0007350899|    5.0| ASBC6R8ZWO46O|
|B011UK0P3A|    5.0| AOOFGDPE1DLA6|
|1565124766|    5.0|A16R0NTMJA7O6Q|
|0985911077|    4.0|A1S328ADAE6BAS|
|1530850649|    4.0|A1J9D2BSI6AH5D|
|B00005MM0K|    5.0| ADJXL0W0RJ7VV|
|1441310584|    4.0|A2ALTWSLTVJ553|
|B01GL8CI8Q|    3.0|A1BO3NH4KKASUT|
|0800719980|    5.0| AQX3IAMUNZ2L9|
|1568364784|    5.0|A28FFHMLZMCKM1|
|B0076DI6EG|    5.0|A2NTP6U0F38U2B|
+----------+-------+--------------+
only showing top 20 rows



In [3]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCols=['asin', 'reviewerID'],
    outputCols=['asin_index', 'reviewerID_index'],
    handleInvalid='keep'
)
final_df = indexer.fit(df).transform(df)
final_df.show()

+----------+-------+--------------+----------+----------------+
|      asin|overall|    reviewerID|asin_index|reviewerID_index|
+----------+-------+--------------+----------+----------------+
|0743246136|    4.0| ASIU59L8NHMC3|  595779.0|        910631.0|
|B00KO4518I|    5.0|A1MIFEZ821SZ4Z|   11336.0|        910631.0|
|B018EUJLAW|    5.0|A3KWUH5MNCU7YO|  559060.0|        640075.0|
|1932458328|    5.0|  AN733OOJ7WDB|  595779.0|        910631.0|
|B0001FI51U|    5.0|A1XPBG9RU276SZ|  595779.0|        910631.0|
|0679405127|    4.0|A3L5BQIQHKQ84C|  595779.0|        910631.0|
|B002BG0SO4|    5.0| ANR071R0BJH1M|  331468.0|        910631.0|
|B01A01X154|    1.0|A1TEGFXSN7FMUR|  595779.0|        243861.0|
|B00006IBFA|    5.0| AL5TTPDVUC81W|   49001.0|        818085.0|
|B005T43EAU|    2.0|A3LDVURZL4VJL8|  595779.0|        910631.0|
|0825305888|    1.0|A1UB8YE85TKSP5|  186398.0|        910631.0|
|B000FMNWRQ|    5.0|A3QQVDTQ9MAYTR|  595779.0|        910631.0|
|B0002H3ZLM|    5.0|A3ONWERRPKZ3HI|    2

In [4]:
# Drop duplicates based on 'asin_indexed' and 'reviewerID_indexed' columns
final_df = final_df.dropDuplicates(['asin_index', 'reviewerID_index'])

# Drop NaN values
final_df = final_df.na.drop()

In [5]:
final_df = final_df.drop('reviewerID','asin')

In [6]:
final_df.show()

+-------+----------+----------------+
|overall|asin_index|reviewerID_index|
+-------+----------+----------------+
|    5.0|       0.0|           705.0|
|    5.0|       0.0|          1899.0|
|    4.0|       0.0|          2963.0|
|    4.0|       0.0|         11383.0|
|    4.0|       0.0|         31074.0|
|    2.0|       0.0|         40235.0|
|    5.0|       0.0|        107757.0|
|    4.0|       0.0|        147599.0|
|    3.0|       0.0|        213614.0|
|    5.0|       0.0|        250967.0|
|    5.0|       0.0|        267808.0|
|    5.0|       0.0|        268444.0|
|    4.0|       0.0|        288227.0|
|    2.0|       0.0|        291214.0|
|    4.0|       0.0|        298574.0|
|    3.0|       0.0|        306687.0|
|    4.0|       0.0|        307641.0|
|    5.0|       0.0|        322971.0|
|    5.0|       0.0|        353635.0|
|    2.0|       0.0|        360187.0|
+-------+----------+----------------+
only showing top 20 rows



In [7]:
df_maj=final_df[final_df['overall']==5.0]
df_min = final_df[final_df['overall']!=5.0]

In [8]:
from pyspark.sql.functions import col

upsample_df = df_min.sample(True, float(df_maj.count())/float(df_min.count()), seed=50)
upsample_df = upsample_df.union(df_maj)
upsample_df.groupBy("overall").count().orderBy(col("count").desc()).show()

+-------+------+
|overall| count|
+-------+------+
|    5.0|201126|
|    4.0| 97230|
|    3.0| 43214|
|    1.0| 34609|
|    2.0| 24298|
+-------+------+



In [9]:
final_df = upsample_df

In [10]:
train_data, test_data = final_df.randomSplit([0.70,0.20])

In [11]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline


# Train the ALS model on the training set
als_mf = ALS(maxIter=10, regParam=0.01, userCol="reviewerID_index", itemCol="asin_index", ratingCol="overall", coldStartStrategy="drop")
pipeline_mf = Pipeline(stages=[als_mf])
model_mf = pipeline_mf.fit(train_data)

# Make predictions on the testing set
predictions = model_mf.transform(test_data)
predictions = predictions.dropna()


In [12]:
# Evaluate the model using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 3.3615945234348246


In [13]:
evaluator_mae = RegressionEvaluator(metricName="mae", labelCol="overall", predictionCol="prediction")
mae = evaluator_mae.evaluate(predictions)
print("Mean absolute error (MAE):", mae)

Mean absolute error (MAE): 2.130344226155689


In [14]:
target_stats = train_data.selectExpr('percentile(overall, 0.25) as q1', 'percentile(overall, 0.75) as q3', 'avg(overall) as mean', 'stddev(overall) as std_dev').collect()[0]
iqr = target_stats.q3 - target_stats.q1
mean = target_stats.mean
std_dev = target_stats.std_dev

In [15]:
print("Interquartile Range (IQR):", iqr)
print("Mean:", mean)
print("Standard Deviation:", std_dev)

Interquartile Range (IQR): 2.0
Mean: 4.013246800002571
Standard Deviation: 1.278798577842928


In [None]:
# Define the path where you want to save the model
path = "file:///home/hashim/Downloads/RegModel"

# Save the model in the Hadoop file system
model_mf.save(path)


RMSE: The standard deviation of the ratings is 1.278798577842928, and the RMSE is 3.3615945234348246. Since the RMSE is higher than the standard deviation, it indicates that the model's predictions have some level of error. 

MAE: The mean absolute error (MAE) is 2.130344226155689. The IQR (Interquartile Range) is 2.0, which gives us an idea about the spread of the ratings. Since the MAE is close to the IQR, it suggests that, on average, the model's predictions are within a reasonable range of the actual ratings.