In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

In [4]:
data = spark.read.json("Musical_Instruments_5.json")

In [5]:
data.show(5,truncate=True)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [6]:
data_sub = data.select(['asin', 'overall', 'reviewerID'])

In [7]:
data_sub.count()

10261

In [8]:
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col

In [9]:
data_sub.show(5, truncate=True)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|
|1384719342|    5.0|A14VAT5EAX3D9S|
|1384719342|    5.0|A195EZSQDW3E21|
|1384719342|    5.0|A2C00NNG1ZQQG2|
|1384719342|    5.0| A94QU4C90B1AX|
+----------+-------+--------------+
only showing top 5 rows



In [10]:
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [11]:
# Distinct users and movies
users = data_sub.select("reviewerID").distinct().count()
products = data_sub.select("asin").distinct().count()
numerator = data_sub.count()

In [12]:
display(numerator, users, products)

10261

1429

900

In [13]:
# Number of ratings matrix could contain if no empty cells
denominator = users * products
denominator

1286100

In [14]:
#Calculating sparsity
sparsity = 1 - (numerator*1.0 / denominator)
print ("Sparsity: "), sparsity

Sparsity: 


(None, 0.992021615737501)

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [16]:
# Converting String to index
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col


In [17]:
# Create an indexer
indexer = StringIndexer(inputCol='asin',  outputCol='asin_idx')
# Indexer identifies categories in the data
indexer_model = indexer.fit(data_sub)
# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data_sub)
# Repeat the process for the other categorical feature
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(data_indexed)
data_indexed = indexer1_model.transform(data_indexed)

In [18]:
data_indexed.show(5, truncate=True)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|   703.0|          66.0|
|1384719342|    5.0|A14VAT5EAX3D9S|   703.0|         266.0|
|1384719342|    5.0|A195EZSQDW3E21|   703.0|         395.0|
|1384719342|    5.0|A2C00NNG1ZQQG2|   703.0|        1048.0|
|1384719342|    5.0| A94QU4C90B1AX|   703.0|        1311.0|
+----------+-------+--------------+--------+--------------+
only showing top 5 rows



In [1]:
data_indexed.select([count(when(col(c).isNull(), c)).alias(c) for c in data_indexed.columns]).toPandas().T

NameError: name 'data_indexed' is not defined

In [20]:
# Smaller dataset so we will use 0.8 / 0.2
(training, test) = data_indexed.randomSplit([0.8, 0.2])

In [21]:
# Creating ALS model and fitting data
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [22]:
als = ALS(maxIter=5, 
          regParam=0.09, 
          rank = 25,
          userCol="reviewerID_idx", 
          itemCol="asin_idx", 
          ratingCol="overall", 
          coldStartStrategy="drop",
          nonnegative=True)
model = als.fit(training)

In [23]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [24]:
predictions.select(["asin_idx", "reviewerID_idx", 
                    "overall", "prediction"]).show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|     5.0|        1342.0|    3.0| 3.5549736|
|   240.0|         833.0|    3.0|  2.816957|
|   150.0|        1238.0|    2.0| 3.9330509|
|   163.0|         243.0|    5.0| 2.9854965|
|    26.0|         737.0|    5.0| 3.5600555|
+--------+--------------+-------+----------+
only showing top 5 rows



In [25]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="overall",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.211863230464196


In [26]:
# On average, this model is ~ 1.2 from perfect recommendations.

### Providing Recommendations: for all users

In [27]:
# get 20 recommendations which have highest rating.
user_recs = model.recommendForAllUsers(20)

In [28]:
for user in user_recs.head(5):
    print(user)
    print("\n")

Row(reviewerID_idx=12, recommendations=[Row(asin_idx=782, rating=6.236773490905762), Row(asin_idx=829, rating=6.1638102531433105), Row(asin_idx=781, rating=6.152835369110107), Row(asin_idx=460, rating=6.149811267852783), Row(asin_idx=733, rating=6.086832523345947), Row(asin_idx=873, rating=6.0139384269714355), Row(asin_idx=474, rating=5.946422100067139), Row(asin_idx=753, rating=5.920098304748535), Row(asin_idx=857, rating=5.899792671203613), Row(asin_idx=868, rating=5.897313117980957), Row(asin_idx=515, rating=5.887059211730957), Row(asin_idx=802, rating=5.878513813018799), Row(asin_idx=784, rating=5.859480857849121), Row(asin_idx=335, rating=5.848679065704346), Row(asin_idx=339, rating=5.843853950500488), Row(asin_idx=803, rating=5.827096939086914), Row(asin_idx=887, rating=5.825804233551025), Row(asin_idx=809, rating=5.824015140533447), Row(asin_idx=811, rating=5.810795307159424), Row(asin_idx=769, rating=5.809715270996094)])


Row(reviewerID_idx=22, recommendations=[Row(asin_idx=78

### Converting back to string form


In [29]:
import pandas as pd
recs=model.recommendForAllUsers(10).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
 .merge(recs, right_index = True, left_index = True) \
 .drop(["recommendations"], axis = 1) \
 .melt(id_vars = ['reviewerID_idx'], value_name = "recommendation") \
 .drop("variable", axis = 1) \
 .dropna() 
nrecs=nrecs.sort_values('reviewerID_idx')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), 
 nrecs['reviewerID_idx']], axis = 1)
nrecs.columns = [ 
 'ProductID_index',
 'Rating',
 'UserID_index' 
 ]

In [39]:
md=data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx'])
md=md.toPandas()
dict1=dict(zip(md['reviewerID_idx'],md['reviewerID']))
dict2=dict(zip(md['asin_idx'],md['asin']))
nrecs['reviewerID']=nrecs['UserID_index'].map(dict1)
nrecs['asin']=nrecs['ProductID_index'].map(dict2)
nrecs=nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['reviewerID','asin','Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res=new[['reviewerID','recommendations']] 
res_new=res['recommendations'].groupby([res.reviewerID]).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [40]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00625243BI8W1SSZNLMD,"[(B0002FO9QY, 5.649819850921631), (B0002CZVHI,..."
1,A10044ECXDUVKS,"[(B0007OGTGS, 4.8666911125183105), (B00AHEWBM4..."
2,A102MU6ZC9H1N6,"[(B0038MTH8S, 6.159424781799316), (B001D2TPZU,..."
3,A109JTUZXO61UY,"[(B000SZVYLQ, 6.314543724060059), (B001CJ2QZU,..."
4,A109ME7C09HM2M,"[(B005M0KLGQ, 5.952049255371094), (B0038MTH8S,..."
...,...,...
1424,AZJPNK73JF3XP,"[(B002UXS4JO, 5.936407089233398), (B0007NQH98,..."
1425,AZMHABTPXVLG3,"[(B0002E1OVS, 3.9475741386413574), (B0002M3OVI..."
1426,AZMIKIG4BB6BZ,"[(B000CD3QY2, 5.890352725982666), (B000S5V510,..."
1427,AZPDO6FLSMLFP,"[(B0002FO9QY, 5.378147125244141), (B0002M3OVI,..."
