In [23]:
from pyspark.sql import SparkSession

In [24]:
spark=SparkSession.builder.appName("recommendation").getOrCreate()

In [25]:
input_file_path="file:///C:/Users/ckp43_000/Documents/rating_movie.csv"

In [26]:
data=spark.read.csv(input_file_path,inferSchema=True,header=True)

In [27]:
data.show(6)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
+------+-------+------+----------+
only showing top 6 rows



In [28]:
data.describe().show()

+-------+------------------+------------------+------------------+--------------------+
|summary|            userId|           movieId|            rating|           timestamp|
+-------+------------------+------------------+------------------+--------------------+
|  count|               783|               783|               783|                 783|
|   mean| 6.176245210727969| 4687.625798212005| 3.743933588761175|1.0212141871813538E9|
| stddev|3.0573541711797336|14206.983857744697|1.0550140206742291|1.5899546246065554E8|
|    min|                 1|                 1|               0.5|           835355395|
|    max|                12|            106487|               5.0|          1391658667|
+-------+------------------+------------------+------------------+--------------------+



In [29]:
data=data.select(['userId','movieId','rating'])

In [30]:
type(data)

pyspark.sql.dataframe.DataFrame

In [40]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|               783|               783|               783|
|   mean| 6.176245210727969| 4687.625798212005| 3.743933588761175|
| stddev|3.0573541711797336|14206.983857744697|1.0550140206742291|
|    min|                 1|                 1|               0.5|
|    max|                12|            106487|               5.0|
+-------+------------------+------------------+------------------+



In [32]:
training,test=data.randomSplit([0.8,0.2])

In [33]:
from pyspark.ml.recommendation import ALS

In [43]:
from pyspark.ml.evaluation import RegressionEvaluator

In [35]:
als=ALS(maxIter=5,regParam=0.01,userCol='userId',ratingCol='rating',itemCol='movieId')

In [36]:
model=als.fit(training)

In [37]:
predictions=model.transform(test)

In [38]:
predictions.show()

+------+-------+------+-----------+
|userId|movieId|rating| prediction|
+------+-------+------+-----------+
|     4|   2659|   3.0|        NaN|
|     8|    858|   5.0|-0.20757097|
|    10|   1127|   4.0| -2.0286589|
|     1|     31|   2.5|-0.11918874|
|     4|   1270|   5.0| 0.49066654|
|     1|   1339|   3.5|        NaN|
|    11|  80906|   3.0|        NaN|
|     3|   1884|   4.0|        NaN|
|     3|    588|   3.0|  1.0511138|
|     2|    588|   3.0| 0.44383755|
|    11|    296|   5.0|  6.0667095|
|     2|    296|   4.0|  1.7123222|
|     4|   1016|   4.0|        NaN|
|    12|    673|   1.0|        NaN|
|     2|    593|   3.0| 0.02749765|
|     3|   2513|   3.0|        NaN|
|     1|   2294|   2.0| -0.6934731|
|     7|     34|   4.0|  0.7658078|
|     8|   1198|   4.0| 0.73110175|
|     7|   1198|   5.0| 0.03956452|
+------+-------+------+-----------+
only showing top 20 rows



In [52]:
predictions=predictions.dropna()

In [53]:
evaluator=RegressionEvaluator(metricName='rmse',
                              labelCol='rating',
                              predictionCol='prediction')

In [54]:
rmse=evaluator.evaluate(predictions)

In [55]:
print('rmse')
rmse

rmse


3.780300568576098

In [63]:
single_user=test.filter(test['userId']==1).select(['userId','movieId'])

In [64]:
single_user.show()

+------+-------+
|userId|movieId|
+------+-------+
|     1|     31|
|     1|   1029|
|     1|   1061|
|     1|   1129|
|     1|   1263|
|     1|   1287|
|     1|   1339|
|     1|   1953|
|     1|   2294|
+------+-------+



In [65]:
recommendation=model.transform(single_user)

In [66]:
recommendation.orderBy('prediction',ascending=False).show()

+------+-------+-----------+
|userId|movieId| prediction|
+------+-------+-----------+
|     1|   1339|        NaN|
|     1|   1029|        NaN|
|     1|   1263|        NaN|
|     1|   1061|        NaN|
|     1|   1129|        NaN|
|     1|   1953|  1.7302117|
|     1|     31|-0.11918874|
|     1|   1287|-0.15891829|
|     1|   2294| -0.6934731|
+------+-------+-----------+

