In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [2]:
spark = SparkSession.builder.appName('Recommender System CF').getOrCreate()

In [3]:
rat = spark.read.csv('datasets/rat.csv', inferSchema=True, header=True)

In [4]:
rat.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [5]:
data = rat.drop('timestamp')

In [6]:
data.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   1172|   4.0|
|     1|   1263|   2.0|
|     1|   1287|   2.0|
|     1|   1293|   2.0|
|     1|   1339|   3.5|
|     1|   1343|   2.0|
|     1|   1371|   2.5|
|     1|   1405|   1.0|
|     1|   1953|   4.0|
|     1|   2105|   4.0|
|     1|   2150|   3.0|
|     1|   2193|   2.0|
|     1|   2294|   2.0|
|     1|   2455|   2.5|
|     1|   2968|   1.0|
|     1|   3671|   3.0|
+------+-------+------+
only showing top 20 rows



In [7]:
data.select(('rating')).describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|            100004|
|   mean| 3.543608255669773|
| stddev|1.0580641091070326|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [8]:
(training, test) = data.randomSplit([0.8, 0.2])

In [9]:
als = ALS(maxIter=5,regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

In [10]:
model = als.fit(training)

In [11]:
predictions = model.transform(test)

In [12]:
predictions.show() #predictions user to rate movie

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   534|    463|   4.0| 4.0021644|
|    30|    463|   4.0|  2.989187|
|   311|    463|   3.0| 3.0584733|
|   602|    471|   3.0|  4.070516|
|   292|    471|   3.5| 4.2181563|
|   452|    471|   3.0| 3.5134454|
|   607|    471|   4.0| 3.8449872|
|   659|    471|   4.0| 2.9842432|
|   537|    471|   5.0|   4.42451|
|   574|    471|   3.5| 3.7981517|
|   529|    471|   4.0|  3.234109|
|   184|    471|   5.0| 4.4052963|
|   311|    471|   0.5| 3.4139504|
|   412|    833|   1.0| 2.3591518|
|   294|    833|   2.0| 2.0872242|
|   516|   1088|   3.0| 3.4330618|
|    52|   1088|   4.0| 3.5411594|
|    57|   1088|   4.0| 3.6720736|
|   607|   1088|   2.0| 3.0233717|
|   547|   1088|   5.0| 1.8062032|
+------+-------+------+----------+
only showing top 20 rows



In [13]:
mov = spark.read.csv('datasets/mov.csv', header=True, inferSchema=True)

In [14]:
res=predictions.join(mov, on=['movieId'], how='left')

In [15]:
res.show()

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating|prediction|               title|              genres|
+-------+------+------+----------+--------------------+--------------------+
|    463|   534|   4.0| 4.0021644|Guilty as Sin (1993)|Crime|Drama|Thriller|
|    463|    30|   4.0|  2.989187|Guilty as Sin (1993)|Crime|Drama|Thriller|
|    463|   311|   3.0| 3.0584733|Guilty as Sin (1993)|Crime|Drama|Thriller|
|    471|   602|   3.0|  4.070516|Hudsucker Proxy, ...|              Comedy|
|    471|   292|   3.5| 4.2181563|Hudsucker Proxy, ...|              Comedy|
|    471|   452|   3.0| 3.5134454|Hudsucker Proxy, ...|              Comedy|
|    471|   607|   4.0| 3.8449872|Hudsucker Proxy, ...|              Comedy|
|    471|   659|   4.0| 2.9842432|Hudsucker Proxy, ...|              Comedy|
|    471|   537|   5.0|   4.42451|Hudsucker Proxy, ...|              Comedy|
|    471|   574|   3.5| 3.7981517|Hudsucker Proxy, ...|              Comedy|

In [16]:
user_126 = res.filter(res['userId']==126).select(['userId', 'title', 'genres']) #get recommendation for user 126

In [17]:
user_564 = res.filter(res['userId']==564).select(['userId', 'title', 'genres']) #get recommendation for user 564

In [18]:
print('Recommendation Film for User 126')
user_126.show()

Recommendation Film for User 126
+------+--------------------+--------------------+
|userId|               title|              genres|
+------+--------------------+--------------------+
|   126|Silence of the La...|Crime|Horror|Thri...|
|   126|    Toy Story (1995)|Adventure|Animati...|
|   126|        Speed (1994)|Action|Romance|Th...|
|   126|Jurassic Park (1993)|Action|Adventure|...|
|   126| Broken Arrow (1996)|Action|Adventure|...|
|   126|Beauty and the Be...|Animation|Childre...|
|   126|       Batman (1989)|Action|Crime|Thri...|
|   126|    Firm, The (1993)|      Drama|Thriller|
|   126|Four Weddings and...|      Comedy|Romance|
|   126|         I.Q. (1994)|      Comedy|Romance|
+------+--------------------+--------------------+



In [19]:
print('Recommendation Film for User 564')
user_564.show()

Recommendation Film for User 564
+------+--------------------+--------------------+
|userId|               title|              genres|
+------+--------------------+--------------------+
|   564|   Local Hero (1983)|              Comedy|
|   564|   Abyss, The (1989)|Action|Adventure|...|
|   564|      Titanic (1997)|       Drama|Romance|
|   564|Angela's Ashes (1...|               Drama|
|   564|Running Man, The ...|       Action|Sci-Fi|
|   564|Princess Caraboo ...|               Drama|
|   564|Fear and Loathing...|Adventure|Comedy|...|
|   564|Friday the 13th P...|              Horror|
|   564|Allan Quatermain ...|Action|Adventure|...|
|   564|   Jack Frost (1998)|Children|Comedy|D...|
|   564|William Shakespea...|      Comedy|Fantasy|
|   564|      Aladdin (1992)|Adventure|Animati...|
|   564|She's the One (1996)|      Comedy|Romance|
|   564| Pulp Fiction (1994)|Comedy|Crime|Dram...|
|   564|Denise Calls Up (...|              Comedy|
|   564|       Gloria (1999)|      Drama|Thriller

In [20]:
user_126.collect()

[Row(userId=126, title='Silence of the Lambs, The (1991)', genres='Crime|Horror|Thriller'),
 Row(userId=126, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(userId=126, title='Speed (1994)', genres='Action|Romance|Thriller'),
 Row(userId=126, title='Jurassic Park (1993)', genres='Action|Adventure|Sci-Fi|Thriller'),
 Row(userId=126, title='Broken Arrow (1996)', genres='Action|Adventure|Thriller'),
 Row(userId=126, title='Beauty and the Beast (1991)', genres='Animation|Children|Fantasy|Musical|Romance|IMAX'),
 Row(userId=126, title='Batman (1989)', genres='Action|Crime|Thriller'),
 Row(userId=126, title='Firm, The (1993)', genres='Drama|Thriller'),
 Row(userId=126, title='Four Weddings and a Funeral (1994)', genres='Comedy|Romance'),
 Row(userId=126, title='I.Q. (1994)', genres='Comedy|Romance')]

In [21]:
user_564.collect()

[Row(userId=564, title='Local Hero (1983)', genres='Comedy'),
 Row(userId=564, title='Abyss, The (1989)', genres='Action|Adventure|Sci-Fi|Thriller'),
 Row(userId=564, title='Titanic (1997)', genres='Drama|Romance'),
 Row(userId=564, title="Angela's Ashes (1999)", genres='Drama'),
 Row(userId=564, title='Running Man, The (1987)', genres='Action|Sci-Fi'),
 Row(userId=564, title='Princess Caraboo (1994)', genres='Drama'),
 Row(userId=564, title='Fear and Loathing in Las Vegas (1998)', genres='Adventure|Comedy|Drama'),
 Row(userId=564, title='Friday the 13th Part 2 (1981)', genres='Horror'),
 Row(userId=564, title='Allan Quatermain and the Lost City of Gold (1987)', genres='Action|Adventure|Comedy'),
 Row(userId=564, title='Jack Frost (1998)', genres='Children|Comedy|Drama'),
 Row(userId=564, title="William Shakespeare's A Midsummer Night's Dream (1999)", genres='Comedy|Fantasy'),
 Row(userId=564, title='Aladdin (1992)', genres='Adventure|Animation|Children|Comedy|Musical'),
 Row(userId=56