# Final Project - Recommendation System using Collaborative Filtering 

## 1. Spark Initialization

In [19]:
# Import findspark to make pyspark importable as a regular library
import findspark
findspark.init()

In [20]:
# Import SparkSession
from pyspark.sql import SparkSession

In [21]:
# Create Spark Session
spark = SparkSession.builder.appName("MRS").getOrCreate()

In [22]:
# Print spark object ID
print(spark)

<pyspark.sql.session.SparkSession object at 0x113ace630>


## 2. Load Dataset

### 2.2 Listening Count Data

In [23]:
import os
dataset_path = '/Users/mocatfrio/Documents/big-data/final-project/app/lastfm-dataset'

In [24]:
lc_file = os.path.join(dataset_path, 'csv/user_artists.csv')
lc_df = spark.read.load(lc_file, format="csv", sep=",", inferSchema="true", header="true")

In [25]:
# Show dataset
lc_df.show()

+------+--------+------+
|userID|artistID|weight|
+------+--------+------+
|     2|      51| 13883|
|     2|      52| 11690|
|     2|      53| 11351|
|     2|      54| 10300|
|     2|      55|  8983|
|     2|      56|  6152|
|     2|      57|  5955|
|     2|      58|  4616|
|     2|      59|  4337|
|     2|      60|  4147|
|     2|      61|  3923|
|     2|      62|  3782|
|     2|      63|  3735|
|     2|      64|  3644|
|     2|      65|  3579|
|     2|      66|  3312|
|     2|      67|  3301|
|     2|      68|  2927|
|     2|      69|  2720|
|     2|      70|  2686|
+------+--------+------+
only showing top 20 rows



In [26]:
lc_df.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- artistID: integer (nullable = true)
 |-- weight: integer (nullable = true)



In [27]:
lc_df.count()

92834

In [29]:
listening_count_df = []
    
for i in range(0, 3):
    lc_file_path = os.path.join(dataset_path, 'batch/batch' + str(i) + '.txt')
    listening_count_df.append(spark.read.csv(lc_file_path, header=None, inferSchema=True).na.drop())
    listening_count_df[i] = listening_count_df[i].selectExpr("_c0 as user_id", "_c1 as artist_id", "_c2 as weight")

In [36]:
listening_count_df[1].show()

+-------+---------+------+
|user_id|artist_id|weight|
+-------+---------+------+
|    677|     6258|   872|
|    677|     6658|   212|
|    677|     7917|   222|
|    677|     7918|   226|
|    677|     9207|  1355|
|    677|     9208|   800|
|    677|     9209|   423|
|    677|     9210|   321|
|    677|     9211|   317|
|    677|     9212|   300|
|    677|     9213|   271|
|    677|     9214|   258|
|    677|     9215|   232|
|    677|     9216|   228|
|    677|     9217|   214|
|    677|     9218|   166|
|    677|     9219|   166|
|    677|     9220|   164|
|    677|     9221|   159|
|    677|     9222|   158|
+-------+---------+------+
only showing top 20 rows



In [31]:
listening_count_df[0].count()

31000

In [32]:
new_df = listening_count_df[0]

In [33]:
new_df.count()

31000

In [34]:
new_df = new_df.union(listening_count_df[1])

In [35]:
new_df.count()

62000

### 2.1 Artists Data

In [None]:
# Load the dataset
artists_file = "/Users/mocatfrio/Documents/big-data/final-project/app/lastfm-dataset/csv/artists.csv"
artists_df = spark.read.load(artists_file, format="csv", sep=",", inferSchema="true", header="true")

In [None]:
# Show dataset
artists_df.show()

In [None]:
artists_df.printSchema()

In [None]:
artists_df.count()

## 3. Collaborative Filtering using ALS (Alternating Least Squares) Algorithm
**Collaborative filtering** is commonly used for recommender systems. These techniques aim to fill in the missing entries of a user-item association matrix. `spark.ml` library uses the **Alternating least squares (ALS) algorithm** to learn these latent factors. We can evaluate the recommendation model by measuring the root-mean-square error of rating prediction. 

In [38]:
# Import ALS algorithm
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [None]:
# Split ratings data become training set and test set
(training, test) = lc_df.randomSplit([0.8, 0.2])

In [54]:
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="artist_id", ratingCol="weight", coldStartStrategy="drop")
model = []
new_df = []
new_df.append(listening_count_df[0])
new_df.append(new_df[0].union(listening_count_df[1]))
new_df.append(new_df[1].union(listening_count_df[2]))

for i in range(0, 3):
    model.append(als.fit(new_df[i]))
    print('{} : {}'.format(i, new_df[i].count()))

0 : 31000
1 : 62000
2 : 92834


In [51]:
# Generate top 10 books recommendations for each user 
userRecs = model[0].recommendForAllUsers(10)
userRecs.show(truncate=False)

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                        |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471    |[[3478, 3422.281], [458, 2579.2283], [2044, 1972.1602], [72, 1958.7572], [903, 1728.5991], [2070, 1640.283], [3280, 1554.135], [461, 1549.312], [1673, 1458.2598], [321, 1455.417]]    |
|463    |[[187, 622.39526], [487, 602.99316], [3280, 527.1836], [2200, 495.24976], [51, 377.65173], [56, 361.7568], [233, 358.96234], [52, 318.47473], [2026, 292.80392], [681, 285.14563]]     |
|496    |[[2044, 6322.6772], [

In [52]:
# Generate top 10 books recommendations for each user 
userRecs = model[1].recommendForAllUsers(10)
userRecs.show(truncate=False)

+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                          |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471    |[[378, 1998.0315], [687, 1844.9742], [3280, 1823.7478], [101, 1795.9752], [195, 1708.4152], [511, 1585.5088], [56, 1321.8867], [51, 1310.023], [72, 1189.54], [3478, 1187.3569]]         |
|1342   |[[2179, 58821.645], [163, 30976.398], [458, 28980.799], [932, 22935.842], [487, 21919.68], [101, 19870.443], [304, 18755.975], [679, 17633.17], [1246, 15374.705], [2924, 15054.013]]    |
|463    |[[8388, 298

In [53]:
# Generate top 10 books recommendations for each user 
userRecs = model[2].recommendForAllUsers(10)
userRecs.show(truncate=False)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                           |
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580   |[[8388, 6767.8096], [72, 5951.423], [486, 5670.152], [5391, 5228.4463], [8292, 5077.2197], [14986, 5038.8594], [14185, 4989.2773], [82, 4704.866], [14987, 4624.1274], [504, 4483.3887]]  |
|471    |[[6150, 3244.3665], [687, 3158.265], [2102, 2470.399], [1014, 2193.378], [8388, 2064.8643], [3176, 1865.0927], [486, 1856.7439], [195, 1736.6077], [2179, 1696.9053], [3280, 1559.6804]]  |
|1591   |[[203,

In [None]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="weight",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

In [None]:
# Generate top 10 user recommendations for each artist
artistRecs = model.recommendForAllItems(10)
artistRecs.show(truncate=False)

In [None]:
# Generate top 10 movie recommendations for a specified set of users
users = lc_df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show(truncate=False)

In [None]:
# Generate top 10 user recommendations for a specified set of movies
artist = lc_df.select(als.getItemCol()).distinct().limit(3)
artistSubSetRecs = model.recommendForItemSubset(artist, 10)
artistSubSetRecs.show(truncate=False)

## 6. References

* https://spark.apache.org/docs/latest/ml-collaborative-filtering.html