# Final Project - Recommendation System using Collaborative Filtering 

## 1. Spark Initialization

In [1]:
# Import findspark to make pyspark importable as a regular library
import findspark
findspark.init()

In [2]:
# Import SparkSession
from pyspark.sql import SparkSession

In [3]:
# Create Spark Session
spark = SparkSession.builder.appName("MRS").getOrCreate()

In [4]:
# Print spark object ID
print(spark)

<pyspark.sql.session.SparkSession object at 0x10d785828>


## 2. Load Dataset

### 2.2 Listening Count Data

In [5]:
import os
dataset_path = '/Users/mocatfrio/Documents/big-data/final-project/app/lastfm-dataset'

In [6]:
lc_file = os.path.join(dataset_path, 'csv/user_artists.csv')
lc_df = spark.read.load(lc_file, format="csv", sep=",", inferSchema="true", header="true")

In [7]:
# Show dataset
lc_df.show()

+------+--------+------+
|userID|artistID|weight|
+------+--------+------+
|     2|      51| 13883|
|     2|      52| 11690|
|     2|      53| 11351|
|     2|      54| 10300|
|     2|      55|  8983|
|     2|      56|  6152|
|     2|      57|  5955|
|     2|      58|  4616|
|     2|      59|  4337|
|     2|      60|  4147|
|     2|      61|  3923|
|     2|      62|  3782|
|     2|      63|  3735|
|     2|      64|  3644|
|     2|      65|  3579|
|     2|      66|  3312|
|     2|      67|  3301|
|     2|      68|  2927|
|     2|      69|  2720|
|     2|      70|  2686|
+------+--------+------+
only showing top 20 rows



In [8]:
lc_df.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- artistID: integer (nullable = true)
 |-- weight: integer (nullable = true)



In [9]:
lc_df.count()

92834

In [10]:
listening_count_df = []
    
for i in range(0, 3):
    lc_file_path = os.path.join(dataset_path, 'batch/batch' + str(i) + '.txt')
    listening_count_df.append(spark.read.csv(lc_file_path, header=None, inferSchema=True).na.drop())
    listening_count_df[i] = listening_count_df[i].selectExpr("_c0 as user_id", "_c1 as artist_id", "_c2 as weight")

In [11]:
listening_count_df[1].show()

+-------+---------+------+
|user_id|artist_id|weight|
+-------+---------+------+
|    677|     6258|   872|
|    677|     6658|   212|
|    677|     7917|   222|
|    677|     7918|   226|
|    677|     9207|  1355|
|    677|     9208|   800|
|    677|     9209|   423|
|    677|     9210|   321|
|    677|     9211|   317|
|    677|     9212|   300|
|    677|     9213|   271|
|    677|     9214|   258|
|    677|     9215|   232|
|    677|     9216|   228|
|    677|     9217|   214|
|    677|     9218|   166|
|    677|     9219|   166|
|    677|     9220|   164|
|    677|     9221|   159|
|    677|     9222|   158|
+-------+---------+------+
only showing top 20 rows



### 2.1 Artists Data

In [12]:
# Load the dataset
artists_file = "/Users/mocatfrio/Documents/big-data/final-project/app/lastfm-dataset/csv/artists.csv"
artists_df = spark.read.load(artists_file, format="csv", sep=",", inferSchema="true", header="true")

In [13]:
# Show dataset
artists_df.show()

+---+--------------------+--------------------+--------------------+
| id|                name|                 url|          pictureURL|
+---+--------------------+--------------------+--------------------+
|  1|        MALICE MIZER|http://www.last.f...|http://userserve-...|
|  2|     Diary of Dreams|http://www.last.f...|http://userserve-...|
|  3|   Carpathian Forest|http://www.last.f...|http://userserve-...|
|  4|        Moi dix Mois|http://www.last.f...|http://userserve-...|
|  5|         Bella Morte|http://www.last.f...|http://userserve-...|
|  6|           Moonspell|http://www.last.f...|http://userserve-...|
|  7|      Marilyn Manson|http://www.last.f...|http://userserve-...|
|  8|         DIR EN GREY|http://www.last.f...|http://userserve-...|
|  9|         Combichrist|http://www.last.f...|http://userserve-...|
| 10|             Grendel|http://www.last.f...|http://userserve-...|
| 11|            Agonoize|http://www.last.f...|http://userserve-...|
| 12|            Behemoth|http://w

In [14]:
artists_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- url: string (nullable = true)
 |-- pictureURL: string (nullable = true)



In [15]:
artists_df.count()

17632

## 3. Collaborative Filtering using ALS (Alternating Least Squares) Algorithm
**Collaborative filtering** is commonly used for recommender systems. These techniques aim to fill in the missing entries of a user-item association matrix. `spark.ml` library uses the **Alternating least squares (ALS) algorithm** to learn these latent factors. We can evaluate the recommendation model by measuring the root-mean-square error of rating prediction. 

In [16]:
# Import ALS algorithm
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [17]:
# Split ratings data become training set and test set
(training, test) = lc_df.randomSplit([0.8, 0.2])

In [18]:
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="artist_id", ratingCol="weight", coldStartStrategy="drop")
model = []
new_df = []
new_df.append(listening_count_df[0])
new_df.append(new_df[0].union(listening_count_df[1]))
new_df.append(new_df[1].union(listening_count_df[2]))

for i in range(0, 3):
    model.append(als.fit(new_df[i]))
    print('{} : {}'.format(i, new_df[i].count()))

0 : 31000
1 : 62000
2 : 92834


In [19]:
# Generate top 10 books recommendations for each user 
userRecs = model[0].recommendForAllUsers(10)
userRecs.show(truncate=False)

+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                          |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471    |[[1672, 1858.2396], [265, 1704.5765], [3478, 1609.8387], [72, 1431.117], [2044, 1300.3689], [227, 1177.0496], [1673, 1112.5366], [1081, 1096.8915], [1643, 1037.8265], [279, 1031.5619]] |
|463    |[[187, 1407.3424], [458, 1085.1812], [289, 763.30255], [3280, 699.84204], [2548, 665.5842], [707, 651.5114], [2973, 605.8409], [503, 582.6389], [917, 580.27014], [51, 570.3952]]        |
|496    |[[3501, 655

In [20]:
# Generate top 10 books recommendations for each user 
userRecs = model[1].recommendForAllUsers(10)
userRecs.show(truncate=False)

+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                             |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471    |[[378, 2856.579], [3478, 2451.587], [3501, 2394.5999], [687, 2271.2212], [701, 2142.7366], [6410, 2016.6627], [187, 1880.1895], [233, 1739.5109], [5267, 1636.7878], [889, 1617.4727]]      |
|1342   |[[163, 30968.682], [6410, 26323.477], [3198, 22548.0], [101, 22042.18], [51, 20480.262], [377, 15251.57], [486, 14937.672], [1984, 14574.154], [458, 13632.558], [187, 13546.45]]           |
|463 

In [21]:
# Generate top 10 books recommendations for each user 
userRecs = model[2].recommendForAllUsers(10)
userRecs.show(truncate=False)

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                            |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580   |[[792, 41373.62], [687, 13041.467], [8388, 10257.295], [72, 7509.628], [2548, 6131.05], [4230, 6117.7935], [6150, 5420.0547], [187, 5264.0205], [679, 4917.1704], [1114, 4906.021]]        |
|471    |[[687, 4389.3516], [1673, 4003.4268], [5391, 3879.8525], [8388, 3828.3804], [14986, 3754.507], [486, 3674.5225], [14987, 3445.4854], [378, 3279.7905], [3478, 3160.066], [6410, 3026.4084]]|
|1591   |[

## 6. References

* https://spark.apache.org/docs/latest/ml-collaborative-filtering.html