# Importing Spark

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!chmod 755  /content/drive/MyDrive/spark-3.0.1-bin-hadoop2.7/bin/*

In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/drive/MyDrive/spark-3.0.1-bin-hadoop2.7"

In [6]:
!pip install -q findspark

In [7]:
import findspark
findspark.init()

In [9]:
# import findspark
# findspark.init()
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Python Spark").getOrCreate()
sc = spark.sparkContext

# Loading the data

In [10]:
df_ratings = spark.read\
    .option("delimiter", "\t")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .csv('/content/drive/MyDrive/data/u.data')

# Exercice 1 - constructing the recommender system

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [12]:
df_ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



In [13]:
training, test = df_ratings.randomSplit([0.8, 0.2])

In [14]:
als = ALS(
    maxIter=5, regParam=0.05, userCol="user_id", itemCol="item_id", ratingCol="rating", 
    coldStartStrategy="drop", rank=10
)
model = als.fit(training)

In [15]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)

In [16]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9520318512095949


In [17]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [19]:
userRecs.take(10)

[Row(user_id=471, recommendations=[Row(item_id=1192, rating=7.104902744293213), Row(item_id=1391, rating=7.078414440155029), Row(item_id=1446, rating=6.914592266082764), Row(item_id=1389, rating=6.732880592346191), Row(item_id=1166, rating=6.69061803817749), Row(item_id=776, rating=6.686971187591553), Row(item_id=769, rating=6.648403167724609), Row(item_id=450, rating=6.627342700958252), Row(item_id=884, rating=6.576409816741943), Row(item_id=1155, rating=6.451395034790039)]),
 Row(user_id=463, recommendations=[Row(item_id=867, rating=4.85713529586792), Row(item_id=697, rating=4.76533317565918), Row(item_id=889, rating=4.761417388916016), Row(item_id=557, rating=4.660772800445557), Row(item_id=59, rating=4.552568435668945), Row(item_id=1131, rating=4.4769287109375), Row(item_id=1005, rating=4.4649553298950195), Row(item_id=61, rating=4.452428817749023), Row(item_id=1623, rating=4.447282791137695), Row(item_id=20, rating=4.4351630210876465)]),
 Row(user_id=833, recommendations=[Row(item

In [20]:
user_npy = model.userFactors.filter('id==10').collect()[0].features

In [21]:
item_npy = model.itemFactors.filter('id==10').collect()[0].features

In [22]:
import numpy as np
np.dot(user_npy, item_npy)

4.053353155526006

In [23]:
from pyspark.sql.types import ArrayType, StructField,\
StructType, StringType, IntegerType, DecimalType

In [24]:
data = [(10, 10)]
schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('item_id', IntegerType(), True)
])

df = spark.createDataFrame(spark.sparkContext.parallelize(data),schema)

In [25]:
model.transform(df).show()

+-------+-------+----------+
|user_id|item_id|prediction|
+-------+-------+----------+
|     10|     10| 4.0533533|
+-------+-------+----------+

