# Importing Spark

In [1]:
# import findspark
# findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that is already
# running) and keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Python Spark")
    .getOrCreate()
)
sc = spark.sparkContext

# Loading the data

In [2]:
# Read the tab-separated ratings file: the first row is treated as the header
# and Spark infers each column's type from the data.
df_ratings = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .option("inferSchema", "true")
    .csv('data/u.data')
)

# Exercise 1 - constructing the recommender system

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [4]:
# Print the column names and inferred types of the ratings DataFrame.
df_ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



In [5]:
# Split the ratings into roughly 80% training / 20% test data.
# The seed is fixed so that the split (and every metric computed from it)
# is repeatable when the notebook is re-run.
training, test = df_ratings.randomSplit([0.8, 0.2], seed=42)

In [6]:
# Alternating least squares (ALS) matrix factorisation on the explicit ratings.
# coldStartStrategy="drop" discards rows whose prediction is NaN (users/items
# not seen during training) so they cannot turn the evaluation metric into NaN.
# The seed fixes the random initialisation of the factors for repeatable runs.
als = ALS(
    rank=10,
    maxIter=5,
    regParam=0.05,
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [7]:
# RMSE evaluator comparing the true "rating" column with the model's
# "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
# Score the held-out ratings with the fitted ALS model.
predictions = model.transform(test)

In [8]:
# Evaluate the test-set predictions and report the resulting RMSE.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.9602474330257212


In [9]:
# Top-10 lists in both directions, taken from the fitted model:
# the 10 highest-scoring movies for every user ...
userRecs = model.recommendForAllUsers(numItems=10)
# ... and the 10 highest-scoring users for every movie.
movieRecs = model.recommendForAllItems(numUsers=10)

In [10]:
# NOTE(review): scratch cell - the result is not assigned to anything;
# it looks like a leftover sanity check and is a candidate for removal.
1+1

2

In [22]:
# Latent factor vector that the fitted ALS model holds for user id 10.
user_row = model.userFactors.filter('id==10').collect()[0]
user_npy = user_row.features

In [23]:
# Latent factor vector that the fitted ALS model holds for item id 10.
item_row = model.itemFactors.filter('id==10').collect()[0]
item_npy = item_row.features

In [26]:
import numpy as np
# Dot product of the user-10 and item-10 factor vectors. This is a manual
# check of the model's prediction for that pair: compare the value with the
# output of the model.transform(df) cell further down.
np.dot(user_npy, item_npy)

3.8694755134008254

In [28]:
from pyspark.sql.types import ArrayType, StructField,\
StructType, StringType, IntegerType, DecimalType

In [31]:
# A single (user, item) pair to score with the fitted model: user 10, item 10.
data = [(10, 10)]
# Column names match the userCol / itemCol the ALS estimator was configured with.
schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('item_id', IntegerType(), True),
])

# createDataFrame accepts a local list of tuples directly, so there is no
# need to go through sparkContext.parallelize first.
df = spark.createDataFrame(data, schema)

In [33]:
# Let the model predict the rating for the (user 10, item 10) pair and show it.
model.transform(df).show()

+-------+-------+----------+
|user_id|item_id|prediction|
+-------+-------+----------+
|     10|     10| 3.8694754|
+-------+-------+----------+

