# Importing Spark

In [None]:
!pip install pyspark
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Python Spark").getOrCreate()
sc = spark.sparkContext

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 68 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 46.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=4eb99becd3a6b03ab86d4b39d368ca975de5385a980d244d1e5c888f19435efa
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


# Loading the data

In [None]:
# Read the tab-separated ratings file: the first line supplies the column
# names and Spark infers each column's type from the data.
df_ratings = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .option("inferSchema", "true")
    .csv('u.data')
)

In [None]:
# Print the column names and inferred types to check the file was parsed as intended.
df_ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



# Exercise 1 - Constructing the recommender system

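- Split the ratings into a training set and a test set
- Fit an ALS model on the training set
- Predict ratings for the test set and evaluate them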

In [None]:
from pyspark.ml.recommendation import ALS

# Hold out 20% of the ratings for evaluation. The split is seeded (like the
# ALS estimator below) so the same rows land in train/test on every run and
# the reported metrics are reproducible.
train, test = df_ratings.randomSplit([0.8, 0.2], seed=0)

# Matrix factorisation with 10 latent factors per user and per item.
# coldStartStrategy="drop" removes test rows whose prediction would be NaN
# (a user or item with no factors in the fitted model), so that the
# evaluation metric is not NaN.
als = ALS(rank=10, seed=0, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(train)

# Name the output column explicitly; later cells refer to "newPrediction".
model.setPredictionCol("newPrediction")
prediction = model.transform(test)

In [None]:
# Peek at the first few scored test rows (true rating next to the model's estimate).
prediction.show(n=5)

+-------+-------+------+---------+-------------+
|user_id|item_id|rating|timestamp|newPrediction|
+-------+-------+------+---------+-------------+
|     26|    148|     3|891377540|     2.591428|
|    601|    148|     3|876348140|    1.7689452|
|    190|    148|     4|891033742|    3.1869822|
|    224|    148|     3|888104154|    3.2157774|
|    435|    148|     3|884133284|     3.086592|
+-------+-------+------+---------+-------------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error between the observed ratings and the model's
# predictions on the held-out rows.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="newPrediction",
)
rmse = evaluator.evaluate(prediction)
rmse

0.9168445404827733

- To show that you understand how ALS works, retrieve one user vector and one movie vector from the fitted model and compute the prediction yourself using numpy.

In [None]:
import numpy as np

# Show a sample of the learned item factor vectors.
model.itemFactors.show(n=5)

# Fetch the latent vector of user 26 from the fitted model.
user26_f = (
    model.userFactors
    .where(model.userFactors.id == 26)
    .select("features")
    .collect()[0]["features"]
)

# Fetch the latent vector of item 148 from the fitted model.
item148_f = (
    model.itemFactors
    .where(model.itemFactors.id == 148)
    .select("features")
    .collect()[0]["features"]
)

# The predicted rating is the dot product of the two vectors.
np.dot(user26_f, item148_f)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.90370226, -0.6...|
| 20|[1.4221983, -0.32...|
| 30|[0.8560657, -0.34...|
| 40|[0.78859633, -0.0...|
| 50|[0.66837937, 0.23...|
+---+--------------------+
only showing top 5 rows



2.591428205926537

In [1]:
# Score one hand-picked (user, item) pair with the fitted model.
# This cell relies on `spark` and `model` from the cells above, so those
# cells must have been run in the current kernel first.

from pyspark.sql.types import (
    ArrayType,
    DecimalType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# One row: user 10 paired with item 10.
data = [(10, 10)]

# Column names must match the userCol / itemCol the ALS model was trained with.
schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('item_id', IntegerType(), True)
])

# createDataFrame accepts a local list of tuples directly, so there is no need
# to build an RDD with sparkContext.parallelize first.
df = spark.createDataFrame(data, schema)
model.transform(df).show()

ModuleNotFoundError: ignored