# Importing Spark

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 61kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 43.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=621a63c014b9647696bcd3b696c9265292dd189981603dd5289f9028dbe04744
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [3]:
import pyspark

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName("Python Spark").getOrCreate()

# Loading the data

In [5]:
df_ratings = spark.read\
    .option("delimiter", "\t")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .csv('/content/drive/MyDrive/data/u.data')

In [6]:
# print the dataframe schema
df_ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



In [8]:
# show a sample of the data (the dataframe executes the whole pipeline at this stage)
df_ratings.show(5)

+-------+-------+------+---------+
|user_id|item_id|rating|timestamp|
+-------+-------+------+---------+
|    196|    242|     3|881250949|
|    186|    302|     3|891717742|
|     22|    377|     1|878887116|
|    244|     51|     2|880606923|
|    166|    346|     1|886397596|
+-------+-------+------+---------+
only showing top 5 rows



In [9]:
# set the rdd equivalent of the dataframe
rdd_ratings = df_ratings.rdd

# Basic Queries

#### Exercice 1 - Number of movies per user (using RDD then Dataframe)

Calculer pour chaque utilisateur le nombre de films notés, et afficher le résultat pour l'un d'entre eux. Utilisez dans un premier temps les RDD puis les Dataframes.

In [10]:
# classical RDD approach
result_1 = rdd_ratings.map(lambda r: (r[0], 1)).reduceByKey(lambda v1, v2: v1 + v2).take(1)
print(result_1)

[(196, 39)]


In [11]:
# dataframe approach, filtering on the previous user to compare results
df_ratings.filter(df_ratings['user_id']==result_1[0][0])\
    .groupBy('user_id')\
    .count()\
    .show(1)

+-------+-----+
|user_id|count|
+-------+-----+
|    196|   39|
+-------+-----+



#### Exercice 2 - Average rating per user (using RDD then Dataframe)

Calculer pour chaque utilisateurs la note moyenne donnée et afficher le résultat pour l'un d'entre eux. Utilisez dans un premier temps les RDD puis les Dataframes.

In [12]:
# classical RDD approach
rdd_map = rdd_ratings.map(lambda r: (r[0], int(r[2])))
rdd_agg = rdd_map = rdd_map.aggregateByKey(
    (0, 0), 
    lambda a,b: (a[0] + b,    a[1] + 1), 
    lambda a,b: (a[0] + b[0], a[1] + b[1])
)

rdd_result = rdd_agg.mapValues(lambda v: float(v[0])/v[1])

result_1 = rdd_result.take(1)
print(result_1)

[(196, 3.6153846153846154)]


In [13]:
# dataframe approach
from pyspark.sql.functions import avg

df_ratings.filter(df_ratings['user_id']==result_1[0][0])\
    .groupBy('user_id')\
    .agg(avg('rating'))\
    .show(1)

+-------+------------------+
|user_id|       avg(rating)|
+-------+------------------+
|    196|3.6153846153846154|
+-------+------------------+



#### Exercice 3 - Top-5 movies with at least 15 votes (Dataframe)

Afficher les 5 meilleurs films parmi ceux qui ont reçu au moins 15 votes.

*Indices:*
* Utiliser df_ratings pour calculer la moyenne, filtrer les films qui ont moins de 15 notes et classer les films par ordre décroissant.
* Faire un join avec df_movies pour afficher le nom des films sélectionnés.

In [15]:
df_items = spark.read\
    .option("delimiter", "|")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .csv('/content/drive/MyDrive/data/u.item')

In [16]:
from pyspark.sql.functions import avg, count, col

df_gb = df_ratings.groupBy('item_id')\
    .agg(avg('rating'), count('item_id').alias('count'))

In [17]:
df_gb = df_gb.filter(df_gb['count'] >= 15).sort("avg(rating)", ascending=False)

In [18]:
# join with actual movie features
df_join = df_gb.join(df_items, df_gb['item_id']==df_items['movie_id'])

In [19]:
df_join.select(col("movie_title"), col("avg(rating)")).show(5)  # Java 8 (does not work with Java 12)

+--------------------+------------------+
|         movie_title|       avg(rating)|
+--------------------+------------------+
|It's a Wonderful ...| 4.121212121212121|
|Courage Under Fir...|3.6108597285067874|
|Secret of Roan In...| 3.859154929577465|
|Ghost and the Dar...|          3.203125|
|  Bulletproof (1996)| 3.204081632653061|
+--------------------+------------------+
only showing top 5 rows

