# Homework 9: Part I

## Create Spark Session

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName('Homework9Part1')\
        .config('spark.driver.maxResultSize', '10g')\
        .config('spark.executor.memory' ,'10g')\
        .config('spark.driver.memory', '10g')\
        .getOrCreate()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

## Import the train and test dataset
* Train: `trainIdx2_matrix.txt`
* Test: `testTrack_hierarchy.txt`

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

train_schema = StructType([
    StructField('userId', IntegerType()),
    StructField('itemId', IntegerType()),
    StructField('rating', DoubleType())
])

train = spark.read.csv('trainIdx2_matrix.txt',
                       sep='|',
                       header=False,
                       schema=train_schema
                      )

train.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [3]:
train.show(5)

+------+------+------+
|userId|itemId|rating|
+------+------+------+
|199808|248969|  90.0|
|199808|  2663|  90.0|
|199808| 28341|  90.0|
|199808| 42563|  90.0|
|199808| 59092|  90.0|
+------+------+------+
only showing top 5 rows



In [4]:
test_schema = StructType([
    StructField('userId', IntegerType()),
    StructField('trackId', IntegerType()),
    StructField('albumId', IntegerType()),
    StructField('artistId', IntegerType()),
    StructField('genreId_1', IntegerType()),
    StructField('genreId_2', IntegerType()),
    StructField('genreId_3', IntegerType()),
    StructField('genreId_4', IntegerType()),
    StructField('genreId_5', IntegerType()),
    StructField('genreId_6', IntegerType()),
    StructField('genreId_7', IntegerType()),
    StructField('genreId_8', IntegerType()),
    StructField('genreId_9', IntegerType()),
    StructField('genreId_10', IntegerType()),
    StructField('genreId_11', IntegerType()),
    StructField('genreId_12', IntegerType()),
    StructField('genreId_13', IntegerType()),
    StructField('genreId_14', IntegerType()),
    StructField('genreId_15', IntegerType()),
    StructField('genreId_16', IntegerType()),
    StructField('genreId_17', IntegerType()),
    StructField('genreId_18', IntegerType()),
    StructField('genreId_19', IntegerType()),
    StructField('genreId_20', IntegerType()),
    StructField('genreId_21', IntegerType()),
])

test = spark.read.csv('testTrack_hierarchy.txt',
                     sep='|',
                     nullValue='None',
                     header=False,
                     schema=test_schema)

test.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genreId_1: integer (nullable = true)
 |-- genreId_2: integer (nullable = true)
 |-- genreId_3: integer (nullable = true)
 |-- genreId_4: integer (nullable = true)
 |-- genreId_5: integer (nullable = true)
 |-- genreId_6: integer (nullable = true)
 |-- genreId_7: integer (nullable = true)
 |-- genreId_8: integer (nullable = true)
 |-- genreId_9: integer (nullable = true)
 |-- genreId_10: integer (nullable = true)
 |-- genreId_11: integer (nullable = true)
 |-- genreId_12: integer (nullable = true)
 |-- genreId_13: integer (nullable = true)
 |-- genreId_14: integer (nullable = true)
 |-- genreId_15: integer (nullable = true)
 |-- genreId_16: integer (nullable = true)
 |-- genreId_17: integer (nullable = true)
 |-- genreId_18: integer (nullable = true)
 |-- genreId_19: integer (nullable = true)
 |-- genreId_20: integer (n

In [5]:
test.select('userId', 'trackId', 'artistId', 'genreId_1', 'genreId_2', 'genreId_3', 'genreId_4', 'genreId_5').show(5)

+------+-------+--------+---------+---------+---------+---------+---------+
|userId|trackId|artistId|genreId_1|genreId_2|genreId_3|genreId_4|genreId_5|
+------+-------+--------+---------+---------+---------+---------+---------+
|199810| 208019|    null|     null|     null|     null|     null|     null|
|199810|  74139|  271146|   113360|   173467|   173655|   192976|   146792|
|199810|   9903|    null|    33722|   123396|    79926|    73523|     null|
|199810| 242681|  244574|    61215|    17453|   274088|     null|     null|
|199810|  18515|   33168|    19913|    48505|   154024|     null|     null|
+------+-------+--------+---------+---------+---------+---------+---------+
only showing top 5 rows



## Only include the userIds that exist in the test dataset within the train dataset for the ALS model

### Get unique users

In [6]:
test_unique_users = test.select('userId').distinct().coalesce(1)

test_unique_users.show(5)
print('The number of unique users: ', test_unique_users.count())

+------+
|userId|
+------+
|199855|
|199976|
|200166|
|200625|
|200878|
+------+
only showing top 5 rows

The number of unique users:  20000


### Filter the trainset to only include userIds that are included within the test dataset

In [7]:
from pyspark.sql.functions import col

spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')

train = train.toPandas()

train = train[train.userId.isin(test_unique_users.toPandas().userId)]

train = spark.createDataFrame(train).repartition('userId')

In [8]:
train.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [9]:
train.show(5)

+------+------+------+
|userId|itemId|rating|
+------+------+------+
|199855| 33722|   0.0|
|199855|274161|   0.0|
|199855|113360|   0.0|
|199855|149962|   0.0|
|199855|155264|   0.0|
+------+------+------+
only showing top 5 rows



## Train data summary statistics

In [10]:
train.describe().show()

+-------+------------------+-----------------+------------------+
|summary|            userId|           itemId|            rating|
+-------+------------------+-----------------+------------------+
|  count|          10643437|         10643437|          10643437|
|   mean|224380.43626321084|149126.9231043506|47.600189769526516|
| stddev|14393.139199046505|85467.79849512114| 37.99677952998833|
|    min|            199810|                0|               0.0|
|    max|            249010|           296110|             100.0|
+-------+------------------+-----------------+------------------+



## Build ALS Model

In [11]:
from pyspark.ml.recommendation import ALS

als = ALS(userCol='userId', 
          itemCol='itemId',
          ratingCol='rating', 
          rank=5,
          maxIter= 5,
          regParam=0.01,
          nonnegative = True, 
          implicitPrefs = False)

### Train the best model

In [12]:
model = als.fit(train)

In [13]:
train_results = model.transform(train)

In [14]:
train_results.show(5)

+------+------+------+----------+
|userId|itemId|rating|prediction|
+------+------+------+----------+
|205890|   148|  90.0|  67.68551|
|216277|   148|  90.0|  73.53194|
|241707|   148|  60.0|  75.45129|
|226963|   148|  90.0|  65.46061|
|206707|   148|  70.0|  65.30318|
+------+------+------+----------+
only showing top 5 rows



### Training summary

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

print('RMSE: ', evaluator.evaluate(train_results))

RMSE:  28.553197559911332


In [16]:
train_results.select('rating').describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          10643437|
|   mean|47.600189769526516|
| stddev|  37.9967795299883|
|    min|               0.0|
|    max|             100.0|
+-------+------------------+



## Make predictions for track, album, and artist on the test data

### Check for null values within `trackId`, `albumId`, and `artistId` columns

In [17]:
print('Number of null trackId: ', test.filter('trackId IS NULL').count())
print('Number of null albumId: ', test.filter('albumId IS NULL').count())
print('Number of null artistId: ', test.filter('artistId IS NULL').count())

Number of null trackId:  0
Number of null albumId:  8572
Number of null artistId:  10891


### Make predictions on track ratings

In [18]:
prediction_track = model.setItemCol('trackId').transform(test)

### Make predictions on album ratings

In [19]:
prediction_album = model.setItemCol('albumId').transform(test.filter('albumId IS NOT NULL'))

### Make predictions on artist ratings 

In [20]:
prediction_artist = model.setItemCol('artistId').transform(test.filter('artistId IS NOT NULL'))

## Add all ratings to the same dataframe, `predictions`

In [21]:
from pyspark.sql.types import IntegerType

predictions = test.select('userId', 'trackId')

predictions = predictions.join(prediction_track.select('userId', 'trackId', 'prediction'), ['userId', 'trackId'], 'left')
predictions = predictions.withColumn('prediction', predictions['prediction'].cast(IntegerType()))\
                         .withColumnRenamed('prediction', 'track_rating')

In [22]:
predictions = predictions.join(prediction_album.select('userId', 'trackId', 'prediction'), ['userId', 'trackId'], 'left')

predictions = predictions.withColumn('prediction', predictions['prediction'].cast(IntegerType()))\
                         .withColumnRenamed('prediction', 'album_rating')

In [24]:
predictions = predictions.join(prediction_artist.select('userId', 'trackId', 'prediction'), ['userId', 'trackId'], 'left')

predictions = predictions.withColumn('prediction', predictions['prediction'].cast(IntegerType()))\
                         .withColumnRenamed('prediction', 'artist_rating')

In [25]:
predictions.show(5)

+------+-------+------------+------------+-------------+
|userId|trackId|track_rating|album_rating|artist_rating|
+------+-------+------------+------------+-------------+
|200072|  29894|          67|          66|           75|
|200124| 162126|          11|          11|           12|
|200174| 137908|          45|          33|           31|
|200400| 263168|          67|          60|           29|
|200427|  82634|          73|          78|           61|
+------+-------+------------+------------+-------------+
only showing top 5 rows



### Check for null values within the `track_rating`, `album_rating`, and `artist_rating`

In [26]:
print('Number of null track_rating: ', predictions.filter('track_rating IS NULL').count())
print('Number of null album_rating: ', predictions.filter('album_rating IS NULL').count())
print('Number of null artist_rating: ', predictions.filter('artist_rating IS NULL').count())

Number of null track_rating:  0
Number of null album_rating:  8572
Number of null artist_rating:  10891


### Replace null values within `album_rating` and `artist_rating` with `0`

In [27]:
predictions = predictions.na.fill(0)

## Count the number of genres per userId-trackId pair

In [29]:
test_genres = test.select('userId', 
                          'trackId', 
                          'genreId_1', 
                          'genreId_2', 
                          'genreId_3', 
                          'genreId_4', 
                          'genreId_5', 
                          'genreId_6', 
                          'genreId_7', 
                          'genreId_8', 
                          'genreId_9', 
                          'genreId_10', 
                          'genreId_11', 
                          'genreId_12', 
                          'genreId_13', 
                          'genreId_14',
                          'genreId_15',
                          'genreId_16',
                          'genreId_17',
                          'genreId_18',
                          'genreId_19',
                          'genreId_20',
                          'genreId_21')

In [32]:
from pyspark.sql.functions import isnull

num_genres = test_genres.select('userId', 'trackId', sum([isnull(test_genres[col]).cast(IntegerType()) for col in test_genres.columns]).alias('num_genres'))

In [33]:
num_genres.show(5)

+------+-------+----------+
|userId|trackId|num_genres|
+------+-------+----------+
|199810| 208019|        21|
|199810|  74139|        14|
|199810|   9903|        17|
|199810| 242681|        18|
|199810|  18515|        18|
+------+-------+----------+
only showing top 5 rows



### Add the number of genres into `predictions` dataframe, 

In [34]:
predictions = predictions.join(num_genres, ['userId', 'trackId'], 'left')

In [35]:
predictions.show(5)

+------+-------+------------+------------+-------------+----------+
|userId|trackId|track_rating|album_rating|artist_rating|num_genres|
+------+-------+------------+------------+-------------+----------+
|200072|  29894|          67|          66|           75|        16|
|200124| 162126|          11|          11|           12|        18|
|200174| 137908|          45|          33|           31|        16|
|200400| 263168|          67|          60|           29|        20|
|200427|  82634|          73|          78|           61|        18|
+------+-------+------------+------------+-------------+----------+
only showing top 5 rows



## Write `predictions` to csv

In [38]:
predictions.coalesce(1).write.csv('ratings.csv', header=True)

## Import the track, album, and artist data
* Track Data: `trackData2.txt`
* Album Data: `albumData2.txt`
* Artist Data: `artistData2.txt`
* Genre Data: `genreData2.txt`

In [14]:
track_schema = StructType([
    StructField('trackId', IntegerType()),
    StructField('albumId', IntegerType()),
    StructField('artistId', IntegerType()),
    StructField('genreId_1', IntegerType()),
    StructField('genreId_2', IntegerType()),
    StructField('genreId_3', IntegerType()),
    StructField('genreId_4', IntegerType()),
    StructField('genreId_5', IntegerType()),
    StructField('genreId_6', IntegerType()),
    StructField('genreId_7', IntegerType()),
    StructField('genreId_8', IntegerType()),
    StructField('genreId_9', IntegerType()),
    StructField('genreId_10', IntegerType()),
    StructField('genreId_11', IntegerType()),
    StructField('genreId_12', IntegerType()),
    StructField('genreId_13', IntegerType()),
    StructField('genreId_14', IntegerType()),
    StructField('genreId_15', IntegerType()),
    StructField('genreId_16', IntegerType()),
    StructField('genreId_17', IntegerType()),
    StructField('genreId_18', IntegerType()),
    StructField('genreId_19', IntegerType()),
    StructField('genreId_20', IntegerType()),
    StructField('genreId_21', IntegerType()),
])

track = spark.read.csv('trackData2.txt',
                     sep='|',
                     nullValue='None',
                     header=False,
                     schema=track_schema)

track.printSchema()

root
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genreId_1: integer (nullable = true)
 |-- genreId_2: integer (nullable = true)
 |-- genreId_3: integer (nullable = true)
 |-- genreId_4: integer (nullable = true)
 |-- genreId_5: integer (nullable = true)
 |-- genreId_6: integer (nullable = true)
 |-- genreId_7: integer (nullable = true)
 |-- genreId_8: integer (nullable = true)
 |-- genreId_9: integer (nullable = true)
 |-- genreId_10: integer (nullable = true)
 |-- genreId_11: integer (nullable = true)
 |-- genreId_12: integer (nullable = true)
 |-- genreId_13: integer (nullable = true)
 |-- genreId_14: integer (nullable = true)
 |-- genreId_15: integer (nullable = true)
 |-- genreId_16: integer (nullable = true)
 |-- genreId_17: integer (nullable = true)
 |-- genreId_18: integer (nullable = true)
 |-- genreId_19: integer (nullable = true)
 |-- genreId_20: integer (nullable = true)
 |-- genreId_21: intege

In [15]:
album_schema = StructType([
    StructField('albumId', IntegerType()),
    StructField('artistId', IntegerType()),
    StructField('genreId_1', IntegerType()),
    StructField('genreId_2', IntegerType()),
    StructField('genreId_3', IntegerType()),
    StructField('genreId_4', IntegerType()),
    StructField('genreId_5', IntegerType()),
    StructField('genreId_6', IntegerType()),
    StructField('genreId_7', IntegerType()),
    StructField('genreId_8', IntegerType()),
    StructField('genreId_9', IntegerType()),
    StructField('genreId_10', IntegerType()),
    StructField('genreId_11', IntegerType()),
    StructField('genreId_12', IntegerType()),
    StructField('genreId_13', IntegerType()),
    StructField('genreId_14', IntegerType()),
    StructField('genreId_15', IntegerType()),
    StructField('genreId_16', IntegerType()),
    StructField('genreId_17', IntegerType()),
    StructField('genreId_18', IntegerType()),
    StructField('genreId_19', IntegerType()),
    StructField('genreId_20', IntegerType()),
    StructField('genreId_21', IntegerType()),
])

album = spark.read.csv('albumData2.txt',
                     sep='|',
                     nullValue='None',
                     header=False,
                     schema=album_schema)

album.printSchema()

root
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genreId_1: integer (nullable = true)
 |-- genreId_2: integer (nullable = true)
 |-- genreId_3: integer (nullable = true)
 |-- genreId_4: integer (nullable = true)
 |-- genreId_5: integer (nullable = true)
 |-- genreId_6: integer (nullable = true)
 |-- genreId_7: integer (nullable = true)
 |-- genreId_8: integer (nullable = true)
 |-- genreId_9: integer (nullable = true)
 |-- genreId_10: integer (nullable = true)
 |-- genreId_11: integer (nullable = true)
 |-- genreId_12: integer (nullable = true)
 |-- genreId_13: integer (nullable = true)
 |-- genreId_14: integer (nullable = true)
 |-- genreId_15: integer (nullable = true)
 |-- genreId_16: integer (nullable = true)
 |-- genreId_17: integer (nullable = true)
 |-- genreId_18: integer (nullable = true)
 |-- genreId_19: integer (nullable = true)
 |-- genreId_20: integer (nullable = true)
 |-- genreId_21: integer (nullable = true)



In [16]:
artist_schema = StructType([
    StructField('artistId', IntegerType()),
    StructField('genreId_1', IntegerType()),
    StructField('genreId_2', IntegerType()),
    StructField('genreId_3', IntegerType()),
    StructField('genreId_4', IntegerType()),
    StructField('genreId_5', IntegerType()),
    StructField('genreId_6', IntegerType()),
    StructField('genreId_7', IntegerType()),
    StructField('genreId_8', IntegerType()),
    StructField('genreId_9', IntegerType()),
    StructField('genreId_10', IntegerType()),
    StructField('genreId_11', IntegerType()),
    StructField('genreId_12', IntegerType()),
    StructField('genreId_13', IntegerType()),
    StructField('genreId_14', IntegerType()),
    StructField('genreId_15', IntegerType()),
    StructField('genreId_16', IntegerType()),
    StructField('genreId_17', IntegerType()),
    StructField('genreId_18', IntegerType()),
    StructField('genreId_19', IntegerType()),
    StructField('genreId_20', IntegerType()),
    StructField('genreId_21', IntegerType()),
])

artist = spark.read.csv('artistData2.txt',
                     sep='|',
                     nullValue='None',
                     header=False,
                     schema=artist_schema)

album.printSchema()

root
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genreId_1: integer (nullable = true)
 |-- genreId_2: integer (nullable = true)
 |-- genreId_3: integer (nullable = true)
 |-- genreId_4: integer (nullable = true)
 |-- genreId_5: integer (nullable = true)
 |-- genreId_6: integer (nullable = true)
 |-- genreId_7: integer (nullable = true)
 |-- genreId_8: integer (nullable = true)
 |-- genreId_9: integer (nullable = true)
 |-- genreId_10: integer (nullable = true)
 |-- genreId_11: integer (nullable = true)
 |-- genreId_12: integer (nullable = true)
 |-- genreId_13: integer (nullable = true)
 |-- genreId_14: integer (nullable = true)
 |-- genreId_15: integer (nullable = true)
 |-- genreId_16: integer (nullable = true)
 |-- genreId_17: integer (nullable = true)
 |-- genreId_18: integer (nullable = true)
 |-- genreId_19: integer (nullable = true)
 |-- genreId_20: integer (nullable = true)
 |-- genreId_21: integer (nullable = true)



In [105]:
genre_schema = StructType([
    StructField('genreId', IntegerType()),
])

genre = spark.read.csv('genreData2.txt',
                     sep='|',
                     nullValue='None',
                     header=False,
                     schema=genre_schema)

genre.printSchema()

root
 |-- genreId: integer (nullable = true)



### Find the `track`, `album`, `artist`, and `genre` hierarchy of the test data

In [17]:
train_ids = train.select('userId', 'itemId', 'rating').coalesce(1)

In [18]:
from pyspark.sql.functions import monotonically_increasing_id

train_ids = train_ids.withColumn('id', monotonically_increasing_id()).persist()

train_ids.show(5)

+------+------+------+---+
|userId|itemId|rating| id|
+------+------+------+---+
|199855| 33722|   0.0|  0|
|199855|274161|   0.0|  1|
|199855|113360|   0.0|  2|
|199855|149962|   0.0|  3|
|199855|155264|   0.0|  4|
+------+------+------+---+
only showing top 5 rows



#### `track`

In [38]:
train_hierarchy_track = train_ids.join(track.withColumnRenamed('trackId', 'itemId'), 'itemId', 'leftsemi')
train_hierarchy_track = train_hierarchy_track.withColumnRenamed('itemId', 'trackId').withColumnRenamed('rating', 'track_rating')

In [39]:
train_hierarchy_track.select('userId', 'trackId', 'track_rating', 'id').show(5)

+------+-------+------------+---+
|userId|trackId|track_rating| id|
+------+-------+------------+---+
|199855| 146264|       100.0| 28|
|199855|  38015|       100.0| 33|
|199855|  18433|       100.0| 36|
|199855|  34230|       100.0| 37|
|199855|  60468|       100.0| 38|
+------+-------+------------+---+
only showing top 5 rows



#### `album`

In [40]:
train_hierarchy_album = train_ids.join(album.withColumnRenamed('albumId', 'itemId'), 'itemId', 'leftsemi')
train_hierarchy_album = train_hierarchy_album.withColumnRenamed('itemId', 'albumId').withColumnRenamed('rating', 'album_rating')

In [41]:
train_hierarchy_album.select('userId', 'albumId', 'album_rating', 'id').show(5)

+------+-------+------------+---+
|userId|albumId|album_rating| id|
+------+-------+------------+---+
|199855|  66885|         0.0| 23|
|199855| 227229|       100.0| 35|
|199855|   8685|       100.0| 45|
|199855| 278330|       100.0| 56|
|199855| 190393|        90.0| 97|
+------+-------+------------+---+
only showing top 5 rows



#### `artist`

In [42]:
train_hierarchy_artist = train_ids.join(artist.withColumnRenamed('artistId', 'itemId'), 'itemId', 'leftsemi')
train_hierarchy_artist = train_hierarchy_artist.withColumnRenamed('itemId', 'artistId').withColumnRenamed('rating', 'artist_rating')

In [43]:
train_hierarchy_artist.select('userId', 'artistId' , 'artist_rating', 'id').show(5)

+------+--------+-------------+---+
|userId|artistId|artist_rating| id|
+------+--------+-------------+---+
|199855|  154852|         70.0| 16|
|199855|  163015|          0.0| 18|
|199855|  285087|          0.0| 19|
|199855|   44857|          0.0| 20|
|199855|  212235|          0.0| 21|
+------+--------+-------------+---+
only showing top 5 rows



#### `genre`

In [107]:
train_hierarchy_genre = train_ids.join(genre.withColumnRenamed('genreId', 'itemId'), 'itemId', 'leftsemi')
train_hierarchy_genre = train_hierarchy_genre.withColumnRenamed('itemId', 'genreId').withColumnRenamed('rating', 'genre_rating')

In [108]:
train_hierarchy_genre.select('userId', 'genreId' , 'genre_rating', 'id').show(5)

+------+-------+------------+---+
|userId|genreId|genre_rating| id|
+------+-------+------------+---+
|199855|  33722|         0.0|  0|
|199855| 274161|         0.0|  1|
|199855| 113360|         0.0|  2|
|199855| 149962|         0.0|  3|
|199855| 155264|         0.0|  4|
+------+-------+------------+---+
only showing top 5 rows



### Join together:
* `train_hierarchy_track`
* `train_hierarchy_album` 
* `train_hierarchy_artist` 

to make `train_hierarchy` dataframe

In [119]:
train_hierarchy = train_ids.select('id', 'userId')

train_hierarchy = train_hierarchy.join(train_hierarchy_track.select('id', 'track_rating', 'trackId'), 'id', 'left')
train_hierarchy = train_hierarchy.join(train_hierarchy_artist.select('id', 'artist_rating', 'artistId'), 'id', 'left')
train_hierarchy = train_hierarchy.join(train_hierarchy_genre.select('id', 'genre_rating', 'genreId'), 'id', 'left')

In [120]:
train_hierarchy.show(5)

+---+------+------------+-------+-------------+--------+------------+-------+
| id|userId|track_rating|trackId|artist_rating|artistId|genre_rating|genreId|
+---+------+------------+-------+-------------+--------+------------+-------+
|  0|199855|        null|   null|         null|    null|         0.0|  33722|
|  1|199855|        null|   null|         null|    null|         0.0| 274161|
|  2|199855|        null|   null|         null|    null|         0.0| 113360|
|  3|199855|        null|   null|         null|    null|         0.0| 149962|
|  4|199855|        null|   null|         null|    null|         0.0| 155264|
+---+------+------------+-------+-------------+--------+------------+-------+
only showing top 5 rows



## Stop Spark Session

In [39]:
spark.stop()