In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Movies data processing").getOrCreate()

In [2]:
from pyspark.sql.types import  *
from pyspark.sql.functions import col

In [3]:
import numpy as np

In [4]:
movies = spark.read.csv("datasets/movies_metadata.csv", header = True) 

In [5]:
movies.dtypes

[('adult', 'string'),
 ('belongs_to_collection', 'string'),
 ('budget', 'string'),
 ('genres', 'string'),
 ('homepage', 'string'),
 ('id', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('poster_path', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('spoken_languages', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'string'),
 ('vote_count', 'string')]

In [6]:
schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

In [7]:
from pyspark.sql.functions import from_json

In [8]:
movies = movies.withColumn("adult", (movies.adult).cast("Boolean"))\
         .withColumn("budget", (movies.budget).cast("Integer"))\
         .withColumn("genres", from_json(movies.genres, schema))

In [35]:
movies.count()

45572

In [36]:
movies = movies.filter(movies.genres.isNotNull())

In [37]:
movies.count()

45378

In [38]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<struct<id:int,name:string>>'),
 ('homepage', 'string'),
 ('id', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('poster_path', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('spoken_languages', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'string'),
 ('vote_count', 'string')]

In [72]:
movies.select(["id","genres"]).show(10, truncate = False)

+-----+-------------------------------------------------------------+
|id   |genres                                                       |
+-----+-------------------------------------------------------------+
|862  |[[16, Animation], [35, Comedy], [10751, Family]]             |
|8844 |[[12, Adventure], [14, Fantasy], [10751, Family]]            |
|15602|[[10749, Romance], [35, Comedy]]                             |
|31357|[[35, Comedy], [18, Drama], [10749, Romance]]                |
|11862|[[35, Comedy]]                                               |
|949  |[[28, Action], [80, Crime], [18, Drama], [53, Thriller]]     |
|11860|[[35, Comedy], [10749, Romance]]                             |
|45325|[[28, Action], [12, Adventure], [18, Drama], [10751, Family]]|
|9091 |[[28, Action], [12, Adventure], [53, Thriller]]              |
|710  |[[12, Adventure], [28, Action], [53, Thriller]]              |
+-----+-------------------------------------------------------------+
only showing top 10 

In [73]:
genres = movies.select(["id","genres"])
rdd=genres.rdd

In [74]:
rdd.collect()

[Row(id='862', genres=[Row(id=16, name='Animation'), Row(id=35, name='Comedy'), Row(id=10751, name='Family')]),
 Row(id='8844', genres=[Row(id=12, name='Adventure'), Row(id=14, name='Fantasy'), Row(id=10751, name='Family')]),
 Row(id='15602', genres=[Row(id=10749, name='Romance'), Row(id=35, name='Comedy')]),
 Row(id='31357', genres=[Row(id=35, name='Comedy'), Row(id=18, name='Drama'), Row(id=10749, name='Romance')]),
 Row(id='11862', genres=[Row(id=35, name='Comedy')]),
 Row(id='949', genres=[Row(id=28, name='Action'), Row(id=80, name='Crime'), Row(id=18, name='Drama'), Row(id=53, name='Thriller')]),
 Row(id='11860', genres=[Row(id=35, name='Comedy'), Row(id=10749, name='Romance')]),
 Row(id='45325', genres=[Row(id=28, name='Action'), Row(id=12, name='Adventure'), Row(id=18, name='Drama'), Row(id=10751, name='Family')]),
 Row(id='9091', genres=[Row(id=28, name='Action'), Row(id=12, name='Adventure'), Row(id=53, name='Thriller')]),
 Row(id='710', genres=[Row(id=12, name='Adventure'), R

In [75]:
movie_genre = rdd.flatMap(lambda r: map(lambda g: (r.id, g.id), r.genres))
genre = rdd.flatMap(lambda r: r.genres)

In [76]:
movie_genre.count()

90881

In [77]:
genre.count()

90881

In [78]:
movie_genre.collect()

[('862', 16),
 ('862', 35),
 ('862', 10751),
 ('8844', 12),
 ('8844', 14),
 ('8844', 10751),
 ('15602', 10749),
 ('15602', 35),
 ('31357', 35),
 ('31357', 18),
 ('31357', 10749),
 ('11862', 35),
 ('949', 28),
 ('949', 80),
 ('949', 18),
 ('949', 53),
 ('11860', 35),
 ('11860', 10749),
 ('45325', 28),
 ('45325', 12),
 ('45325', 18),
 ('45325', 10751),
 ('9091', 28),
 ('9091', 12),
 ('9091', 53),
 ('710', 12),
 ('710', 28),
 ('710', 53),
 ('9087', 35),
 ('9087', 18),
 ('9087', 10749),
 ('12110', 35),
 ('12110', 27),
 ('21032', 10751),
 ('21032', 16),
 ('21032', 12),
 ('10858', 36),
 ('10858', 18),
 ('1408', 28),
 ('1408', 12),
 ('524', 18),
 ('524', 80),
 ('4584', 18),
 ('4584', 10749),
 ('5', 80),
 ('5', 35),
 ('9273', 80),
 ('9273', 35),
 ('9273', 12),
 ('11517', 28),
 ('11517', 35),
 ('11517', 80),
 ('8012', 35),
 ('8012', 53),
 ('8012', 80),
 ('1710', 18),
 ('1710', 53),
 ('9691', 28),
 ('9691', 12),
 ('9691', 80),
 ('9691', 53),
 ('12665', 18),
 ('12665', 14),
 ('12665', 878),
 ('12

In [79]:
genre = genre.map(tuple)

In [80]:
genre.collect()

[(16, 'Animation'),
 (35, 'Comedy'),
 (10751, 'Family'),
 (12, 'Adventure'),
 (14, 'Fantasy'),
 (10751, 'Family'),
 (10749, 'Romance'),
 (35, 'Comedy'),
 (35, 'Comedy'),
 (18, 'Drama'),
 (10749, 'Romance'),
 (35, 'Comedy'),
 (28, 'Action'),
 (80, 'Crime'),
 (18, 'Drama'),
 (53, 'Thriller'),
 (35, 'Comedy'),
 (10749, 'Romance'),
 (28, 'Action'),
 (12, 'Adventure'),
 (18, 'Drama'),
 (10751, 'Family'),
 (28, 'Action'),
 (12, 'Adventure'),
 (53, 'Thriller'),
 (12, 'Adventure'),
 (28, 'Action'),
 (53, 'Thriller'),
 (35, 'Comedy'),
 (18, 'Drama'),
 (10749, 'Romance'),
 (35, 'Comedy'),
 (27, 'Horror'),
 (10751, 'Family'),
 (16, 'Animation'),
 (12, 'Adventure'),
 (36, 'History'),
 (18, 'Drama'),
 (28, 'Action'),
 (12, 'Adventure'),
 (18, 'Drama'),
 (80, 'Crime'),
 (18, 'Drama'),
 (10749, 'Romance'),
 (80, 'Crime'),
 (35, 'Comedy'),
 (80, 'Crime'),
 (35, 'Comedy'),
 (12, 'Adventure'),
 (28, 'Action'),
 (35, 'Comedy'),
 (80, 'Crime'),
 (35, 'Comedy'),
 (53, 'Thriller'),
 (80, 'Crime'),
 (18, 'Dr

In [81]:
genre = genre.reduceByKey(lambda a, b : a)

In [82]:
genre.count()

51

In [83]:
genre.collect()

[(16, 'Animation'),
 (12, 'Adventure'),
 (28, 'Action'),
 (80, 'Crime'),
 (36, 'History'),
 (9648, 'Mystery'),
 (10752, 'War'),
 (6308, 'U.S. Office of War Information'),
 (11176, 'Carousel Productions'),
 (29812, 'Telescene Film Group Productions'),
 (7760, 'BROSTA TV'),
 (8424, 'Fides Films'),
 (18012, 'Pulser Productions'),
 (10749, 'Romance'),
 (53, 'Thriller'),
 (10769, 'Foreign'),
 (37, 'Western'),
 (2957, 'Double Play'),
 (13993, 'Meadway-Claude Productions Company'),
 (8453, 'Télévision Suisse-Romande (TSR)'),
 (7761, 'Mardock Scramble Production Committee'),
 (7105, 'Mwana Productions'),
 (17161, 'Odyssey Media'),
 (18013, 'Rogue State'),
 (14, 'Fantasy'),
 (18, 'Drama'),
 (878, 'Science Fiction'),
 (10402, 'Music'),
 (10770, 'TV Movie'),
 (9386, 'First Motion Pictures Unit US Army Air Forces'),
 (11602, 'Vision View Entertainment'),
 (60250, 'WhiteFlame Productions'),
 (7850, 'Sciapode'),
 (34034, 'LStar Capital'),
 (7106, 'Sylicone'),
 (23822, 'The Cartel'),
 (11778, 'F Comm

In [84]:
genres = genre.sortByKey(ascending=True)