In [1]:
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

!head -5 /content/drive/MyDrive/data/IMDb_movies.csv
!head -5 /content/drive/MyDrive/data/IMDb_ratings.csv
!head -5 /content/drive/MyDrive/data/netflix_dataset.csv



imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,None,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey Depew",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1,2
tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,26-12-1906,"Biography, Crime, Drama",70,Australia,None,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Bella Cola, Will Coyne, Sam Crewes, Jack Ennis, John Forde, Vera Linden, Mr. Marshall, Mr. McKenzie, Frank Mills, Ollie Wilson",True story of notorious Australian outlaw Ned Kelly (1855-80).,6.1,589,$ 2250,,,,7,7
tt0001892,Den sorte drøm,Den sorte drøm,1911,19-08-1911,Drama,53,"Germany,

In [5]:
movie_schema = StructType([
        StructField("imdb_title_id", StringType(), True),
        StructField("title", StringType(), True),
        StructField("original_title", StringType(), True),
        StructField("year", IntegerType(), True),
        StructField("date_published", StringType(), True),
        StructField("genre", StringType(), True),
        StructField("duration", IntegerType(), True),
        StructField("country", StringType(), True),
        StructField("language", StringType(), True),
        StructField("director", StringType(), True)])

df_movie = spark.read.csv('/content/drive/MyDrive/data/IMDb_movies.csv', sep=',',schema = movie_schema, header=True)

df_movie.show()

+-------------+--------------------+--------------------+----+--------------+--------------------+--------+----------------+---------------+--------------------+
|imdb_title_id|               title|      original_title|year|date_published|               genre|duration|         country|       language|            director|
+-------------+--------------------+--------------------+----+--------------+--------------------+--------+----------------+---------------+--------------------+
|    tt0000009|          Miss Jerry|          Miss Jerry|1894|    1894-10-09|             Romance|      45|             USA|           None|     Alexander Black|
|    tt0000574|The Story of the ...|The Story of the ...|1906|    26-12-1906|Biography, Crime,...|      70|       Australia|           None|        Charles Tait|
|    tt0001892|      Den sorte drøm|      Den sorte drøm|1911|    19-08-1911|               Drama|      53|Germany, Denmark|           null|           Urban Gad|
|    tt0002101|           Cl

In [6]:
ratings_schema = StructType([
        StructField("imdb_title_id", StringType(), True),
        StructField("weighted_average_vote", FloatType(), True),
        StructField("total_votes", IntegerType(), True),
        StructField("mean_vote", FloatType(), True),
        StructField("median_vote", IntegerType(), True),
        StructField("vote_10", IntegerType(), True),
        StructField("vote_9", IntegerType(), True),
        StructField("vote_8", IntegerType(), True),
        StructField("vote_7", IntegerType(), True),
        StructField("vote_6", IntegerType(), True)])
      
df_ratings = spark.read.csv('/content/drive/MyDrive/data/IMDb_ratings.csv', sep=',',schema = ratings_schema, header=True)         
df_ratings.show()

+-------------+---------------------+-----------+---------+-----------+-------+------+------+------+------+
|imdb_title_id|weighted_average_vote|total_votes|mean_vote|median_vote|vote_10|vote_9|vote_8|vote_7|vote_6|
+-------------+---------------------+-----------+---------+-----------+-------+------+------+------+------+
|    tt0000009|                  5.9|        154|      5.9|          6|     12|     4|    10|    43|    28|
|    tt0000574|                  6.1|        589|      6.3|          6|     57|    18|    58|   137|   139|
|    tt0001892|                  5.8|        188|      6.0|          6|      6|     6|    17|    44|    52|
|    tt0002101|                  5.2|        446|      5.3|          5|     15|     8|    16|    62|    98|
|    tt0002130|                  7.0|       2237|      6.9|          7|    210|   225|   436|   641|   344|
|    tt0002199|                  5.7|        484|      5.8|          6|     33|    15|    48|    80|   123|
|    tt0002423|             

In [7]:
netflix_schema = StructType([
        StructField("show_id", StringType(), True),
        StructField("type", StringType(), True),
        StructField("title", StringType(), True),
        StructField("director", StringType(), True),
        StructField("cast", StringType(), True),
        StructField("country", StringType(), True),
        StructField("date_added", StringType(), True),
        StructField("release_year", IntegerType(), True),
        StructField("rating", StringType(), True),
        StructField("duration", StringType(), True)])
      
df_netflix = spark.read.csv('/content/drive/MyDrive/data/netflix_dataset.csv', sep=',',schema = netflix_schema, header=True)    
df_netflix.show()

+-------+-------+------+--------------------+--------------------+--------------------+-----------------+------------+------+---------+
|show_id|   type| title|            director|                cast|             country|       date_added|release_year|rating| duration|
+-------+-------+------+--------------------+--------------------+--------------------+-----------------+------------+------+---------+
|     s1|TV Show|    3%|                null|João Miguel, Bian...|              Brazil|  August 14, 2020|        2020| TV-MA|4 Seasons|
|     s2|  Movie| 07:19|   Jorge Michel Grau|Demián Bichir, Hé...|              Mexico|December 23, 2016|        2016| TV-MA|   93 min|
|     s3|  Movie| 23:59|        Gilbert Chan|Tedd Chan, Stella...|           Singapore|December 20, 2018|        2011|     R|   78 min|
|     s4|  Movie|     9|         Shane Acker|Elijah Wood, John...|       United States|November 16, 2017|        2009| PG-13|   80 min|
|     s5|  Movie|    21|      Robert Luketic|Jim

In [8]:
df_movie_model=df_movie.select('imdb_title_id', 'title')
df_movie_model.show( truncate= False)

+-------------+---------------------------------------------------+
|imdb_title_id|title                                              |
+-------------+---------------------------------------------------+
|tt0000009    |Miss Jerry                                         |
|tt0000574    |The Story of the Kelly Gang                        |
|tt0001892    |Den sorte drøm                                     |
|tt0002101    |Cleopatra                                          |
|tt0002130    |L'Inferno                                          |
|tt0002199    |From the Manger to the Cross; or, Jesus of Nazareth|
|tt0002423    |Madame DuBarry                                     |
|tt0002445    |Quo Vadis?                                         |
|tt0002452    |Independenta Romaniei                              |
|tt0002461    |Richard III                                        |
|tt0002646    |Atlantis                                           |
|tt0002844    |Fantômas - À l'ombre de la guillo

In [44]:

df_ratings_model=df_ratings.select('imdb_title_id', 'weighted_average_vote')
df_ratings_model.show( truncate= False)

# this is where I joined the data frames, movie_ratings and imdb_rating, and dropped the duplicate imdb_title_id column that was created with the join 
# union the above created dataframes
result = df_ratings_model.join(df_movie_model,df_ratings_model.imdb_title_id ==  df_movie_model.imdb_title_id,"inner").drop(df_movie_model.imdb_title_id)


result.show()

+-------------+---------------------+
|imdb_title_id|weighted_average_vote|
+-------------+---------------------+
|tt0000009    |5.9                  |
|tt0000574    |6.1                  |
|tt0001892    |5.8                  |
|tt0002101    |5.2                  |
|tt0002130    |7.0                  |
|tt0002199    |5.7                  |
|tt0002423    |6.8                  |
|tt0002445    |6.2                  |
|tt0002452    |6.7                  |
|tt0002461    |5.5                  |
|tt0002646    |6.6                  |
|tt0002844    |7.0                  |
|tt0003014    |7.1                  |
|tt0003037    |7.0                  |
|tt0003102    |6.2                  |
|tt0003131    |6.5                  |
|tt0003165    |7.0                  |
|tt0003167    |5.8                  |
|tt0003419    |6.5                  |
|tt0003471    |6.0                  |
+-------------+---------------------+
only showing top 20 rows

+-------------+---------------------+--------------------+
|im

In [52]:
# this is where I dropped the letters 'tt' at the beginning of the imdb_title_id column and created the imdb_title_id_new

from pyspark.sql.functions import *
import pyspark.sql.functions as F
 
df_replace = result.withColumn('imdb_title_id_new', F.regexp_replace('imdb_title_id', r't', ''))

df_replace.show() 

# this is where I casted the imdb_title_id_new as an integer 

df_text_2_cast = df_replace.withColumn('imdb_title_id_new', col('imdb_title_id_new').cast(IntegerType()))
df_text_2_cast.printSchema()

# this is where I selected all the columns for the als model 

df_replace_2 = df_text_2_cast.select('imdb_title_id_new', 'weighted_average_vote', 'title')
df_replace_2.show()

+-------------+---------------------+--------------------+-----------------+
|imdb_title_id|weighted_average_vote|               title|imdb_title_id_new|
+-------------+---------------------+--------------------+-----------------+
|    tt0000009|                  5.9|          Miss Jerry|          0000009|
|    tt0000574|                  6.1|The Story of the ...|          0000574|
|    tt0001892|                  5.8|      Den sorte drøm|          0001892|
|    tt0002101|                  5.2|           Cleopatra|          0002101|
|    tt0002130|                  7.0|           L'Inferno|          0002130|
|    tt0002199|                  5.7|From the Manger t...|          0002199|
|    tt0002423|                  6.8|      Madame DuBarry|          0002423|
|    tt0002445|                  6.2|          Quo Vadis?|          0002445|
|    tt0002452|                  6.7|Independenta Roma...|          0002452|
|    tt0002461|                  5.5|         Richard III|          0002461|

In [46]:
# columns selected again - scroll down for als model attempt

df_replace_2 = df_replace.select('imdb_title_id_new', 'weighted_average_vote', 'title')
df_replace_2.show()

+-----------------+---------------------+--------------------+
|imdb_title_id_new|weighted_average_vote|               title|
+-----------------+---------------------+--------------------+
|          0000009|                  5.9|          Miss Jerry|
|          0000574|                  6.1|The Story of the ...|
|          0001892|                  5.8|      Den sorte drøm|
|          0002101|                  5.2|           Cleopatra|
|          0002130|                  7.0|           L'Inferno|
|          0002199|                  5.7|From the Manger t...|
|          0002423|                  6.8|      Madame DuBarry|
|          0002445|                  6.2|          Quo Vadis?|
|          0002452|                  6.7|Independenta Roma...|
|          0002461|                  5.5|         Richard III|
|          0002646|                  6.6|            Atlantis|
|          0002844|                  7.0|Fantômas - À l'om...|
|          0003014|                  7.1|Il calvario di

In [10]:
df_movie_ratings = df_ratings_model.join(df_movie_model, 'imdb_title_id',) \
                                                .select(df_movie_model['title'],  
                                                df_ratings_model['imdb_title_id'],
                                                df_ratings_model['weighted_average_vote'])
                                                
df_movie_ratings.show()

+--------------------+-------------+---------------------+
|               title|imdb_title_id|weighted_average_vote|
+--------------------+-------------+---------------------+
|          Miss Jerry|    tt0000009|                  5.9|
|The Story of the ...|    tt0000574|                  6.1|
|      Den sorte drøm|    tt0001892|                  5.8|
|           Cleopatra|    tt0002101|                  5.2|
|           L'Inferno|    tt0002130|                  7.0|
|From the Manger t...|    tt0002199|                  5.7|
|      Madame DuBarry|    tt0002423|                  6.8|
|          Quo Vadis?|    tt0002445|                  6.2|
|Independenta Roma...|    tt0002452|                  6.7|
|         Richard III|    tt0002461|                  5.5|
|            Atlantis|    tt0002646|                  6.6|
|Fantômas - À l'om...|    tt0002844|                  7.0|
|Il calvario di un...|    tt0003014|                  7.1|
|Juve contre Fantômas|    tt0003037|                  7.

In [11]:
df_netflix_model=df_netflix.select('title', 'type', 'rating')
df_netflix_model.show( truncate= False)

+------+-------+------+
|title |type   |rating|
+------+-------+------+
|3%    |TV Show|TV-MA |
|07:19 |Movie  |TV-MA |
|23:59 |Movie  |R     |
|9     |Movie  |PG-13 |
|21    |Movie  |PG-13 |
|46    |TV Show|TV-MA |
|122   |Movie  |TV-MA |
|187   |Movie  |R     |
|706   |Movie  |TV-14 |
|1920  |Movie  |TV-MA |
|1922  |Movie  |TV-MA |
|1983  |TV Show|TV-MA |
|1994  |TV Show|TV-MA |
|2,215 |Movie  |TV-MA |
|3022  |Movie  |R     |
|Oct-01|Movie  |TV-14 |
|Feb-09|TV Show|TV-14 |
|22-Jul|Movie  |R     |
|15-Aug|Movie  |TV-14 |
|'89   |Movie  |TV-PG |
+------+-------+------+
only showing top 20 rows



In [12]:
df_netflix_ratings = df_netflix_model.join(df_movie_ratings, 'title',) \
                                                .select(df_movie_ratings['weighted_average_vote'], 
                                                df_netflix_model['title'],
                                                df_netflix_model['type'])

df_netflix_rating_clean = df_netflix_ratings.na.drop()
df_netflix_rating_clean.show()                                         


+---------------------+---------------+-------+
|weighted_average_vote|          title|   type|
+---------------------+---------------+-------+
|                  6.4|Sherlock Holmes|  Movie|
|                  6.1|      Leap Year|  Movie|
|                  5.8|Sherlock Holmes|  Movie|
|                  6.0|       The Trap|  Movie|
|                  7.1|        Michael|  Movie|
|                  6.3|     The Circle|TV Show|
|                  6.2|    The Monster|  Movie|
|                  6.1|            She|TV Show|
|                  6.9|       The Show|  Movie|
|                  8.1|         Aurora|  Movie|
|                  5.6|       Marianne|TV Show|
|                  7.5|Animal Crackers|  Movie|
|                  6.2|     Borderline|TV Show|
|                  5.4|     Conspiracy|TV Show|
|                  5.2|  Runaway Bride|  Movie|
|                  7.5|        Dracula|TV Show|
|                  6.8|  Private Lives|TV Show|
|                  7.9|         Freaks| 

In [13]:
df_netflix_rating_clean.sort(df_netflix_rating_clean.weighted_average_vote.desc()).show(truncate=False)

+---------------------+-----------------------------+-------+
|weighted_average_vote|title                        |type   |
+---------------------+-----------------------------+-------+
|9.0                  |Breakout                     |TV Show|
|9.0                  |Innocent                     |TV Show|
|8.9                  |Schindler's List             |Movie  |
|8.9                  |Pulp Fiction                 |Movie  |
|8.8                  |Inception                    |Movie  |
|8.6                  |City of God                  |Movie  |
|8.6                  |Oththa Seruppu Size 7        |Movie  |
|8.6                  |Much Ado About Nothing       |Movie  |
|8.6                  |Ani... Dr. Kashinath Ghanekar|Movie  |
|8.6                  |Koshish                      |Movie  |
|8.6                  |Gol Maal                     |Movie  |
|8.6                  |Eh Janam Tumhare Lekhe       |Movie  |
|8.5                  |American History X           |Movie  |
|8.5    

In [14]:
df_netflix = df_netflix_rating_clean.sort(df_netflix_rating_clean.weighted_average_vote.asc()).show(truncate=False)


+---------------------+-------------------------------------------+-------+
|weighted_average_vote|title                                      |type   |
+---------------------+-------------------------------------------+-------+
|1.6                  |Pink                                       |Movie  |
|1.6                  |Aerials                                    |Movie  |
|1.6                  |Welcome to New York                        |Movie  |
|1.7                  |Himmatwala                                 |Movie  |
|1.9                  |End Game                                   |Movie  |
|1.9                  |Kyaa Kool Hain Hum 3                       |Movie  |
|1.9                  |The Vault                                  |Movie  |
|2.0                  |Killers                                    |Movie  |
|2.0                  |Battle                                     |Movie  |
|2.0                  |Jackpot                                    |Movie  |
|2.0        

In [53]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create test and train set
(training, test) = df_replace_2.randomSplit([0.8, 0.2], seed = 1234)
print (training.first())

Row(imdb_title_id_new=9, weighted_average_vote=5.900000095367432, title='Miss Jerry')


In [55]:
# this did not work - but I feel like we may need to index the titles ?
# Create ALS model
als = ALS(maxIter=5, regParam=0.01, userCol="title", itemCol="imdb_title_id_new", ratingCol="weighted_average_vote",
          coldStartStrategy="drop")
model = als.fit(training)

IllegalArgumentException: ignored