In [1]:
#create a spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Trabalho prático").getOrCreate()

#print the spark version
print(spark.version)


3.5.0


### Lendo o arquivo de metadados do filme

In [2]:
df_titles = spark.read.csv('/home/jovyan/data/title_basics.tsv', \
                            header=True, \
                            inferSchema=True, \
                            sep='\t')

#print the dataframe schema
df_titles.printSchema()


root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [3]:
#show me the dataframe df_titles
df_titles.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [4]:
df_ratings = spark.read.csv('/home/jovyan/data/title_ratings.tsv', \
                            header=True, sep='\t')
df_ratings.show(5)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1809|
|tt0000002|          6.0|     233|
|tt0000003|          6.5|    1560|
|tt0000004|          6.1|     152|
|tt0000005|          6.2|    2383|
+---------+-------------+--------+
only showing top 5 rows



In [5]:
#join the dataframes df_titles and df_ratings by key tconst
movies = df_titles.join(df_ratings, on='tconst', how='left')

#show me df_films
movies.show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+---------------+-------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|         genres|averageRating|numVotes|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+---------------+-------------+--------+
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|Animation,Short|          6.0|     233|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|Animation,Short|          6.1|     152|
|tt1198615|    video|       More Than You|       More Than You|      0|     2006|     \N|            85|          Drama|         NULL|    NULL|
|tt7800446|tvEpisode|      Episode #1.142|      Episode #1.142|      0|     2005|     \N|            \N|  Drama,Romance|         NULL|  

In [6]:
movies.count()

8203690

In [7]:
df_titles.filter(df_titles['startYear'] == 2015).count()

358054

In [8]:
#Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?
movies.filter(movies['startYear'] == 2015).count()

358054

In [9]:
#convert column genres of string to array format
from pyspark.sql.functions import split

# Assuming 'movies' is your DataFrame
movies = movies.withColumn("genres_array", split("genres", ","))

# Now 'genres_array' column contains the genres as an array of strings
movies.show()  # Display the DataFrame to verify the changes

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+--------------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|averageRating|numVotes|        genres_array|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+--------------------+
| tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|  Animation,Short|          6.0|     233|  [Animation, Short]|
| tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|  Animation,Short|          6.1|     152|  [Animation, Short]|
| tt0000008|    short|Edison Kinetoscop...|Edison Kinetoscop...|      0|     1894|     \N|             1|Documentary,Short|          5.5|    1965|[Documentary, 

In [10]:
from pyspark.sql import functions as F

# Define the function to get top movies
def get_top_movies(movies):
    movies = (
    movies.groupBy('primaryTitle', 'startYear')
    .agg(F.sum('numVotes').alias('totalNumVotes'), F.avg('averageRating').alias('avgRating'))
    .orderBy(F.desc('totalNumVotes'), F.desc('avgRating'))
    .limit(5)
)
    return movies


In [11]:
top_movies_df = get_top_movies(movies)
top_movies_df.show()

+--------------------+---------+-------------+-----------------+
|        primaryTitle|startYear|totalNumVotes|        avgRating|
+--------------------+---------+-------------+-----------------+
|The Shawshank Red...|     1994|    2449517.0|              9.3|
|     The Dark Knight|     2008|    2405322.0|             8.45|
|           Inception|     2010|    2157686.0|7.366666666666667|
|          Fight Club|     1999|    1930108.0|              8.8|
|        Pulp Fiction|     1994|    1898801.0|              8.9|
+--------------------+---------+-------------+-----------------+



In [12]:
# agrupando e ordenando por genero
from pyspark.sql.functions import explode, desc

def get_top_genres(movies):
    # Explode the 'genres' array column
    exploded_movies = movies.select(explode('genres_array').alias('genre'))
    
    # Group by the 'genre' column and count occurrences, then sort by the count in descending order
    top_genres = (
        exploded_movies
        .groupBy('genre')
        .count()
        .orderBy(desc('count'))
        .limit(5)
    )
    
    return top_genres


In [13]:
top_genres_df = get_top_genres(movies)
top_genres_df.show()

+-----------+-------+
|      genre|  count|
+-----------+-------+
|      Drama|2247995|
|     Comedy|1653725|
|      Short|1021850|
|  Talk-Show| 900198|
|Documentary| 764885|
+-----------+-------+



In [14]:
#dont truncate rows
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", -1)


movies.show(500, truncate=False)

+----------+------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-------+---------+-------+--------------+-----------------------------+-------------+--------+---------------------------------+
|tconst    |titleType   |primaryTitle                                                     |originalTitle                                                    |isAdult|startYear|endYear|runtimeMinutes|genres                       |averageRating|numVotes|genres_array                     |
+----------+------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-------+---------+-------+--------------+-----------------------------+-------------+--------+---------------------------------+
|tt0000002 |short       |Le clown et ses chiens                                           |Le clown et ses chiens                             

In [15]:
# agrupando e ordenando por genero com melhor nota média
def get_top_genres_rating(movies):
    movies = (
    movies.groupBy('genres_array')
    .agg(F.avg('averageRating').alias('avgRating'))
    .orderBy(F.desc('avgRating'))
    .limit(50)
)
    return movies

In [16]:
top_genres_rating = get_top_genres_rating(movies)
top_genres_rating.show(truncate=False)

+----------------------------------+-----------------+
|genres_array                      |avgRating        |
+----------------------------------+-----------------+
|[Comedy, History, Western]        |9.8              |
|[Biography, Comedy, Sport]        |9.8              |
|[News, Reality-TV, Short]         |9.7              |
|[Music, Musical, Talk-Show]       |9.409756097560974|
|[Adventure, Music, Mystery]       |9.406666666666666|
|[Comedy, Game-Show, Musical]      |9.290476190476191|
|[Biography, Crime, Reality-TV]    |9.25             |
|[Biography, Reality-TV, Talk-Show]|9.209999999999999|
|[Comedy, Fantasy, Game-Show]      |9.2              |
|[Action, Reality-TV, Western]     |9.2              |
|[Horror, News, Talk-Show]         |9.2              |
|[Music, War]                      |9.2              |
|[Fantasy, Talk-Show]              |9.1              |
|[Mystery, Sci-Fi, Talk-Show]      |9.1              |
|[History, News]                   |9.0              |
|[Game-Sho

In [17]:
movies.show(50, truncate=False)

+----------+------------+-----------------------------------------------------------+-------------------------------------------------+-------+---------+-------+--------------+-----------------------+-------------+--------+---------------------------+
|tconst    |titleType   |primaryTitle                                               |originalTitle                                    |isAdult|startYear|endYear|runtimeMinutes|genres                 |averageRating|numVotes|genres_array               |
+----------+------------+-----------------------------------------------------------+-------------------------------------------------+-------+---------+-------+--------------+-----------------------+-------------+--------+---------------------------+
|tt0000002 |short       |Le clown et ses chiens                                     |Le clown et ses chiens                           |0      |1892     |\N     |5             |Animation,Short        |6.0          |233     |[Animation, Short]   

In [18]:
#Qual o vídeo game do gênero aventura mais bem avaliado em 2020?
def get_adventure_genre_most_rating(movies):
    movies = (
    movies.groupBy('genres_array'.where(F.col('genres') == 'Adventure'\
                                    & F.col('startYear') == 2020))
    .agg(F.avg('averageRating').alias('avgRating'))
    .orderBy(F.desc('avgRating'))
    .limit(5)
)
    return movies

In [19]:
from pyspark.sql.functions import col

def get_adventure_genre_most_rating(movies):
    adventure_2020 = (
        movies.filter((col('titleType') == 'videoGame') & 
                      (col('startYear') == 2020))
        .groupBy('primaryTitle', 'genres_array')
        .agg(F.avg('averageRating').alias('avgRating'))
        .orderBy(F.desc('avgRating'))
        .limit(100)
    )
    return adventure_2020



In [20]:
top_games_adventure = get_adventure_genre_most_rating(movies)
top_games_adventure.show(truncate=False)

+---------------------------------------------------+------------------------------+---------+
|primaryTitle                                       |genres_array                  |avgRating|
+---------------------------------------------------+------------------------------+---------+
|Half-Life: Alyx                                    |[Action, Adventure, Horror]   |9.5      |
|Ghost of Tsushima                                  |[Action, Adventure, Drama]    |9.3      |
|Omori                                              |[Adventure, Drama, Fantasy]   |9.2      |
|Ori and the Will of the Wisps                      |[Adventure, Fantasy]          |9.1      |
|Final Fantasy VII Remake                           |[Action, Adventure, Fantasy]  |9.1      |
|There Is No Game: Wrong Dimension                  |[Adventure, Comedy]           |8.9      |
|Mega Man Zero/ZX Legacy Collection                 |[Action, Adventure, Sci-Fi]   |8.9      |
|Blaseball                                        

In [21]:
#Quantos títulos de filmes diferentes existem? Use
df_titles.select('primaryTitle').distinct().count()

3931670

In [22]:
movies.show(500, truncate=False)

+----------+------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-------+---------+-------+--------------+-----------------------------+-------------+--------+---------------------------------+
|tconst    |titleType   |primaryTitle                                                     |originalTitle                                                    |isAdult|startYear|endYear|runtimeMinutes|genres                       |averageRating|numVotes|genres_array                     |
+----------+------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-------+---------+-------+--------------+-----------------------------+-------------+--------+---------------------------------+
|tt0000002 |short       |Le clown et ses chiens                                           |Le clown et ses chiens                             

In [23]:
#Qual a duração média dos filmes com conteúdo adulto?
# Use uma combinação de filter() e describe().
movies_adult = df_titles.filter(col('genres') == 'Adult') \
                     .agg(F.avg('runtimeMinutes').alias('avg_runtime'))
movies_adult.show()

+-----------------+
|      avg_runtime|
+-----------------+
|95.77505098844262|
+-----------------+



In [24]:
#Qual a duração média dos filmes com conteúdo adulto?
# Use uma combinação de filter() e describe().
movies_adult = movies.filter(col('genres') == 'Adult') \
                     .agg(F.avg('runtimeMinutes').alias('avg_runtime'))
movies_adult.show()

+-----------------+
|      avg_runtime|
+-----------------+
|95.77505098844262|
+-----------------+



In [25]:
#dont truncate rows
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", -1)

movies_adult2 = movies.filter(col('genres') == 'Adult')

In [26]:
avg = movies_adult2.describe()
avg.show()

+-------+---------+---------+--------------------+--------------------+-------------------+------------------+------------------+-----------------+------+------------------+------------------+
|summary|   tconst|titleType|        primaryTitle|       originalTitle|            isAdult|         startYear|           endYear|   runtimeMinutes|genres|     averageRating|          numVotes|
+-------+---------+---------+--------------------+--------------------+-------------------+------------------+------------------+-----------------+------+------------------+------------------+
|  count|   221116|   221116|              221116|              221116|             221116|            221116|            221116|           221116|221116|             13527|             13527|
|   mean|     NULL|     NULL|            Infinity|            Infinity| 0.9974402576023444|2010.4815295586066|2014.7289377289378|95.77505098844262|  NULL| 6.463310416204626|34.062467657278034|
| stddev|     NULL|     NULL|      

In [27]:
from pyspark.sql.functions import col

# Assuming df_titles is your DataFrame
df_titles.filter(col('primaryTitle') != col('originalTitle')).count()

125056

In [36]:
#Qual o filme que tem o nome mais longo? Dica: consulte
#https://sparkbyexamples.com/spark/spark-using-length-size-of-a-dataframe-column/
#Links to an external site.e use algo como df_titles.orderBy(length(col("primaryTitle")).desc()).
#Create new column with the length of existing string column
from pyspark.sql.functions import col, length, trim

# Assuming df_titles is your DataFrame
df_titles.select(
    col("tconst"),
    col("primaryTitle"),
    length(col("primaryTitle")).alias("len_col"),
    length(trim(col("primaryTitle"))).alias("trim_len_col")
).orderBy(col("len_col").desc()).show(20, truncate=False)


+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+
|tconst    |primaryTitle                                                                                                                                                                                                                                                                                                                                                                                                                       |len_col|trim_len_col|
+----------+--------------------------------------------------------------------------------

In [45]:
movies.select(col('tconst'),col('numVotes'),col('primaryTitle')) \
    .orderBy(col('numVotes').desc()).show(5)

+---------+--------+--------------------+
|   tconst|numVotes|        primaryTitle|
+---------+--------+--------------------+
|tt0110570|    9999|         The Monster|
|tt0110889|    9998|              Priest|
|tt5456546|    9996|            Judwaa 2|
|tt0066995|   99943|Diamonds Are Forever|
|tt0052902|    9992|  The Horse Soldiers|
+---------+--------+--------------------+
only showing top 5 rows



In [40]:
#Qual filme tem a maior quantidade de votos? Dica: Use describe().
movies.select("tconst", "numVotes").describe().show()

+-------+---------+-----------------+
|summary|   tconst|         numVotes|
+-------+---------+-----------------+
|  count|  8203690|          1182639|
|   mean|     NULL|973.0778656885153|
| stddev|     NULL|16275.70904325849|
|    min|tt0000001|               10|
|    max|tt9916880|             9999|
+-------+---------+-----------------+



In [51]:
#Qual é a menor nota média de um filme? Use describe().
from pyspark.sql.functions import col

# Filtering out null averageRating and ordering by ascending values
movies.filter(col('averageRating').isNotNull()) \
    .orderBy(col('averageRating').asc()) \
    .select('primaryTitle', 'averageRating') \
    .show(20, truncate=False)


+-------------------------------------------+-------------+
|primaryTitle                               |averageRating|
+-------------------------------------------+-------------+
|Episode dated 15 September 2006            |1.0          |
|Episode dated 12 May 2004                  |1.0          |
|Weird Norwegian                            |1.0          |
|I Am Autism                                |1.0          |
|That Hogan Man                             |1.0          |
|Best Hellraiser                            |1.0          |
|Episode #3.130                             |1.0          |
|Steckler Interviews                        |1.0          |
|Joan Crawford/ Lili Valenty                |1.0          |
|Tashibanasan onna ni naru!?                |1.0          |
|Girls Will Be Girls                        |1.0          |
|DreamGirls: Real Adventures 37             |1.0          |
|Blood of the Undead: The Unwanted          |1.0          |
|Performance Animation: Catch 'Em in the