In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import os
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
DISPLAY = False # True/False para ver os displays

In [8]:
links_path = '../input/links.csv' # usando small pra ir mais rapido
df_links = spark.read.csv(links_path, header=True)

ratings_path = '../input/ratings.csv'# usando small pra ir mais rapido
df_ratings = spark.read.csv(ratings_path, header=True)

movies_metadata_path = '../input/movies_metadata.csv'
df_movies = spark.read.csv(movies_metadata_path, header=True)

keywords_path = '../input/keywords.csv'
df_keywords = spark.read.csv(keywords_path, header=True)

credits_path = '../input/credits.csv'
df_credits = spark.read.csv(credits_path, header=True)

In [3]:
df_movies.show(truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+-----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------+-----+---------+-----------------+------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------+-----------------------------------------------------------------------------------------------

In [3]:
schema = T.ArrayType(T.StructType([
    T.StructField("id", T.StringType(), True),
    T.StructField("name", T.StringType(), True)
]))

w = Window().partitionBy('genres_explode.id')

df_genero = (
  df_movies
  .withColumn("genres_array", F.from_json("genres", schema))
  .withColumn("imdbId", F.substring(F.col("imdb_id"), 3, 1000))# tirando tt
  .filter(F.col('genres_array.name').isNotNull())
  .withColumn("genres_explode", F.explode("genres_array"))
  .withColumn('num_aparicao', F.count("*").over(w))
  .where(F.col("num_aparicao") > 2)
  .selectExpr('imdbId', 'genres_explode.name as genero')
  .groupBy("imdbId") 
  .pivot("genero") 
  .agg(F.lit(1)) 
  .na.fill(0)
)

                                                                                

In [4]:
df_genero.show(truncate=False)



+-------+------+---------+---------+------+-----+-----------+-----+------+-------+-------+-------+------+-------------+-----+-------+-------+---------------+--------+--------+---+-------+
|imdbId |Action|Adventure|Animation|Comedy|Crime|Documentary|Drama|Family|Fantasy|Foreign|History|Horror|Miramax Films|Music|Mystery|Romance|Science Fiction|TV Movie|Thriller|War|Western|
+-------+------+---------+---------+------+-----+-----------+-----+------+-------+-------+-------+------+-------------+-----+-------+-------+---------------+--------+--------+---+-------+
|2114375|0     |0        |0        |1     |0    |0          |0    |0     |0      |0      |0      |0     |0            |0    |0      |0      |0              |0       |0       |0  |0      |
|0089114|0     |1        |0        |0     |0    |0          |0    |1     |1      |0      |0      |0     |0            |0    |0      |1      |1              |0       |0       |0  |0      |
|2900822|0     |0        |0        |0     |0    |0          

                                                                                

In [33]:
df_adult.printSchema()


root
 |-- imdb_id: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- adult: integer (nullable = false)
 |-- title: string (nullable = true)
 |-- release_date: date (nullable = true)



In [34]:
df_preprocess = (
    df_movies
    .withColumn("imdbId", F.substring(F.col("imdb_id"), 3, 1000))# tirando tt
    .withColumn("adult", F.when(F.col("adult") == True, 1).otherwise(0))
    .withColumn("is_valid_title", F.when((F.col("title").cast(T.StringType()).isNotNull()) & (~F.col("title").rlike(r"[\[\]\{\}]")) & (~F.col("title").rlike(r"^\d+(\.\d+)?$")), True).otherwise(False))
    .filter(F.col("is_valid_title") == True)
    .withColumn("is_valid_overview", F.when((F.col("overview").cast(T.StringType()).isNotNull()) & (~F.col("overview").rlike(r"[\[\]\{\}]")) & (~F.col("overview").rlike(r"^\d+(\.\d+)?$")) & (F.length(F.col("overview")) >= 10), True).otherwise(False))
    .filter(F.col("is_valid_overview") == True)
    .withColumn("release_date", F.to_date(F.col("release_date"), "yyyy-MM-dd"))
    .withColumn('popularity', F.log1p(F.col('popularity').cast("float")))
    .withColumn('vote_average', F.col('vote_average').cast("float"))
    .dropna()
    .filter((F.col('vote_average') >= 0) & (F.col('vote_average') <= 10))
    .withColumn('movie_overall_grade', F.round((F.col("vote_average")/10),2))
    .withColumn("movie_popularity", F.round(F.when(F.col("popularity")> 3, F.lit(1)).otherwise(F.col("popularity")/3),4))
    .select('imdbId', 'overview', 'adult', 'title', 'release_date',"movie_popularity","movie_overall_grade")
    .join(df_genero, on=["imdbId"], how="inner")
)

    


In [35]:
df_adult.show(truncate=False)

24/08/26 21:28:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+---------------------------------------+------------+----------------+-------------------+
|imdbId |overview                                                                                                                                                                                                                                                                                                                                                                                                         |adult|title                                  |release_date|movie_popularity|movie

In [5]:
print(df_movies.columns)


['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']
