<h1>1) Which director's movies has the highest votes? and for that director, which year he has the highest votes?</h1>

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, max as spark_max

# Initialize Spark session
spark = SparkSession.builder.appName("DisneyAnalysis").getOrCreate()

# Load the Disney dataset
df = spark.read.csv("disney_cleaned_222.csv", header=True, inferSchema=True)

# Group by director and sum the votes
director_votes = df.groupBy("director").agg(spark_sum("imdb_votes").alias("total_votes"))

# Find the director with the highest total votes
top_director_row = director_votes.orderBy(col("total_votes").desc()).first()
top_director = top_director_row["director"]

# Filter the original dataset for the top director
top_director_df = df.filter(col("director") == top_director)

# Group by year and sum the votes for the top director
yearly_votes = top_director_df.groupBy("added_at").agg(spark_sum("imdb_votes").alias("yearly_votes"))

# Find the year with the highest votes for the top director
top_year_row = yearly_votes.orderBy(col("yearly_votes").desc()).first()
top_year = top_year_row["added_at"]
top_year_votes = top_year_row["yearly_votes"]

print(f"The director with the highest total votes is {top_director}.")
print(f"The year with the highest votes for {top_director} is {top_year} with {top_year_votes} votes.")



The director with the highest total votes is George Lucas.
The year with the highest votes for George Lucas is 2019-11-12 with 3182245 votes.


<h1>2) Total number of movies released in 1999, by director wise.</h1>

In [14]:

movies_1999 = df.filter(col("released_at").rlike("(?i)-99$"))

# Group by director and count the number of movies
movies_by_director = movies_1999.groupBy("director").count()

# Show the result
movies_by_director.show()


+--------------------+-----+
|            director|count|
+--------------------+-----+
|          Gil Junger|    1|
|       Maurice Joyce|    1|
|       David Kellogg|    1|
|        LeVar Burton|    1|
|Robert C. Ramirez...|    1|
|     Kenneth Johnson|    2|
|          Rod Daniel|    1|
|             Unknown|    3|
|         David Lynch|    1|
|         Steve Boyum|    1|
|         Greg Beeman|    1|
|      Duwayne Dunham|    1|
|John Lasseter, As...|    1|
|        Rob Marshall|    1|
|      Paul Schneider|    1|
|        George Lucas|    1|
|Jun Falkenstein, ...|    1|
|       Donald Petrie|    1|
+--------------------+-----+



<h1>3) Which year movies got highest awards? And it belongs to which director?</h1>

In [31]:
from pyspark.sql.functions import col, sum as spark_sum

# Create a new column for total awards
df_with_total = df.withColumn("total_awards", col("wins") + col("oscar_wins"))

# Filter out rows where director is 'unknown'
df_filtered = df_with_total.filter(col("director") != "Unknown")

# Group by release year and sum the total awards
awards_by_year = df_filtered.groupBy("released_at").agg(spark_sum("total_awards").alias("total_awards"))

# Find the year with the highest total awards
max_awards_row = awards_by_year.orderBy(col("total_awards").desc()).first()
max_awards_year = max_awards_row["released_at"]

# Filter the dataset for that year and get directors with non-zero total awards
directors_in_max_year = df_filtered.filter(
    (col("released_at") == max_awards_year) & (col("total_awards") > 0)
).select("director", "total_awards")

# Show results
print(f"The year with the highest total awards is: {max_awards_year}")
directors_in_max_year.show()



The year with the highest total awards is: 16-Feb-18
+------------+------------+
|    director|total_awards|
+------------+------------+
|Ryan Coogler|         111|
+------------+------------+



<h1>4) Total number of movies nominated for awards between 1998-2018?</h1>

In [41]:

# Add a new column for total nominations
df_with_total_nominations = df.withColumn("total_nominations",col("nominations") + col("oscar_nominations") + col("golden_globe_nominations"))

from pyspark.sql.functions import regexp_extract

# Extract year from 'released_at' and convert to full year
df_with_year = df_with_total_nominations.withColumn("release_year",regexp_extract("released_at", r"(\d{2})-(\w{3})-(\d{2})", 3).cast("int") + 1900).withColumn("release_year", col("release_year") + (100 * (col("release_year") < 1950).cast("int")))

# Filter for nominated movies between 1998 and 2018
nominated_movies = df_with_year.filter(
    (col("release_year") >= 1998) &
    (col("release_year") <= 2018) &
    (col("total_nominations").isNotNull()) &
    (col("total_nominations") > 0)
)

# Count the total number of such movies
total_nominated = nominated_movies.count()

print(f"Total number of Disney movies nominated for awards between 1998 and 2018: {total_nominated}")



Total number of Disney movies nominated for awards between 1998 and 2018: 362
