<a href="https://colab.research.google.com/github/nickname8888/pyspark-prac/blob/main/spark_sql_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("spark-sql-analyzer").getOrCreate()

df = spark.read.csv("/content/imdb_top_1000.csv", header=True, inferSchema=True)

df.createOrReplaceTempView("imdb_movies")

In [5]:
spark.sql("SELECT * FROM imdb_movies LIMIT 5").show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+--------------+--------------+-------------+--------------+-----------+-----------+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|         Star1|         Star2|        Star3|         Star4|No_of_Votes|      Gross|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+--------------+--------------+-------------+--------------+-----------+-----------+
|https://m.media-a...|The Shawshank Red...|         1994|          A|142 min|               Drama|        9.3|Two imprisoned me...|        80|      Frank Darabont|   Tim Robbins|Morgan Freeman|   Bob Gunton|William Sadler|    2343110| 28,341,469|
|https://m.m

In [6]:
spark.sql("DESCRIBE imdb_movies").show()

+-------------+---------+-------+
|     col_name|data_type|comment|
+-------------+---------+-------+
|  Poster_Link|   string|   NULL|
| Series_Title|   string|   NULL|
|Released_Year|   string|   NULL|
|  Certificate|   string|   NULL|
|      Runtime|   string|   NULL|
|        Genre|   string|   NULL|
|  IMDB_Rating|   double|   NULL|
|     Overview|   string|   NULL|
|   Meta_score|   string|   NULL|
|     Director|   string|   NULL|
|        Star1|   string|   NULL|
|        Star2|   string|   NULL|
|        Star3|   string|   NULL|
|        Star4|   string|   NULL|
|  No_of_Votes|   string|   NULL|
|        Gross|   string|   NULL|
+-------------+---------+-------+



In [11]:
# count NULL values in all columns

spark.sql("""
SELECT
    SUM(CASE WHEN Poster_Link IS NULL THEN 1 ELSE 0 END) AS Poster_Link_nulls,
    SUM(CASE WHEN Series_Title IS NULL THEN 1 ELSE 0 END) AS Series_Title_nulls,
    SUM(CASE WHEN Released_Year IS NULL THEN 1 ELSE 0 END) AS Released_Year_nulls,
    SUM(CASE WHEN Certificate IS NULL THEN 1 ELSE 0 END) AS Certificate_nulls,
    SUM(CASE WHEN Runtime IS NULL THEN 1 ELSE 0 END) AS Runtime_nulls,
    SUM(CASE WHEN Genre IS NULL THEN 1 ELSE 0 END) AS Genre_nulls,
    SUM(CASE WHEN IMDB_Rating IS NULL THEN 1 ELSE 0 END) AS IMDB_Rating_nulls,
    SUM(CASE WHEN Overview IS NULL THEN 1 ELSE 0 END) AS Overview_nulls,
    SUM(CASE WHEN Meta_score IS NULL THEN 1 ELSE 0 END) AS Meta_score_nulls,
    SUM(CASE WHEN No_of_Votes IS NULL THEN 1 ELSE 0 END) AS No_of_Votes_nulls,
    SUM(CASE WHEN Gross IS NULL THEN 1 ELSE 0 END) AS Gross_nulls
FROM imdb_movies
""").show()

+-----------------+------------------+-------------------+-----------------+-------------+-----------+-----------------+--------------+----------------+-----------------+-----------+
|Poster_Link_nulls|Series_Title_nulls|Released_Year_nulls|Certificate_nulls|Runtime_nulls|Genre_nulls|IMDB_Rating_nulls|Overview_nulls|Meta_score_nulls|No_of_Votes_nulls|Gross_nulls|
+-----------------+------------------+-------------------+-----------------+-------------+-----------+-----------------+--------------+----------------+-----------------+-----------+
|                0|                 0|                  0|              101|            0|          0|                0|             0|             156|                0|        169|
+-----------------+------------------+-------------------+-----------------+-------------+-----------+-----------------+--------------+----------------+-----------------+-----------+



In [None]:
# no need to drop anythiing as missing values are not particularly relevant in this case

In [12]:
# basic stats

spark.sql("SELECT COUNT(*) AS Total_Movies FROM imdb_movies").show()

+------------+
|Total_Movies|
+------------+
|        1000|
+------------+



In [14]:
spark.sql("SELECT AVG(IMDB_Rating) AS Avg_Rating FROM imdb_movies").show()

+-----------------+
|       Avg_Rating|
+-----------------+
|7.949300000000012|
+-----------------+



In [16]:
spark.sql("""
SELECT Series_Title, IMDB_Rating
FROM imdb_movies
ORDER BY IMDB_Rating DESC
LIMIT 5
""").show()


+--------------------+-----------+
|        Series_Title|IMDB_Rating|
+--------------------+-----------+
|The Shawshank Red...|        9.3|
|       The Godfather|        9.2|
|     The Dark Knight|        9.0|
|The Godfather: Pa...|        9.0|
|        12 Angry Men|        9.0|
+--------------------+-----------+



In [18]:
# get most popular genres of movies

spark.sql("""
SELECT Genre, COUNT(*) AS Movie_count
FROM imdb_movies
GROUP BY Genre
ORDER BY Movie_count DESC
LIMIT 5
""").show()

+--------------------+-----------+
|               Genre|Movie_count|
+--------------------+-----------+
|               Drama|         85|
|      Drama, Romance|         37|
|       Comedy, Drama|         35|
|Comedy, Drama, Ro...|         31|
|Action, Crime, Drama|         30|
+--------------------+-----------+



In [19]:
# average revenue per genre

spark.sql(
    """
    SELECT Genre, AVG(CAST(REPLACE(Gross, ',', '') AS FLOAT)) AS Avg_Revenue
    FROM imdb_movies
    GROUP BY Genre
    ORDER BY Avg_Revenue DESC
    LIMIT 5
    """
).show()


+--------------------+-------------------+
|               Genre|        Avg_Revenue|
+--------------------+-------------------+
|      Family, Sci-Fi|        4.3511056E8|
|Action, Adventure...|      3.621854256E8|
|Action, Adventure...|         3.019592E8|
|Action, Adventure...|2.808885443333333E8|
|  Adventure, Fantasy|       2.80685216E8|
+--------------------+-------------------+



In [20]:
# creating a director summary table

director_ratings = spark.sql(
    """
    SELECT Director, AVG(IMDB_Rating) AS Avg_Rating, COUNT(*) AS Movie_Count
    FROM imdb_movies
    GROUP BY Director
    ORDER BY Avg_Rating DESC
    """
)

director_ratings.createOrReplaceTempView("director_ratings")

In [23]:
# top 5 directors in terms of imdb ratings

spark.sql(
    """
    SELECT Director, Avg_Rating, Movie_Count
    FROM director_ratings
    ORDER BY Avg_Rating DESC
    LIMIT 5
    """
).show()

+---------------+----------+-----------+
|       Director|Avg_Rating|Movie_Count|
+---------------+----------+-----------+
| Frank Darabont|      8.95|          2|
| Irvin Kershner|       8.7|          1|
| Lana Wachowski|       8.7|          1|
|Roberto Benigni|       8.6|          1|
|    Thomas Kail|       8.6|          1|
+---------------+----------+-----------+



In [24]:
# where is Nolan?

spark.sql(
    """
    SELECT Director, Avg_Rating, Movie_Count
    FROM director_ratings
    WHERE Director LIKE '%Nolan%'
    """
).show()

+-----------------+-----------------+-----------+
|         Director|       Avg_Rating|Movie_Count|
+-----------------+-----------------+-----------+
|Christopher Nolan|8.462499999999999|          8|
+-----------------+-----------------+-----------+



In [27]:
# window function for ranking movies by revenue

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, regexp_replace

# Remove commas from revenue and convert to integer
df_clean = df.withColumn("Gross", regexp_replace(col("Gross"), ",", "").cast("int"))

window_spec = Window.orderBy(df_clean["Gross"].desc())

df_ranked = df_clean.withColumn("Rank", rank().over(window_spec))
df_ranked.show(5)

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+-------------+-----------------+-----------------+----------------+------------------+-----------+---------+----+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|     Director|            Star1|            Star2|           Star3|             Star4|No_of_Votes|    Gross|Rank|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+-------------+-----------------+-----------------+----------------+------------------+-----------+---------+----+
|https://m.media-a...|Star Wars: Episod...|         2015|          U|138 min|Action, Adventure...|        7.9|As a new threat t...|        80|  J.J. Abrams|     Daisy Ridley|      John Boyega|     Oscar Isaac|  Domhnall Gleeson|    