## A3 CE408 
## 2021015

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, avg, max, min, split, length, collect_list
import os

### Create session

In [18]:
def create_spark_session():
    return SparkSession.builder \
        .appName("Netflix Dataset EDA") \
        .getOrCreate()

### Load Dataset

In [19]:
def load_netflix_dataset(spark, file_path):
    return spark.read.csv(file_path, header=True, inferSchema=True)

### Perform Exploratory Data Analysis

In [20]:
def perform_eda(netflix_df):

    print("1. Basic Dataset Information:")
    netflix_df.printSchema()
    print(f"\nTotal number of records: {netflix_df.count()}")

    print("\n2. Top Directors with the Most Titles:")
    top_directors = netflix_df.groupBy("director") \
        .agg(count("*").alias("title_count")) \
        .orderBy(col("title_count").desc()) \
        .limit(10)
    top_directors.show()

    print("\n3. Average Release Year by Content Type:")
    avg_release_year = netflix_df.groupBy("type") \
        .agg(avg("release_year").alias("avg_release_year")) \
        .orderBy("type")
    avg_release_year.show()

    print("\n4. Distribution of Content by Duration Length:")
    content_duration_dist = netflix_df \
        .withColumn("duration_numeric", split(col("duration"), " ")[0].cast("int")) \
        .groupBy("type") \
        .agg(avg("duration_numeric").alias("avg_duration"), \
             max("duration_numeric").alias("max_duration"), \
             min("duration_numeric").alias("min_duration"))
    content_duration_dist.show()

    print("\n5. Countries with the Most Diverse Genres:")
    country_genres = netflix_df \
        .groupBy("country") \
        .agg(count("listed_in").alias("genre_count")) \
        .orderBy(desc("genre_count")) \
        .limit(10)
    country_genres.show()

    print("\n6. Titles with the Longest Words:")
    longest_word_titles = netflix_df \
        .withColumn("title_length", length(col("title"))) \
        .orderBy(desc("title_length")) \
        .select("title", "title_length") \
        .limit(10)
    longest_word_titles.show()

    print("\n7. Grouping Content by Rating and Analyzing Count:")
    rating_analysis = netflix_df.groupBy("rating") \
        .agg(count("*").alias("count")) \
        .orderBy(desc("count"))
    rating_analysis.show()


### Creating Spark Session

In [21]:
spark = create_spark_session()

### Performing the Analysis

In [22]:
 # Get the current directory
dataset_path = 'netflix_titles.csv' 

# Load Netflix Dataset
netflix_df = load_netflix_dataset(spark, dataset_path)

# Perform EDA
perform_eda(netflix_df)

1. Basic Dataset Information:
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)


Total number of records: 8809

2. Top Directors with the Most Titles:
+--------------------+-----------+
|            director|title_count|
+--------------------+-----------+
|                NULL|       2636|
|       Rajiv Chilaka|         19|
|Raúl Campos, Jan ...|         18|
|        Marcus Raboy|         16|
|         Suhas Kadav|         16|
|           Jay Karas|         14|
| Cathy Garcia-Molina|         13|
|     Youssef Chahine|         12|
|     Martin Scorsese|         12

In [23]:
spark.stop()