In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_date, when, countDistinct, count
from pyspark.sql.functions import split, explode, trim, col, lit, length
from pyspark.sql.functions import current_date, row_number, regexp_replace, coalesce
from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.window import Window
import numpy as np

In [0]:
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))


In [0]:
# Source file on DBFS and its format
file_location = "/FileStore/tables/Imdb_Movie_Dataset-1.csv"
file_type = "csv"

# Reader settings (only meaningful for CSV input; other formats ignore them)
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Quote and escape are both the double-quote character so that commas and
# doubled quotes inside a quoted field stay within a single column.
csv_reader = spark.read.format(file_type).options(
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
    quote='"',
    escape='"',
)
df = csv_reader.load(file_location)

display(df)

# Data Cleaning

Can we predict the box office revenue of a movie before it is released?
This project explores that question by building a machine learning regression model using Apache Spark MLlib to predict the revenue of a movie based on features such as budget, genre, popularity, vote average, and more.


In [0]:
df.printSchema()


## Dataset Schema with Correct Types

| Column                  | Current Type | Correct Type      |
|-------------------------|--------------|-------------------|
| id                      | integer      | integer           |
| title                   | string       | string            |
| vote_average            | string       | **double** (float)|
| vote_count              | string       | **integer**       |
| status                  | string       | string            |
| release_date            | string       | **date**          |
| revenue                 | string       | **long** (integer)|
| runtime                 | string       | **integer**       |
| adult                   | string       | **boolean**       |
| budget                  | string       | **long** (integer)|
| imdb_id                 | string       | string            |
| original_language       | string       | string            |
| original_title          | string       | string            |
| overview                | string       | string            |
| popularity              | string       | **double** (float)|
| tagline                 | string       | string            |
| genres                  | string       | **array<string>** |
| production_companies    | string       | **array<string>** |
| production_countries    | string       | **array<string>** |
| spoken_languages        | string       | **array<string>** |
| keywords                | string       | **array<string>** |

In [0]:
# Function to calculate missing values by column
def missing_values_table_spark(df):
    """
    Summarise null counts per column of a Spark DataFrame.

    The row total and the per-column null counts are computed in a single
    aggregation, so the data is scanned once instead of twice.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns 'Column', 'Missing Values' and '% of Total Values' (rounded to
        one decimal), restricted to columns with at least one null and sorted
        by percentage, highest first. Empty when `df` has no rows.
    """
    # Position 0 holds the row total; the remaining positions line up with df.columns
    stats = df.agg(
        F.count(F.lit(1)),
        *[F.sum(F.col(c).isNull().cast("int")) for c in df.columns]
    ).first()
    total_rows = stats[0]

    # SUM over zero rows yields null, so treat a missing result as 0
    null_counts = [n or 0 for n in stats[1:]]

    mis_val_table = pd.DataFrame({
        'Column': df.columns,
        'Missing Values': null_counts,
    })

    # A percentage is undefined for an empty frame; report 0 instead of dividing by zero
    if total_rows:
        mis_val_table['% of Total Values'] = (mis_val_table['Missing Values'] / total_rows) * 100
    else:
        mis_val_table['% of Total Values'] = 0.0

    # Keep only columns with >0% missing, sort descending, round
    mis_val_table = (
        mis_val_table[mis_val_table['% of Total Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
        .reset_index(drop=True)
    )

    # Print summary
    print(f"Your selected dataframe has {len(df.columns)} columns.\n"
          f"There are {mis_val_table.shape[0]} columns that have missing values.")

    return mis_val_table

# Usage
missing_values = missing_values_table_spark(df)
display(missing_values)

In [0]:
def plot_missing_values(mis_val_table, title):
    """
    Draw a bar chart of the share of missing values for each feature.

    `mis_val_table` must provide a 'Column' column (feature names, used as
    x-axis categories) and a '% of Total Values' column (bar heights).
    `title` is used as the figure title.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # One bar per feature
    sns.barplot(data=mis_val_table, x='Column', y='% of Total Values', ax=ax)

    # Vertical tick labels keep long feature names legible
    ax.tick_params(axis='x', labelrotation=90)

    ax.set_title(title)
    ax.set_xlabel('Feature')
    ax.set_ylabel('% of Total Values')

    fig.tight_layout()
    plt.show()

# Call the function to plot the missing values with feature names shown
plot_missing_values(missing_values, title='Percentage of Missing Values by Feature')

In [0]:
# Updated function to include a 'Column' column instead of using the index
def zero_values_table_spark(df):
    """
    Summarise how many values equal zero in each column of a Spark DataFrame.

    Every column is cast to double before the comparison. Comparing a string
    column directly with the integer literal 0 makes Spark coerce the string
    to an integer, which can truncate fractional text such as "0.6" to 0 and
    inflate the zero count; the explicit double cast avoids that. Values that
    cannot be read as a number become null and are not counted.

    The row total is computed in the same aggregation as the zero counts, so
    the data is scanned once.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns 'Column', 'Zero Values' and '% of Total Values' (rounded to one
        decimal), restricted to columns containing at least one zero and
        sorted by percentage, highest first. Empty when `df` has no rows.
    """
    # Position 0 holds the row total; the remaining positions line up with df.columns
    stats = df.agg(
        F.count(F.lit(1)),
        *[
            F.sum(F.when(F.col(c).cast("double") == 0, 1).otherwise(0))
            for c in df.columns
        ]
    ).first()
    row_count = stats[0]

    # SUM over zero rows yields null, so treat a missing result as 0
    zero_counts = [n or 0 for n in stats[1:]]

    zero_table = pd.DataFrame({
        'Column': df.columns,
        'Zero Values': zero_counts,
    })

    # A percentage is undefined for an empty frame; report 0 instead of dividing by zero
    if row_count:
        zero_table['% of Total Values'] = (zero_table['Zero Values'] / row_count) * 100
    else:
        zero_table['% of Total Values'] = 0.0

    # Keep only columns with zeros, sort and round
    zero_table = (
        zero_table[zero_table['Zero Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
        .reset_index(drop=True)
    )

    # Print summary
    print(f"Your selected dataframe has {len(df.columns)} columns.\n"
          f"There are {zero_table.shape[0]} columns that have zero values.")

    return zero_table

# Use the function
zero_values = zero_values_table_spark(df)
display(zero_values)

In [0]:
def plot_zero_values(zero_table, title):
    """
    Draw a bar chart of the share of zero values for each feature.

    `zero_table` must provide a 'Column' column (feature names, used as x-axis
    categories) and a '% of Total Values' column (bar heights). `title` is
    used as the figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # One bar per feature
    sns.barplot(data=zero_table, x='Column', y='% of Total Values', ax=ax)

    # Vertical tick labels keep long feature names legible
    ax.tick_params(axis='x', labelrotation=90)

    ax.set_title(title)
    ax.set_xlabel('Feature')
    ax.set_ylabel('% of Total Values')

    fig.tight_layout()
    plt.show()

# print
plot_zero_values(zero_values, title='Percentage of Zero Values by Feature')

## Missing or Zero Values Summary

### Missing Values
These columns have a significant number of null entries:
- **tagline** (85.4% missing)  
- **keywords** (72.1% missing)  
- **production_companies** (54.2% missing)
- **imdb_id** (46.5% missing)  
- **production_countries** (44.2% missing)  
- **spoken_languages** (~42% missing)  
- **genres** (39.6% missing)  
- **overview** (20.6% missing)  
- **release_date** (17.3% missing)

### Zero Values
The following columns contain a substantial fraction of zeros:
- **revenue** (98% zeros)  
- **budget** (94.6% zeros)  
- **adult** (90.7% zeros)
- **vote_average** (66.6% zeros)
- **vote_count** (66.5% zeros)
- **runtime** (28% zeros) 
- **popularity** (13.2% zeros)   


## ID and IMDB_ID

First, we will check for duplicates by **id** and **imdb_id**

In [0]:
# Row total versus number of distinct keys (both distinct counts in one aggregation)
total_rows = df.count()
unique_id, unique_imdb_id = df.select(
    countDistinct("id"),
    countDistinct("imdb_id"),
).first()

# Rows beyond the first occurrence of each key
duplicated_id = total_rows - unique_id
duplicated_imdb_id = total_rows - unique_imdb_id

print(f"Total duplicated IDs: {duplicated_id}")
print(f"Total duplicated IMDB IDs: {duplicated_imdb_id}")

# 'id' values that occur on more than one row, joined back to get the full rows
dup_id_vals = df.groupBy("id").count().where(col("count") > 1).select("id")
dup_id_rows = df.join(dup_id_vals, on="id", how="inner").orderBy("id")
print("Examples of duplicated 'id' rows:")
display(dup_id_rows.limit(10))

# Same lookup keyed on 'imdb_id'
dup_imdb_vals = df.groupBy("imdb_id").count().where(col("count") > 1).select("imdb_id")
dup_imdb_rows = df.join(dup_imdb_vals, on="imdb_id", how="inner").orderBy("imdb_id")
print("Examples of duplicated 'imdb_id' rows:")
display(dup_imdb_rows.limit(10))

In [0]:
# Rows whose imdb_id holds the literal text "None" rather than a real null
is_none_text = col("imdb_id") == "None"
none_count = df.where(is_none_text).count()

# Swap that placeholder text for a proper null, leaving every other value as is
df = df.withColumn(
    "imdb_id",
    when(is_none_text, lit(None)).otherwise(col("imdb_id")),
)

# Report how many values were converted
print(f"Number of 'None' values replaced with null in 'imdb_id': {none_count}")


In [0]:
# Work only with rows that actually carry an imdb_id
with_imdb = df.where(col("imdb_id").isNotNull())

non_null_rows = with_imdb.count()
unique_non_null = with_imdb.select(countDistinct("imdb_id")).first()[0]
duplicated_imdb_id = non_null_rows - unique_non_null

# Report the counts
print(f"Total non-null imdb_id rows: {non_null_rows}")
print(f"Total duplicated IMDB IDs (excluding nulls): {duplicated_imdb_id}")

# imdb_id values shared by more than one row
dup_imdb_vals = (
    with_imdb
    .groupBy("imdb_id")
    .count()
    .where(col("count") > 1)
    .select("imdb_id")
)

# Full rows for those shared values, grouped together by sorting on the key
dup_imdb_rows = df.join(dup_imdb_vals, on="imdb_id", how="inner").orderBy("imdb_id")

print("Examples of duplicated 'imdb_id' rows:")
display(dup_imdb_rows.limit(10))


Turns out that the duplicated rows by **id** column are in fact duplicates. 
- We will deduplicate and keep only the row with the fewest null values.

When it comes to **imdb_id**, those also concern the same movies but with original and translated titles. 
- We will deduplicate using the same logic as we used with **id**


In [0]:
def drop_duplicates_keep_least_null(df, id_col="id"):
    """
    Deduplicate rows sharing a non-null `id_col`, keeping the most complete row.

    For every non-null value of `id_col` only the row with the fewest null
    cells is kept. When several rows tie on the null count, the remaining
    columns are used as secondary sort keys, so the surviving row is the same
    on every evaluation. Ordering by the null count alone leaves `row_number`
    free to pick any of the tied rows, and Spark may re-evaluate the plan for
    each action.

    Rows where `id_col` is null are passed through untouched.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input frame. It must not already contain columns named `_null_count`
        or `_rn`, which are used as temporary helpers.
    id_col : str, default "id"
        Column that identifies duplicates.

    Returns
    -------
    pyspark.sql.DataFrame
        Deduplicated non-null-key rows unioned (by name) with the null-key rows.

    Side effects
    ------------
    Prints how many rows with a non-null key were dropped. Two Spark counts
    are triggered to produce that number.
    """
    # Split into rows with and without a key
    non_null_df = df.filter(F.col(id_col).isNotNull())
    null_df = df.filter(F.col(id_col).isNull())

    # Row count before deduplication (for the report below)
    orig_count = non_null_df.count()

    # Per-row number of null cells. Built with explicit Column addition so the
    # expression does not depend on Python's builtin `sum` staying unshadowed.
    null_count_expr = F.lit(0)
    for c in df.columns:
        null_count_expr = null_count_expr + F.col(c).isNull().cast("int")
    nn_with_nulls = non_null_df.withColumn("_null_count", null_count_expr)

    # Secondary sort keys that make the choice among ties reproducible.
    # Map-typed columns cannot be ordered in Spark, so they are left out.
    tie_breakers = [
        F.col(name).asc_nulls_last()
        for name, dtype in df.dtypes
        if name != id_col and "map<" not in dtype
    ]

    # Rank rows inside each key group: fewest nulls first, then the tie-breakers
    window = Window.partitionBy(id_col).orderBy(F.col("_null_count").asc(), *tie_breakers)
    ranked = nn_with_nulls.withColumn("_rn", F.row_number().over(window))

    # Keep the top-ranked row per key and discard the helper columns
    deduped_non_null = ranked.filter(F.col("_rn") == 1).drop("_null_count", "_rn")

    # Report how many keyed rows were removed
    new_count = deduped_non_null.count()
    dropped = orig_count - new_count
    print(f"Dropped {dropped} duplicate rows among non-null '{id_col}'.")

    # Re-attach the rows that had no key
    result_df = deduped_non_null.unionByName(null_df)
    return result_df

# Use the function to deduplicate by "id", preserving rows with null id
df = drop_duplicates_keep_least_null(df, id_col="id")

display(df.limit(10))

In [0]:
# Now drop duplicates by imdb_id
df = drop_duplicates_keep_least_null(df, id_col="imdb_id")

# Inspect a few rows to confirm
display(df.limit(10))

In [0]:
# Sanity check - recompute both duplicate counts on the deduplicated frame
total_rows = df.count()

# id: every row beyond the number of distinct ids is a duplicate
unique_id = df.agg(countDistinct("id")).first()[0]
duplicated_id = total_rows - unique_id

# imdb_id: only rows holding a real value (neither null nor the text "None")
has_real_imdb = col("imdb_id").isNotNull() & (col("imdb_id") != "None")
valid_imdb = df.where(has_real_imdb)
total_valid = valid_imdb.count()
unique_imdb = valid_imdb.agg(countDistinct("imdb_id")).first()[0]
duplicated_imdb_id = total_valid - unique_imdb

print(f"Total duplicated IDs: {duplicated_id}")
print(f"Total duplicated IMDB IDs (excluding nulls): {duplicated_imdb_id}")


Now, **id** and **imdb_id** columns are deduplicated. Their datatype is correct, so no need for further changes.

## TITLE

In [0]:
# Null, blank and "None"/"none" titles, counted together in one aggregation.
# COUNT ignores nulls, so each entry counts the rows where its condition holds.
title_value = col("title")
null_count, empty_count, none_count = df.select(
    F.count(when(title_value.isNull(), 1)),
    F.count(when((title_value == "") | (title_value == " "), 1)),
    F.count(when((title_value == "None") | (title_value == "none"), 1)),
).first()

print(f"Null titles: {null_count}")
print(f"Empty titles: {empty_count}")
print(f"None titles: {none_count}")

In [0]:
# Character length of every title, with min/max taken in a single aggregation
title_len_df = df.withColumn("title_length", length(col("title")))
min_len, max_len = title_len_df.agg(
    F.min("title_length"),
    F.max("title_length"),
).first()
median_len = title_len_df.approxQuantile("title_length", [0.5], 0.01)[0]
print(f"Title length — min: {min_len}, median: {median_len:.0f}, max: {max_len}")

# Columns shown for the extreme cases
preview_cols = ["id", "title", "title_length"]

print("Longest titles:")
longest_titles = title_len_df.orderBy(col("title_length").desc()).select(*preview_cols)
display(longest_titles.limit(5))

print("Shortest non-empty titles:")
has_title = col("title").isNotNull() & (col("title") != "")
shortest_titles = (
    title_len_df
    .where(has_title)
    .orderBy(col("title_length").asc())
    .select(*preview_cols)
)
display(shortest_titles.limit(5))


**title** column does not have any missing values and has the right data type.

## VOTE_AVERAGE and VOTE_COUNT

In [0]:
# Null, blank-string and "None"/"none" counts for the two vote columns
total = df.count()

for cname in ["vote_average", "vote_count"]:
    vote_value = col(cname)
    # One aggregation per column; COUNT skips the nulls produced by an unmatched WHEN
    nulls, empties, nones = df.select(
        F.count(when(vote_value.isNull(), 1)),
        F.count(when((vote_value == "") | (vote_value == " "), 1)),
        F.count(when(vote_value.isin("None", "none"), 1)),
    ).first()

    print(
        f"{cname}: total={total}, "
        f"nulls={nulls}, "
        f"empty strings={empties}, "
        f"'None'/'none' strings={nones}"
    )

First, we need to cast **vote_average** and **vote_count** into correct data types:

In [0]:
# Target Spark type for each vote column
vote_column_types = {"vote_average": "double", "vote_count": "integer"}

# Apply the casts one column at a time, in the order listed above
for vote_col_name, vote_col_type in vote_column_types.items():
    df = df.withColumn(vote_col_name, col(vote_col_name).cast(vote_col_type))

# Confirm the resulting types
df.select("vote_average", "vote_count").printSchema()

In [0]:
# How many rows carry a vote_average of exactly 0
count_avg_zero = df.where(col("vote_average") == 0).count()
print(f"Number of rows with vote_average == 0: {count_avg_zero}")

# How many rows carry a vote_count of exactly 0
count_count_zero = df.where(col("vote_count") == 0).count()
print(f"Number of rows with vote_count == 0: {count_count_zero}")

IMDb’s user rating system allows you to assign any whole-number score from 1 (the lowest) up to 10 (the highest) for any movie.

In [0]:
# Ratings under the lowest score a user can give (1)
below_allowed = df.where(col("vote_average") < 1)
count_below = below_allowed.count()
print(f"Number of movies with vote_average below 1: {count_below}")

# Ratings over the highest score a user can give (10)
above_allowed = df.where(col("vote_average") > 10)
count_above = above_allowed.count()
print(f"Number of movies with vote_average above 10: {count_above}")

In [0]:
# Replace vote_average values outside the allowed 1-10 range with null.
# Both bounds are enforced here: the previous cell counts values below 1 AND
# above 10 as invalid, so both kinds must be cleared. WHEN without OTHERWISE
# yields null for every row that fails the range test (already-null values
# simply stay null).
df = df.withColumn(
    "vote_average",
    when(col("vote_average").between(1, 10), col("vote_average"))
)

# Verify none remain on either side of the range
remaining_invalid = df.filter(col("vote_average") < 1).count()
print(f"Remaining vote_average < 1 after cleanup: {remaining_invalid}")
remaining_above = df.filter(col("vote_average") > 10).count()
print(f"Remaining vote_average > 10 after cleanup: {remaining_above}")

In [0]:
# Movies where vote_count == 0 but vote_average != 0
cond = df.filter((col("vote_count") == 0) & (col("vote_average") != 0))

# Stored under its own name: assigning the result to `count` would overwrite
# the `count` function imported from pyspark.sql.functions in the first cell.
mismatch_count = cond.count()
print(f"Number of movies with vote_count == 0 and vote_average != 0: {mismatch_count}")

# Display all columns for a few examples
display(cond.limit(5))

In [0]:
# Rows that received no votes at all
no_votes = col("vote_count") == 0
zero_count = df.where(no_votes).count()
print(f"Found {zero_count} rows with vote_count == 0; setting vote_average to null for these.")

# An average over zero votes carries no information, so blank it out;
# every other row (including rows with a null vote_count) keeps its value
df = df.withColumn(
    "vote_average",
    when(no_votes, lit(None)).otherwise(col("vote_average")),
)

# Confirm that no zero-vote row still holds an average
remaining = df.where(no_votes & col("vote_average").isNotNull()).count()
print(f"Remaining rows with vote_count == 0 and non-null vote_average: {remaining}")

In [0]:

# Re-count after cleanup: rows whose vote_average is exactly 0
count_avg_zero = df.where(col("vote_average") == 0).count()
print(f"Number of rows with vote_average == 0: {count_avg_zero}")

# Re-count after cleanup: rows whose vote_count is exactly 0
count_count_zero = df.where(col("vote_count") == 0).count()
print(f"Number of rows with vote_count == 0: {count_count_zero}")

In [0]:
# Convert only the needed columns to Pandas to avoid memory issues
votes_pd = df.select("vote_average", "vote_count").toPandas()

# Drop missing values per column, not jointly. vote_average was set to null
# for every movie with vote_count == 0, so a joint dropna() would remove all
# zero-vote movies and hide them from the vote_count histogram.
vote_average_values = votes_pd["vote_average"].dropna()
vote_count_values = votes_pd["vote_count"].dropna()

# Rows complete in both columns, kept under the name this cell used before
df_plot = votes_pd.dropna()

# Set seaborn style
sns.set(style="whitegrid")

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot vote_average histogram
sns.histplot(vote_average_values, bins=20, kde=True, ax=axes[0], color='skyblue')
axes[0].set_title('Distribution of Vote Average')
axes[0].set_xlabel('Vote Average')
axes[0].set_ylabel('Count')

# Plot vote_count histogram (log-scaled y axis)
sns.histplot(vote_count_values, bins=50, kde=False, ax=axes[1], color='salmon')
axes[1].set_title('Distribution of Vote Count')
axes[1].set_xlabel('Vote Count')
axes[1].set_ylabel('Count')
axes[1].set_yscale('log')  # log scale for better readability of skewed data

plt.tight_layout()
plt.show()

The distribution for **vote_average** looks almost normal, with a bump around 5 and 6–7, which aligns with typical average ratings on platforms like IMDb or TMDB.

When it comes to **vote_count**, there's a long tail of movies with thousands of votes, likely the popular ones. However, a big portion of movies did not receive any votes.

## STATUS

**status** is in correct data type. First, let's check for distinct values 

In [0]:
# Number of different status labels, ignoring rows without a status
known_statuses = df.where(col("status").isNotNull()).select("status").distinct()
distinct_status_count = known_statuses.count()
print(f"Number of distinct 'status' values (excluding nulls): {distinct_status_count}")

# Row count per status, most frequent first
status_counts = df.groupBy("status").count().sort(col("count").desc())
print("Counts per status:")
display(status_counts)


In [0]:
# Mean and approximate median revenue for each status value
revenue_by_status = df.groupBy("status").agg(
    F.avg("revenue").alias("average_revenue"),
    F.expr("percentile_approx(revenue, 0.5)").alias("median_revenue"),
)

# Sort by status label for a stable table
status_revenue_stats = revenue_by_status.orderBy("status")

# Display the results
display(status_revenue_stats)


**We need to decide whether to predict the revenue of movies with all statuses or just the ones released**

## RELEASE DATE

First, we need to handle the data type of **release_date**. During the initial feature investigation, we noticed that some dates may be in US format (month first) and some in EU format (day first). We will need to handle this appropriately.

In [0]:
# Null, blank-string and "None"/"none" counts for release_date
total_rows = df.count()

# All three counts in one aggregation; COUNT skips the nulls from an unmatched WHEN
release_value = col("release_date")
null_count, empty_count, none_count = df.select(
    F.count(when(release_value.isNull(), 1)),
    F.count(when((release_value == "") | (release_value == " "), 1)),
    F.count(when(release_value.isin("None", "none"), 1)),
).first()

print(f"Total rows: {total_rows}")
print(f"Null release_date values          : {null_count}")
print(f"Empty-string release_date values  : {empty_count}")
print(f"'None'/'none' release_date values : {none_count}")

In [0]:
# Use the pre-Spark-3.0 date parser for the rest of the session
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Strip surrounding whitespace and remove any spaces around the -, / and . separators
df = df.withColumn(
    "release_date_norm",
    regexp_replace(trim(col("release_date")), r"\s*([/\-\.])\s*", r"$1"),
)

# Candidate formats, tried in this order; the first one that parses wins.
# Month-first patterns come before day-first ones, so a date readable both
# ways is taken as month-first.
date_patterns = [
    "yyyy-MM-dd", "yyyy/MM/dd", "yyyy.MM.dd",
    "MM-dd-yyyy", "MM/dd/yyyy", "MM.dd.yyyy",
    "dd-MM-yyyy", "dd/MM/yyyy", "dd.MM.yyyy",
]
parse_attempts = [to_date(col("release_date_norm"), pattern) for pattern in date_patterns]
df = df.withColumn("release_date_parsed", coalesce(*parse_attempts))

# Parsed versus unparsed row counts (rows with no release_date at all count as unparsed)
total = df.count()
parsed_non_null = df.where(col("release_date_parsed").isNotNull()).count()
parsed_null = total - parsed_non_null

print(f"Total rows:                {total}")
print(f"Rows parsed successfully:  {parsed_non_null}")
print(f"Rows that failed to parse: {parsed_null}")


In [0]:

# Replace the raw string column with the parsed date column.
# The intermediate 'release_date_norm' helper is dropped as well so it does
# not end up in the checkpoint file. The guard makes the cell safe to re-run:
# without it, a second run would drop the already-renamed parsed column.
if "release_date_parsed" in df.columns:
    df = (
        df
        .drop("release_date", "release_date_norm")
        .withColumnRenamed("release_date_parsed", "release_date")
    )

# Display the DataFrame with the parsed release_date column
display(df.limit(5))

In [0]:
# print df
display(df.limit(5))

In [0]:
# Print the schema to see the data type
df.select("release_date").printSchema()

In [0]:
release_day = col("release_date")

# Dates later than today
future_movies = df.where(release_day > current_date())

# Dates earlier than 1888-01-01
old_movies = df.where(release_day < lit("1888-01-01"))

# Report both counts
count_future = future_movies.count()
print(f"Number of movies released after today: {count_future}")
count_old = old_movies.count()
print(f"Number of movies released before 1888: {count_old}")

In [0]:
# Display 30 movies with selected columns
display(
    future_movies
    .select("id", "title", "status", "release_date")
    .limit(30)
)


**TODO: these future release dates are probably an artifact of the date parsing / type casting above. Revisit the format handling later.**

In [0]:
# Display a few examples 
display(old_movies.limit(20))

The movies released before 1888 turn out to be time-lapse photography or similar techniques for capturing movement. Therefore, we will leave them in our dataset.

# CHECKPOINT

In [0]:
# Cell 1: Save the current DataFrame as a CSV checkpoint
output_path = "/FileStore/tables/df_checkpoint"

# Collapse to a single partition (one part file), include a header row and
# overwrite whatever an earlier run left at this location
checkpoint_writer = (
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
)
checkpoint_writer.csv(output_path)

print(f"Checkpoint written to {output_path}/ (use Data → DBFS to download the CSV part file)")


In [0]:
# Cell 2: Load the checkpoint CSV back into a DataFrame
checkpoint_path = "/FileStore/tables/df_checkpoint"

# Column types are inferred again from the CSV text on load, so they are not
# guaranteed to match the casts applied before the checkpoint was written
df_checkpoint = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(checkpoint_path)
)

# (Optional) Continue working from the checkpointed frame
df = df_checkpoint

# Verify load
print("Checkpoint schema:")
df.printSchema()
print(f"Checkpoint row count: {df.count()}")

## REVENUE and BUDGET

Both **revenue** and **budget** are currently stored as strings. We need them both as integers. (Alternatively, we could cast them to floating-point numbers, but we do not need decimal precision here.)

In [0]:
# Null, blank-string, "None"/"none" and zero counts for revenue and budget
total = df.count()

for cname in ["revenue", "budget"]:
    money_value = col(cname)
    # One aggregation per column; COUNT skips the nulls produced by an unmatched WHEN
    nulls, empties, nones, zeros = df.select(
        F.count(when(money_value.isNull(), 1)),
        F.count(when((money_value == "") | (money_value == " "), 1)),
        F.count(when(money_value.isin("None", "none"), 1)),
        F.count(when(money_value == 0, 1)),
    ).first()

    print(
        f"{cname}: total={total}, "
        f"nulls={nulls}, "
        f"empty strings={empties}, "
        f"'None'/'none' strings={nones}, "
        f"zeros={zeros}"
    )

In [0]:
# Store both monetary columns as 64-bit integers
for money_col_name in ("revenue", "budget"):
    df = df.withColumn(money_col_name, col(money_col_name).cast("long"))

# Confirm the resulting types
df.select("revenue", "budget").printSchema()

In [0]:
# Keep only movies that report both a positive budget and a positive revenue
has_money = (col("budget") > 0) & (col("revenue") > 0)
df_money = df.filter(has_money)

# Bring just the two plotted columns to the driver
df_money_pd = df_money.select("budget", "revenue").toPandas()

# Two side-by-side panels on the raw (untransformed) scale
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.set(style="whitegrid")

# (column, bar colour, panel title, x-axis label) for each panel, left to right
raw_panels = [
    ("budget", "orange", "Budget Distribution", "Budget ($)"),
    ("revenue", "blue", "Revenue Distribution", "Revenue ($)"),
]
for ax, (field, shade, heading, x_label) in zip(axes, raw_panels):
    sns.histplot(df_money_pd[field], bins=100, ax=ax, color=shade)
    ax.set_title(heading)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [0]:
# Keep only movies that report both a positive budget and a positive revenue
# (log10 is undefined for zero or negative amounts)
has_money = (col("budget") > 0) & (col("revenue") > 0)
df_money = df.filter(has_money)

# Bring just the two plotted columns to the driver
df_money_pd = df_money.select("budget", "revenue").toPandas()

# Add base-10 logarithms of both amounts
df_money_pd = df_money_pd.assign(
    log_budget=np.log10(df_money_pd['budget']),
    log_revenue=np.log10(df_money_pd['revenue']),
)

# Two side-by-side panels on the log scale
sns.set(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column, bar colour, panel title, x-axis label) for each panel, left to right
log_panels = [
    ("log_budget", "orange", "Log10 Budget Distribution", "Log10(Budget)"),
    ("log_revenue", "blue", "Log10 Revenue Distribution", "Log10(Revenue)"),
]
for ax, (field, shade, heading, x_label) in zip(axes, log_panels):
    sns.histplot(df_money_pd[field], bins=50, ax=ax, color=shade)
    ax.set_title(heading)
    ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()

Budget and Revenue (log₁₀) Distributions

Log₁₀(Budget)
The bulk of film budgets cluster around log₁₀(budget) ≈ 6.5–7.5, i.e. budgets of roughly \$3 million to \$30 million. Most budgets fall between log₁₀ ≈ 5 (≈ \$100 k) and log₁₀ ≈ 8 (≈ \$100 million). There is a long left‐hand tail extending down toward zero (very low‐budget or no‐budget films), and a shorter right tail tapering off beyond log₁₀ ≈ 8. Small peaks around log₁₀ ≈ 2–3 (\$100–\$1 000 budgets) and log₁₀ ≈ 4–5 (\$10 000–\$100 000) likely correspond to micro‐budget productions.

Log₁₀(Revenue)
Revenues tend to concentrate around log₁₀(revenue) ≈ 7–8, i.e. \$10 million to \$100 million. Revenues span from very low figures (log₁₀ ≈ 0–2, i.e. \$1–\$100) up to blockbuster grosses (log₁₀ ≈ 9+, over \$1 billion), though the extreme high end thins out sharply. On the log scale the distribution is left‐skewed, with a heavy left tail of low‐earning films and a shorter, tapering right tail of big blockbusters. A visible bump near log₁₀ ≈ 4–5 (\$10 000–\$100 000) suggests a cluster of modest indie releases, and occasional spikes around log₁₀ ≈ 2–3 (\$100–\$1 000) reflect very small‐scale runs or data artifacts.

Overall, both distributions are left-skewed on the log scale, with most films clustering in the mid‐range budgets and revenues, but with long tails toward both the ultra‐low and the ultra‐high ends.



In [0]:
revenue_value = col("revenue")
budget_value = col("budget")

# No reported revenue although a budget is present
zero_rev_nonzero_budget = df.where((revenue_value == 0) & (budget_value != 0))
count_zero_rev_nonzero_budget = zero_rev_nonzero_budget.count()
print(f"Movies with revenue = 0 and budget != 0: {count_zero_rev_nonzero_budget}")
display(zero_rev_nonzero_budget.limit(5))

# No reported budget although a revenue is present
zero_budget_nonzero_rev = df.where((budget_value == 0) & (revenue_value != 0))
count_zero_budget_nonzero_rev = zero_budget_nonzero_rev.count()
print(f"Movies with budget = 0 and revenue != 0: {count_zero_budget_nonzero_rev}")
display(zero_budget_nonzero_rev.limit(5))


In [0]:
# Rows whose budget is below zero
neg_budget_count = df.where(col("budget") < 0).count()
print(f"Number of movies with negative budget: {neg_budget_count}")

# Rows whose revenue is below zero
neg_revenue_count = df.where(col("revenue") < 0).count()
print(f"Number of movies with negative revenue: {neg_revenue_count}")

## RUNTIME

First of all, we need to cast **runtime** into integer type. Then, we will check if there are any inconsistencies in this column.

In [0]:
# Null, blank-string, "None"/"none" and zero counts for runtime
total = df.count()

# All four counts in one aggregation; COUNT skips the nulls from an unmatched WHEN
runtime_value = col("runtime")
nulls, empties, nones, zeros = df.select(
    F.count(when(runtime_value.isNull(), 1)),
    F.count(when((runtime_value == "") | (runtime_value == " "), 1)),
    F.count(when(runtime_value.isin("None", "none"), 1)),
    F.count(when(runtime_value == 0, 1)),
).first()

print(f"runtime: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Cast runtime to integer
df = df.withColumn("runtime", col("runtime").cast("integer"))

# Verify the schema after casting
df.select("runtime").printSchema()

In [0]:
# Check for negative runtime values
negative_runtime_count = df.filter(col("runtime") < 0).count()
print(f"Number of movies with negative runtime: {negative_runtime_count}")

In [0]:
df.select("runtime").describe().show()

Although some "movies" can indeed be shorter than a minute (for example, very old movies or ads), in order to clean up our dataset we will assume that movies with 0 value for runtime, actually have this column missing. We will set them to null.

In [0]:
# Rows currently reporting a runtime of 0 minutes
runtime_is_zero = col("runtime") == 0
zero_runtime_count = df.where(runtime_is_zero).count()
print(f"Rows with runtime = 0 before nulling: {zero_runtime_count}")

# Treat a 0-minute runtime as missing; all other values are kept as they are
df = df.withColumn(
    "runtime",
    when(runtime_is_zero, lit(None)).otherwise(col("runtime")),
)

# Total nulls afterwards (earlier nulls plus the zeros just replaced)
null_runtime_count = df.where(col("runtime").isNull()).count()
print(f"Rows with runtime = null after replacement: {null_runtime_count}")


outliers check

In [0]:
# Approximate first and third quartiles of runtime
q1, q3 = df.approxQuantile("runtime", [0.25, 0.75], 0.01)
iqr = q3 - q1

# Fences at 1.5 x IQR beyond the quartiles
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Split rows by whether runtime lies outside or inside the fences.
# Rows with a null runtime satisfy neither test and appear in neither frame.
runtime_value = col("runtime")
outliers_runtime = df.where((runtime_value < lower_bound) | (runtime_value > upper_bound))
df_no_outliers = df.where(runtime_value.between(lower_bound, upper_bound))

# Report the number of outliers
print(f"Movies with outlier runtime values: {outliers_runtime.count()}")

In [0]:
# Box-and-whisker view of runtime, outliers included
runtime_pd = df.select("runtime").toPandas()

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=runtime_pd['runtime'], color='skyblue', ax=ax)

ax.set_title("Runtime Distribution with Outliers", fontsize=16)
ax.set_xlabel("Runtime (Minutes)", fontsize=12)

plt.show()

Without outliers:

In [0]:
# Bring the runtime column of the outlier-free frame to the driver
df_no_outliers_pd = df_no_outliers.select("runtime").toPandas()

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df_no_outliers_pd['runtime'], color='skyblue', ax=ax)

ax.set_title("Runtime Distribution without Outliers", fontsize=16)
ax.set_xlabel("Runtime (Minutes)", fontsize=12)

plt.show()

## ADULT

In [0]:
# One row per distinct 'adult' value, together with the number of movies carrying it
distinct_adult = df.groupBy("adult").count()

# The row count of that grouped frame is the number of distinct values
distinct_adult_count = distinct_adult.count()
print(f"Number of distinct values in 'adult': {distinct_adult_count}")

# Show each value next to its frequency
display(distinct_adult)

In [0]:
# Bar chart of how many movies fall under each 'adult' value

# Aggregate in Spark, then bring the small result to pandas for plotting
adult_distribution = df.groupBy("adult").count().toPandas()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=adult_distribution, x="adult", y="count", palette="Set2", ax=ax)
ax.set_title("Distribution of Adult Movies")
ax.set_xlabel("Adult")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

The overwhelming majority of the movies are **not** adult-only.

## ORIGINAL LANGUAGE and ORIGINAL TITLE

In [0]:
# Data-quality tally for original_language and original_title:
# nulls, empty/blank strings, literal "None"/"none" strings and "0" values
for cname in ("original_language", "original_title"):
    current = col(cname)

    nulls = df.where(current.isNull()).count()
    empties = df.where((current == "") | (current == " ")).count()
    nones = df.where(current.isin("None", "none")).count()
    zeros = df.where(current == "0").count()

    print(f"{cname}: nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")


In [0]:
# Number of movies per original language
language_counts = df.groupBy("original_language").count()

# Keep the 20 most frequent languages, most common first
top_20_languages = language_counts.sort(col("count").desc()).limit(20)

# Print them with their movie counts
top_20_languages.show()

In [0]:
# Movies per original language, most frequent first
language_counts = (
    df.groupBy("original_language")
      .count()
      .orderBy(col("count").desc())
)

# Bring the 10 most frequent languages to pandas for plotting
top_languages = language_counts.limit(10).toPandas()

# Horizontal bar chart of those 10 languages
fig, ax = plt.subplots(figsize=(15, 9))
sns.barplot(data=top_languages, x="count", y="original_language", palette="Set2", ax=ax)
ax.set_title("Top 10 Most Popular Languages")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Language")
fig.tight_layout()
plt.show()

The great majority of the movies were filmed with English as the original language.

In [0]:
# Size of the dataset
total_rows = df.count()

# Rows whose original_title differs from title (rows where either side is null never match)
titles_differ = col("original_title") != col("title")
different_titles_count = df.where(titles_differ).count()

# Share of such rows, expressed as a percentage
percentage_different_titles = (different_titles_count / total_rows) * 100

print(f"Total rows: {total_rows}")
print(f"Number of movies with different 'original_title' than 'title': {different_titles_count}")
print(f"Percentage of movies with different titles: {percentage_different_titles:.2f}%")

In [0]:
# Average revenue per original language, reduced to the five highest averages
language_revenue_avg = (
    df.groupBy("original_language")
      .agg(F.avg("revenue").alias("average_revenue"))
      .orderBy(col("average_revenue").desc())
      .limit(5)
)

# Small result set, so it is safe to collect into pandas for plotting
language_revenue_avg_pd = language_revenue_avg.toPandas()

# Horizontal bar chart of the five languages
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=language_revenue_avg_pd,
    x="average_revenue",
    y="original_language",
    palette="viridis",
    ax=ax,
)

ax.set_title("Top 5 Original Languages by Average Revenue", fontsize=16)
ax.set_xlabel("Average Revenue", fontsize=12)
ax.set_ylabel("Original Language", fontsize=12)
plt.xticks(rotation=45, ha='right')  # large revenue tick labels overlap unless rotated

fig.tight_layout()
plt.show()


The listed short forms of languages refer to:
- **tn**: Tswana
- **zh**: Chinese
- **ko**: Korean
- **hi**: Hindi

https://www.science.co.il/language/Codes.php

## OVERVIEW

In [0]:
# Data-quality tally for 'overview': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
overview_col = col("overview")
total = df.count()

nulls = df.where(overview_col.isNull()).count()
empties = df.where((overview_col == "") | (overview_col == " ")).count()
nones = df.where(overview_col.isin("None", "none")).count()
zeros = df.where(overview_col == "0").count()

print(f"overview: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# An overview that is empty after trimming whitespace carries no information: store it as null
blank_overview = trim(col("overview")) == ""
df = df.withColumn("overview", when(blank_overview, None).otherwise(col("overview")))

# Re-count empty / single-space overviews to confirm none are left
empties = df.where((col("overview") == "") | (col("overview") == " ")).count()
print(f"Number of empty strings left: {empties}")

In [0]:
# Working copy of df with the character length of every overview
df_length = df.withColumn("overview_length", length(col("overview")))

# Min, max, mean and approximate median of the length, computed in a single aggregation
stats = df_length.agg(
    F.min("overview_length").alias("min_length"),
    F.max("overview_length").alias("max_length"),
    F.avg("overview_length").alias("avg_length"),
    F.expr("percentile_approx(overview_length, 0.5)").alias("median_length"),
).first()

min_length = stats["min_length"]
max_length = stats["max_length"]
avg_length = stats["avg_length"]
median_length = stats["median_length"]

print(f"Shortest overview length: {min_length}")
print(f"Longest overview length : {max_length}")
print(f"Average overview length : {avg_length:.1f}")
print(f"Median overview length  : {median_length}")

# Show up to five movies at each extreme of the length range
for extreme, extreme_length in (("shortest", min_length), ("longest", max_length)):
    print(f"Examples of {extreme} overviews:")
    display(
        df_length
        .where(col("overview_length") == extreme_length)
        .select("id", "title", "overview", "overview_length")
        .limit(5)
    )


In [0]:
# Histogram of overview lengths (in characters)
overview_lengths_pd = df_length.select("overview_length").toPandas()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(overview_lengths_pd["overview_length"], bins=50, kde=False, ax=ax)
ax.set_title("Distribution of Overview Lengths")
ax.set_xlabel("Overview Length (characters)")
ax.set_ylabel("Number of Movies")
fig.tight_layout()
plt.show()

The distribution of overview lengths is right-skewed. It indicates that most overviews are around 150 characters long. Only some reach the maximum of 1000 characters.

In [0]:
# NOTE(review): in the cells above, "overview_length" was only ever added to df_length,
# not to df, so this drop looks like a no-op (DataFrame.drop ignores names that are not
# in the schema) — confirm and consider removing.
df = df.drop("overview_length")

## POPULARITY

In [0]:
# Data-quality tally for 'popularity': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
popularity_col = col("popularity")
total = df.count()

nulls = df.where(popularity_col.isNull()).count()
empties = df.where((popularity_col == "") | (popularity_col == " ")).count()
nones = df.where(popularity_col.isin("None", "none")).count()
zeros = df.where(popularity_col == "0").count()

print(f"popularity: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Store popularity as a double
popularity_as_double = col("popularity").astype("double")
df = df.withColumn("popularity", popularity_as_double)

# Confirm the new column type
df.select("popularity").printSchema()

Can **popularity** be zero? Let's display some examples of rows with **popularity** = 0:

In [0]:
# Movies whose popularity score is exactly zero
is_zero_popularity = col("popularity") == 0

zero_pop_count = df.where(is_zero_popularity).count()
print(f"Number of movies with popularity = 0: {zero_pop_count}")

# A handful of those rows, to judge whether a zero score is plausible
display(df.where(is_zero_popularity).limit(10))

In [0]:
# Min, approximate median, mean and max of popularity in one aggregation
popularity_aggregates = [
    F.min("popularity").alias("min_popularity"),
    F.expr("percentile_approx(popularity, 0.5)").alias("median_popularity"),
    F.avg("popularity").alias("average_popularity"),
    F.max("popularity").alias("max_popularity"),
]
stats = df.agg(*popularity_aggregates).first()

print(f"Min popularity     : {stats['min_popularity']}")
print(f"Median popularity  : {stats['median_popularity']}")
print(f"Average popularity : {stats['average_popularity']:.2f}")
print(f"Max popularity     : {stats['max_popularity']}")

In [0]:
# Only movies with a strictly positive popularity score are plotted
pop_filtered = df.filter(col("popularity") > 0)

# Collect just the one column needed for the histogram
pop_filtered_pd = pop_filtered.select("popularity").toPandas()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(pop_filtered_pd["popularity"], kde=True, color="skyblue", bins=30, ax=ax)

# The x-axis is cut at 300; anything beyond that is not shown
ax.set_xlim(0, 300)
ax.set_title("Distribution of Popularity", fontsize=16)
ax.set_xlabel("Popularity", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)

plt.show()

## TAGLINE

The column **tagline** has the correct data type - string. Let's investigate this feature:

In [0]:
# Data-quality tally for 'tagline': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
tagline_col = col("tagline")
total = df.count()

nulls = df.where(tagline_col.isNull()).count()
empties = df.where((tagline_col == "") | (tagline_col == " ")).count()
nones = df.where(tagline_col.isin("None", "none")).count()
zeros = df.where(tagline_col == "0").count()

print(f"tagline: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Turn the literal strings "None"/"none" in `tagline` into real nulls
# (`when` and `col` come from the notebook's import cell)
placeholder_tagline = col("tagline").isin("None", "none")

# How many placeholders exist before the cleanup
none_count = df.where(placeholder_tagline).count()
print(f"Taglines with 'None' or 'none' before cleanup: {none_count}")

# Null out the placeholders; real taglines pass through unchanged
df = df.withColumn("tagline", when(placeholder_tagline, None).otherwise(col("tagline")))

# Total nulls afterwards (pre-existing nulls plus the replaced placeholders)
post_none_count = df.where(col("tagline").isNull()).count()
print(f"Taglines null after cleanup: {post_none_count}")

In [0]:
# Length statistics (min, max, mean, approximate median) for `tagline`
# NOTE: later cells call the bare `avg` imported here, so this import must stay.
from pyspark.sql.functions import length, avg, expr, min as spark_min, max as spark_max

# Working copy of df with the character length of every tagline
df_length = df.withColumn("tagline_length", length(col("tagline")))

tagline_aggregates = [
    spark_min("tagline_length").alias("min_length"),
    spark_max("tagline_length").alias("max_length"),
    avg("tagline_length").alias("avg_length"),
    expr("percentile_approx(tagline_length, 0.5)").alias("median_length"),
]
stats = df_length.agg(*tagline_aggregates).first()

print(f"Shortest tagline: {stats['min_length']}")
print(f"Longest  tagline: {stats['max_length']}")
print(f"Average  tagline: {stats['avg_length']:.1f}")
print(f"Median   tagline: {stats['median_length']}")

In [0]:
# Histogram of tagline lengths; movies without a tagline (null length) are left out
tagline_lengths_frame = df_length.select("tagline_length").toPandas()
lengths_pd = tagline_lengths_frame["tagline_length"].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(lengths_pd, bins=50, kde=False, ax=ax)
ax.set_title("Distribution of Tagline Lengths")
ax.set_xlabel("Tagline Length (chars)")
ax.set_ylabel("Number of Movies")
fig.tight_layout()
plt.show()


In [0]:
# Mean and approximate median revenue, split by whether a movie has a tagline
tagline_missing = col("tagline").isNull().alias("tagline_null")
rev_stats = (
    df.groupBy(tagline_missing)
      .agg(
          F.avg("revenue").alias("avg_revenue"),
          F.expr("percentile_approx(revenue, 0.5)").alias("median_revenue"),
      )
      .orderBy("tagline_null")
      .toPandas()
)

# Replace the boolean group key with readable axis labels
group_labels = {True: "Null Tagline", False: "Has Tagline"}
rev_stats["tagline_null"] = rev_stats["tagline_null"].map(group_labels)

# Two bar series drawn on the same axes: average first, median on top of it
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x="tagline_null", y="avg_revenue", data=rev_stats, color="skyblue", label="Average Revenue", ci=None, ax=ax)
sns.barplot(x="tagline_null", y="median_revenue", data=rev_stats, color="orange", label="Median Revenue", ci=None, ax=ax)
ax.set_title("Revenue by Tagline Presence")
ax.set_xlabel("")
ax.set_ylabel("Revenue")
ax.legend()
fig.tight_layout()
plt.show()

## GENRES

Currently **genres** is in type string. We want to cast it to an array.

In [0]:
# Data-quality tally for 'genres': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
genres_col = col("genres")
total = df.count()

nulls = df.where(genres_col.isNull()).count()
empties = df.where((genres_col == "") | (genres_col == " ")).count()
nones = df.where(genres_col.isin("None", "none")).count()
zeros = df.where(genres_col == "0").count()

print(f"genres: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Parse the comma-separated "genres" string into array<string>
# (a comma plus any following whitespace is the separator)
genres_as_array = split(col("genres"), ",\\s*")
df = df.withColumn("genres_array", genres_as_array)

# Eyeball the raw string next to the parsed array
df.select("genres", "genres_array").show(50, truncate=False)

In [0]:
# Frequency of each genre: one row per (movie, genre) pair, then count per genre
one_genre_per_row = df.select(explode(col("genres_array")).alias("genre"))
genre_counts = (
    one_genre_per_row
    .groupBy("genre")
    .count()
    .orderBy("count", ascending=False)
)

display(genre_counts)

In [0]:
# Five genres with the highest summed revenue.
# A movie with several genres contributes its full revenue to each of them.
revenue_per_genre_row = df.select(
    col("revenue"),
    explode(col("genres_array")).alias("genre"),
)
top5_revenue = (
    revenue_per_genre_row
    .groupBy("genre")
    .agg(spark_sum("revenue").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
    .limit(5)
    .toPandas()
)

# matplotlib / seaborn are already imported in the notebook's first cell
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top5_revenue, x="total_revenue", y="genre", palette="Blues_d", ax=ax)
ax.set_title("Top 5 Genres by Total Revenue")
ax.set_xlabel("Total Revenue")
ax.set_ylabel("Genre")
fig.tight_layout()
plt.show()

In [0]:
# Five genres with the highest summed budget.
# A movie with several genres contributes its full budget to each of them.
budget_per_genre_row = df.select(
    col("budget"),
    explode(col("genres_array")).alias("genre"),
)
top5_budget = (
    budget_per_genre_row
    .groupBy("genre")
    .agg(spark_sum("budget").alias("total_budget"))
    .orderBy(col("total_budget").desc())
    .limit(5)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top5_budget, x="total_budget", y="genre", palette="Greens_d", ax=ax)
ax.set_title("Top 5 Genres by Total Budget")
ax.set_xlabel("Total Budget")
ax.set_ylabel("Genre")
fig.tight_layout()
plt.show()


In [0]:
# Promote the parsed array to be the "genres" column and discard the helper column
df = (
    df.withColumn("genres", col("genres_array"))
      .drop("genres_array")
)

# "genres" should now be reported as an array of strings
df.select("genres").printSchema()


In [0]:
# Re-count movies per genre from the final "genres" column.
# "genres" was converted to array<string> in the previous cell, so it must be exploded
# directly: calling split() on it would raise an AnalysisException because split()
# only accepts string input.

# One row per (movie, genre) pair
df_exploded = df.withColumn("genre", explode(col("genres")))

# Strip stray whitespace around genre names so identical genres group together
df_clean = df_exploded.withColumn("genre", trim(col("genre")))

# Movies per genre, most frequent first
genre_counts = df_clean.groupBy("genre").count().orderBy("count", ascending=False)

# Show the results
display(genre_counts)

## PRODUCTION COMPANIES

Currently **production_companies** is in type string. We want to cast it to an array.

In [0]:
# Data-quality tally for 'production_companies': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
companies_col = col("production_companies")
total = df.count()

nulls = df.where(companies_col.isNull()).count()
empties = df.where((companies_col == "") | (companies_col == " ")).count()
nones = df.where(companies_col.isin("None", "none")).count()
zeros = df.where(companies_col == "0").count()

print(f"production_companies: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Parse the comma-separated `production_companies` string into array<string>
# (a comma plus any following whitespace is the separator)
companies_as_array = split(col("production_companies"), ",\\s*")
df = df.withColumn("production_companies", companies_as_array)
df.select("production_companies").printSchema()

In [0]:
# Ten production companies credited on the most movies
one_company_per_row = df.select(explode(col("production_companies")).alias("company"))
company_counts = (
    one_company_per_row
    .groupBy("company")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=company_counts, x="count", y="company", palette="Greens_d", ax=ax)
ax.set_title("Top 10 Production Companies by Number of Movies")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Company")
fig.tight_layout()
plt.show()


In [0]:
# Ten production companies with the highest average revenue per credited movie
revenue_per_company_row = df.select(
    col("revenue"),
    explode(col("production_companies")).alias("company"),
)
company_revenue = (
    revenue_per_company_row
    .groupBy("company")
    .agg(F.avg("revenue").alias("avg_revenue"))
    .orderBy(col("avg_revenue").desc())
    .limit(10)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=company_revenue, x="avg_revenue", y="company", palette="Oranges_d", ax=ax)
ax.set_title("Top 10 Production Companies by Average Revenue")
ax.set_xlabel("Average Revenue")
ax.set_ylabel("Company")
fig.tight_layout()
plt.show()


## PRODUCTION COUNTRIES

Currently **production_countries** is in type string. We want to cast it to an array.

In [0]:
# Data-quality tally for 'production_countries': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
countries_col = col("production_countries")
total = df.count()

nulls = df.where(countries_col.isNull()).count()
empties = df.where((countries_col == "") | (countries_col == " ")).count()
nones = df.where(countries_col.isin("None", "none")).count()
zeros = df.where(countries_col == "0").count()

print(f"production_countries: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Parse the comma-separated `production_countries` string into array<string>
# (a comma plus any following whitespace is the separator)
countries_as_array = split(col("production_countries"), ",\\s*")
df = df.withColumn("production_countries", countries_as_array)
df.select("production_countries").printSchema()

In [0]:
# Five production countries credited on the most movies
one_country_per_row = df.select(explode(col("production_countries")).alias("country"))
country_counts = (
    one_country_per_row
    .groupBy("country")
    .count()
    .orderBy("count", ascending=False)
    .limit(5)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(data=country_counts, x="count", y="country", palette="Blues_d", ax=ax)
ax.set_title("Top 5 Production Countries by Number of Movies")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()

## SPOKEN LANGUAGES

- Currently **spoken_languages** is in type string. We want to cast it to an array.

In [0]:
# Data-quality tally for 'spoken_languages': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
spoken_col = col("spoken_languages")
total = df.count()

nulls = df.where(spoken_col.isNull()).count()
empties = df.where((spoken_col == "") | (spoken_col == " ")).count()
nones = df.where(spoken_col.isin("None", "none")).count()
zeros = df.where(spoken_col == "0").count()

print(f"spoken_languages: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Parse the comma-separated `spoken_languages` string into array<string>
# (a comma plus any following whitespace is the separator)
spoken_as_array = split(col("spoken_languages"), ",\\s*")
df = df.withColumn("spoken_languages", spoken_as_array)
df.select("spoken_languages").printSchema()


In [0]:
# Five spoken languages that appear in the most movies
one_language_per_row = df.select(explode(col("spoken_languages")).alias("language"))
lang_counts = (
    one_language_per_row
    .groupBy("language")
    .count()
    .orderBy("count", ascending=False)
    .limit(5)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(data=lang_counts, x="count", y="language", palette="Purples_d", ax=ax)
ax.set_title("Top 5 Spoken Languages by Number of Movies")
ax.set_xlabel("Number of Movies")
ax.set_ylabel("Language")
fig.tight_layout()
plt.show()

In [0]:
# Average revenue for each of the five most frequent spoken languages
top5_langs = lang_counts["language"].tolist()

revenue_per_language_row = df.select(
    col("revenue"),
    explode(col("spoken_languages")).alias("language"),
)
avg_rev_lang = (
    revenue_per_language_row
    .where(col("language").isin(top5_langs))
    .groupBy("language")
    .agg(F.avg("revenue").alias("avg_revenue"))
    .orderBy(col("avg_revenue").desc())
    .toPandas()
)

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(data=avg_rev_lang, x="avg_revenue", y="language", palette="Purples_d", ax=ax)
ax.set_title("Average Revenue by Top 5 Spoken Languages")
ax.set_xlabel("Average Revenue")
ax.set_ylabel("Language")
fig.tight_layout()
plt.show()

## KEYWORDS

- Currently **keywords** is in type string. We want to cast it to an array.

In [0]:
# Data-quality tally for 'keywords': nulls, empty/blank strings,
# literal "None"/"none" strings and "0" values
keywords_col = col("keywords")
total = df.count()

nulls = df.where(keywords_col.isNull()).count()
empties = df.where((keywords_col == "") | (keywords_col == " ")).count()
nones = df.where(keywords_col.isin("None", "none")).count()
zeros = df.where(keywords_col == "0").count()

print(f"keywords: total={total}, nulls={nulls}, empty strings={empties}, 'None'/'none' strings={nones}, zero values={zeros}")

In [0]:
# Parse the comma-separated `keywords` string into array<string>
# (a comma plus any following whitespace is the separator)
keywords_as_array = split(col("keywords"), ",\\s*")
df = df.withColumn("keywords", keywords_as_array)
df.select("keywords").printSchema()


In [0]:
# The 50 most frequent keywords across all movies
one_keyword_per_row = df.select(explode(col("keywords")).alias("keyword"))
keyword_counts = (
    one_keyword_per_row
    .groupBy("keyword")
    .count()
    .orderBy("count", ascending=False)
    .limit(50)
    .toPandas()
)

print("Top 50 Keywords by Frequency:")
display(keyword_counts)


In [0]:
# Average revenue for the ten most frequent keywords
# (keyword_counts is already sorted by frequency, so the first ten rows are the top ten)
top10_keywords = keyword_counts["keyword"].head(10).tolist()

revenue_per_keyword_row = df.select(
    col("revenue"),
    explode(col("keywords")).alias("keyword"),
)
avg_rev_kw = (
    revenue_per_keyword_row
    .where(col("keyword").isin(top10_keywords))
    .groupBy("keyword")
    .agg(F.avg("revenue").alias("avg_revenue"))
    .orderBy(col("avg_revenue").desc())
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=avg_rev_kw, x="avg_revenue", y="keyword", palette="Reds_d", ax=ax)
ax.set_title("Average Revenue by Top 10 Keywords")
ax.set_xlabel("Average Revenue")
ax.set_ylabel("Keyword")
fig.tight_layout()
plt.show()


###Create Profit and ROI to enhance our analysis

In [0]:
# Financial subset: only movies with a strictly positive budget and revenue, so the
# revenue/budget ratio is always defined (null values fail the comparison and are dropped too)
has_budget_and_revenue = (col("budget") > 0) & (col("revenue") > 0)

# profit = revenue - budget; roi = revenue / budget
df_money = (
    df.filter(has_budget_and_revenue)
      .withColumn("profit", col("revenue") - col("budget"))
      .withColumn("roi", col("revenue") / col("budget"))
)

In [0]:
# Count, mean, stddev, min and max of the derived 'profit' and 'roi' columns
df_money.describe("profit", "roi").show()


### Check only ROI > 0

In [0]:
# Step 1: Calculate profit and ROI on movies with a positive budget and revenue.
# The guard matters: revenue / budget on a zero budget yields null in legacy mode and
# raises a divide-by-zero error when Spark runs in ANSI mode. It also keeps df_money
# consistent with how it is defined when profit/roi are first created.
df_money = (
    df.filter((col("budget") > 0) & (col("revenue") > 0))
      .withColumn("profit", col("revenue") - col("budget"))
      .withColumn("roi", col("revenue") / col("budget"))
)

# Step 2: Keep only movies that made money (positive profit and ROI)
profit_roi_filtered = df_money.filter((col('profit') > 0) & (col('roi') > 0))

# Step 3: Convert to pandas DataFrame for plotting (only the column that is plotted)
profit_roi_filtered_pd = profit_roi_filtered.select('profit').toPandas()

# Step 4: Plot the distribution of profit
# (matplotlib / seaborn are already imported in the notebook's first cell)
plt.figure(figsize=(8, 5))
sns.histplot(profit_roi_filtered_pd['profit'], bins=100, color='mediumseagreen')
plt.title('Distribution of Movie Profit')
plt.xlabel('Profit (Revenue - Budget)')
plt.ylabel('Number of Movies')
# Reference line at zero profit; every plotted movie lies to its right
plt.axvline(0, color='red', linestyle='--', label='Break-even')
plt.legend()
plt.tight_layout()
plt.show()

###Movies with the highest profit

In [0]:
# Ten most profitable movies among those with positive profit and ROI
top_profit = profit_roi_filtered.orderBy(col("profit").desc()).limit(10)

# Keep only the columns needed to read the ranking
top_profit_selected = top_profit.select("title", "revenue", "budget", "profit")

# Ten rows only, so collecting to pandas is cheap
top_profit_selected_pd = top_profit_selected.toPandas()

# Print the ranking as a plain-text table
print(top_profit_selected_pd)

###Movies with the highest ROI

In [0]:
# Ten movies with the highest revenue-to-budget ratio among those with positive profit and ROI
top_roi = profit_roi_filtered.orderBy(col("roi").desc()).limit(10)

# Keep only the columns needed to read the ranking
top_roi_selected = top_roi.select("title", "budget", "revenue", "roi")

# Ten rows only, so collecting to pandas is cheap
top_roi_selected_pd = top_roi_selected.toPandas()

# Print the ranking as a plain-text table
print(top_roi_selected_pd)


##Correlation Matrix for Popularity, Vote_Count and Revenue

In [0]:
# Pull the three numeric columns of interest into pandas
correlation_columns = ["popularity", "vote_count", "revenue"]
df_pd = df.select(*correlation_columns).toPandas()

# Pairwise correlation matrix (pandas default method)
correlation_popularity = df_pd.corr()

# Print the matrix under a short caption
print("Correlation between Popularity, Vote Count, and Revenue:")
print(correlation_popularity)


Vote count appears to have the strongest relationship with revenue.

Both popularity and vote count are positively correlated with revenue, but the relationship with popularity is weaker compared to the relationship between vote count and revenue.

# Export the dataset

In [0]:
# The CSV writer cannot serialise array<string> columns, and several columns (genres,
# production_companies, production_countries, spoken_languages, keywords) were converted
# to arrays during cleaning. Join them back into ", "-separated strings before exporting.
array_columns = [name for name, dtype in df.dtypes if dtype == "array<string>"]

df_export = df
for name in array_columns:
    # concat_ws would turn a null array into ""; the when() keeps missing values null
    df_export = df_export.withColumn(
        name,
        F.when(col(name).isNotNull(), F.concat_ws(", ", col(name)))
    )

# coalesce to 1 file (optional), write with header
df_export.coalesce(1) \
  .write \
  .mode("overwrite") \
  .option("header", "true") \
  .csv("/FileStore/tables/IMDB_Cleaned")