#Install libraries

In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j


[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.83)] [[0m[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.83)] [[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.[0m                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Ign:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois

# Pyspark imports and context setup


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, to_date, year, expr

In [6]:
# Create spark session
spark = SparkSession.builder.appName("Movies-Activity").getOrCreate()

# Read json file
df_movies = spark.read.option("inferSchema", "true").json("/content/movies.json")
df_movies.printSchema()

root
 |-- Creative_Type: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- IMDB_Rating: double (nullable = true)
 |-- IMDB_Votes: long (nullable = true)
 |-- MPAA_Rating: string (nullable = true)
 |-- Major_Genre: string (nullable = true)
 |-- Production_Budget: long (nullable = true)
 |-- Release_Date: string (nullable = true)
 |-- Rotten_Tomatoes_Rating: long (nullable = true)
 |-- Running_Time_min: long (nullable = true)
 |-- Source: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- US_DVD_Sales: long (nullable = true)
 |-- US_Gross: long (nullable = true)
 |-- Worldwide_Gross: long (nullable = true)



# Profit Margin

In [10]:
df_ProfitMargin = df_movies.select("Title", "Worldwide_Gross", "Production_Budget")
df_ProfitMargin = df_ProfitMargin.withColumn('Profit Margin', ((df_movies['Worldwide_Gross'] - df_movies['Production_Budget']) / df_movies['Production_Budget']))
df_ProfitMargin.show(n=10)

+--------------------+---------------+-----------------+-------------------+
|               Title|Worldwide_Gross|Production_Budget|      Profit Margin|
+--------------------+---------------+-----------------+-------------------+
|      The Land Girls|         146083|          8000000|       -0.981739625|
|First Love, Last ...|          10876|           300000|-0.9637466666666666|
|I Married a Stran...|         203134|           250000|          -0.187464|
|Let's Talk About Sex|         373615|           300000|0.24538333333333334|
|                Slam|        1087521|          1000000|           0.087521|
| Mississippi Mermaid|        2624551|          1600000|        0.640344375|
|           Following|          44705|             6000|  6.450833333333334|
|             Foolish|        6026908|          1600000|          2.7668175|
|             Pirates|        6341825|         40000000|       -0.841454375|
|     Duel in the Sun|       20400000|          6000000|                2.4|

#Determine if a Movie is a Box Office Hit

In [12]:
df_BoxOfficeHit = df_movies.select("Title", "Worldwide_Gross", "Production_Budget")
df_BoxOfficeHit = df_BoxOfficeHit.withColumn('Box_Office_hit',when(df_movies['Worldwide_Gross'] > (2 * df_movies['Production_Budget']), "Hit").otherwise("Flop"))
df_BoxOfficeHit.show(n=10)

+--------------------+---------------+-----------------+---------------+
|               Title|Worldwide_Gross|Production_Budget|Box_Office_hit?|
+--------------------+---------------+-----------------+---------------+
|      The Land Girls|         146083|          8000000|           Flop|
|First Love, Last ...|          10876|           300000|           Flop|
|I Married a Stran...|         203134|           250000|           Flop|
|Let's Talk About Sex|         373615|           300000|           Flop|
|                Slam|        1087521|          1000000|           Flop|
| Mississippi Mermaid|        2624551|          1600000|           Flop|
|           Following|          44705|             6000|            Hit|
|             Foolish|        6026908|          1600000|            Hit|
|             Pirates|        6341825|         40000000|           Flop|
|     Duel in the Sun|       20400000|          6000000|            Hit|
+--------------------+---------------+-------------

#Convert Release Date to a Standard Format

In [14]:
df_date = df_movies.select("Title", "Release_Date")
df_date = df_date.withColumn(
    "Release_Date",
    when(df_movies["Release_Date"].rlike("^\d{1,2}-[A-Za-z]{3}-\d{2}$"), to_date(df_movies["Release_Date"], "d-MMM-yy"))
    .when(df_movies["Release_Date"].rlike("^\d{4}-\d{2}-\d{2}$"), to_date(df_movies["Release_Date"], "yyyy-MM-dd"))
    .when(df_movies["Release_Date"].rlike("^[A-Za-z]+, \d{4}$"), to_date(df_movies["Release_Date"], "MMMM, yyyy"))
    .otherwise(None)
)
df_date = df_date.withColumn("Release_Year", year(df_date["Release_Date"]))
df_date.show(n=10)

+--------------------+------------+------------+
|               Title|Release_Date|Release_Year|
+--------------------+------------+------------+
|      The Land Girls|  2098-06-12|        2098|
|First Love, Last ...|  2098-08-07|        2098|
|I Married a Stran...|  2098-08-28|        2098|
|Let's Talk About Sex|  2098-09-11|        2098|
|                Slam|  2098-10-09|        2098|
| Mississippi Mermaid|  2099-01-15|        2099|
|           Following|  2099-04-04|        2099|
|             Foolish|  2099-04-09|        2099|
|             Pirates|  2086-07-01|        2086|
|     Duel in the Sun|  2046-12-31|        2046|
+--------------------+------------+------------+
only showing top 10 rows



#Create a new column: IMDB Rating Category

In [15]:
df_IMDB = df_movies.select("Title", "IMDB_Rating")
df_IMDB = df_IMDB.withColumn('IMDB_Simple_Rating', when(df_movies['IMDB_Rating'] >= 7.0 , "High").when(df_movies['IMDB_Rating'] >= 5.0 , "Medium").otherwise("Low"))
df_IMDB.show(n=10)


+--------------------+-----------+------------------+
|               Title|IMDB_Rating|IMDB_Simple_Rating|
+--------------------+-----------+------------------+
|      The Land Girls|        6.1|            Medium|
|First Love, Last ...|        6.9|            Medium|
|I Married a Stran...|        6.8|            Medium|
|Let's Talk About Sex|       NULL|               Low|
|                Slam|        3.4|               Low|
| Mississippi Mermaid|       NULL|               Low|
|           Following|        7.7|              High|
|             Foolish|        3.8|               Low|
|             Pirates|        5.8|            Medium|
|     Duel in the Sun|        7.0|              High|
+--------------------+-----------+------------------+
only showing top 10 rows



#Calculate Average IMDB Rating for Each Distributor

In [18]:
df_Distibutor = df_movies.select("Title", "IMDB_Rating", "Distributor")
df_Distibutor = df_Distibutor.groupby('Distributor').agg(avg(col('IMDB_Rating')).alias('IMDB_AVG_Rating'))
df_Distibutor.show(n=10)

+--------------------+------------------+
|         Distributor|   IMDB_AVG_Rating|
+--------------------+------------------+
|Oscilloscope Pict...|               6.2|
|               Savoy| 6.533333333333334|
|             Embassy|              NULL|
|         Fader Films|               6.5|
|       October Films|               6.4|
|              Strand| 6.322222222222223|
|             Trimark| 5.085714285714286|
|              Matson|               7.3|
|      Cinema Service|               7.1|
| Weinstein/Dimension|5.7250000000000005|
+--------------------+------------------+
only showing top 10 rows



#Stop spark

In [19]:
spark.stop()