# **Part A**


**Question 1.**

In [23]:
# Install MRJob library
!pip install MRJob



In [24]:
 # Save the script to a file for MRJob to run
%%file hw1_mrA1_212536924_322836180.py

# This MRJob script processes TV show data to find shows that aired between 13:30 and 16:29,
# meet genre and title conditions, and counts the number of unique air dates for each title.

from mrjob.job import MRJob
from mrjob.step import MRStep
from datetime import time
import re
import csv
from io import StringIO

# Pattern matching runs of word characters and apostrophes.
# NOTE(review): nothing in the visible script references WORD_RE — confirm it
# is still needed before keeping it.
WORD_RE = re.compile(r"[\w']+")

class MRBigBrotherPrograms(MRJob):
    """Count, per qualifying show, the distinct dates it aired in the afternoon.

    An airing is kept only when all four conditions hold:

    1. its air time lies between ``START_TIME`` and ``END_TIME`` (inclusive);
    2. its genre field mentions at least one of ``GOOD_GENRES``;
    3. its title contains at least two different ``GOOD_LETTERS``;
    4. its title contains none of ``BAD_LETTERS``.

    Final output records look like
    ``[title, matched_genre, ...]  ->  [unique_air_dates, total_genre_count]``.
    """

    # Constants:
    START_TIME = time(13, 30, 0)
    END_TIME = time(16, 29, 59)
    GOOD_GENRES = ['Reality', 'Community', 'Adventure', 'Animated']
    GOOD_LETTERS = ['p', 'w', 'm']
    BAD_LETTERS = ['a', 'b']

    # Column positions inside one parsed CSV row
    SHOW_TITLE_INDEX = 0
    GENRE_INDEX = 2
    AIR_DATE_INDEX = 3
    AIR_TIME_INDEX = 4

    # Counter group used to report records that had to be skipped
    COUNTER_GROUP = 'skipped_records'

    def steps(self):
        """Return the two-step pipeline: row filtering, then time filter + count."""
        return [
            # Step 1: Fulfill conditions 2,3,4
            MRStep(mapper=self.mapper_conditions_234),

            # Step 2: Fulfill condition 1 and format result
            MRStep(mapper=self.mapper_filter_irrelevant_dates,
                   reducer=self.reducer_unique_dates),
        ]

    def mapper_conditions_234(self, _, line):
        """Keep the rows of ``line`` that satisfy conditions 2, 3 and 4.

        Args:
            _: Ignored input key.
            line: One raw line of the CSV input.

        Yields:
            ``title, (air_date, air_time, [title, *matched_genres], genre_count)``
            where ``genre_count`` is the number of comma-separated entries in
            the genre field.
        """
        # A row must be at least this long for every index below to exist.
        min_columns = 1 + max(self.SHOW_TITLE_INDEX, self.GENRE_INDEX,
                              self.AIR_DATE_INDEX, self.AIR_TIME_INDEX)

        # csv.reader is used (rather than str.split) so quoted fields that
        # contain commas are parsed as a single column.
        for row in csv.reader(StringIO(line)):
            if len(row) < min_columns:
                # Truncated / malformed line: count it instead of crashing
                # the whole job with an IndexError.
                self.increment_counter(self.COUNTER_GROUP, 'short_row', 1)
                continue

            # Skip header row
            if row[self.AIR_TIME_INDEX] == "air_time":
                continue

            title = row[self.SHOW_TITLE_INDEX]
            genre_field = row[self.GENRE_INDEX]
            title_lower = title.lower()  # computed once for conditions 3 and 4

            # Condition 2: at least one favourite genre.
            # NOTE(review): this is a substring test on the raw genre field,
            # not an exact match on each comma-separated entry — confirm that
            # is the intended meaning of "contains".
            matched_genres = [g for g in self.GOOD_GENRES if g in genre_field]
            if not matched_genres:
                continue

            # Condition 3: at least two different good letters in the title.
            good_hits = sum(letter in title_lower for letter in self.GOOD_LETTERS)
            if good_hits < 2:
                continue

            # Condition 4: no bad letter anywhere in the title.
            if any(letter in title_lower for letter in self.BAD_LETTERS):
                continue

            total_genre_count = len(genre_field.split(','))

            yield title, (
                row[self.AIR_DATE_INDEX],
                row[self.AIR_TIME_INDEX],
                [title] + matched_genres,
                total_genre_count,
            )

    def mapper_filter_irrelevant_dates(self, title, values):
        """Drop airings whose time is outside ``START_TIME``..``END_TIME``.

        Args:
            title: Show title (key emitted by ``mapper_conditions_234``).
            values: ``(air_date, air_time, [title, *genres], genre_count)``;
                ``air_time`` is an ``HHMMSS`` number stored as text.

        Yields:
            ``title, (air_date, [title, *genres], genre_count)`` for airings
            inside the window.
        """
        date, air_time_field, good_genre_list, genre_count = values

        try:
            air_time_raw = int(air_time_field)
            air_time = time(air_time_raw // 10000,
                            (air_time_raw % 10000) // 100,
                            air_time_raw % 100)
        except ValueError:
            # Non-numeric or out-of-range time: it cannot be inside the
            # window, so count it and move on rather than abort the job.
            self.increment_counter(self.COUNTER_GROUP, 'bad_air_time', 1)
            return

        if self.START_TIME <= air_time <= self.END_TIME:
            # Set title as key so we can count dates after
            yield title, (date, good_genre_list, genre_count)

    def reducer_unique_dates(self, title, values):
        """Count the distinct air dates of one title.

        Args:
            title: Show title shared by all ``values``.
            values: Iterable of ``(air_date, [title, *genres], genre_count)``.

        Yields:
            ``[title, *genres], (unique_date_count, genre_count)``. The genre
            list and genre count are taken from the last value seen.
        """
        dates = set()
        output_key = None
        last_genre_count = None

        for air_date, good_genre_list, genre_count in values:
            dates.add(air_date)
            output_key = good_genre_list
            last_genre_count = genre_count

        if output_key is None:
            # No values at all: nothing to report for this key.
            return

        yield output_key, (len(dates), last_genre_count)


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
  MRBigBrotherPrograms.run()

Overwriting hw1_mrA1_212536924_322836180.py


In [25]:
# Run the saved part A1 job script on the dataset
!python hw1_mrA1_212536924_322836180.py "440k_data.csv" -q

["600 Pound Mom", "Reality"]	[1, 3]
["Computerwise", "Community"]	[2, 1]
["El Show de Tom y Jerry", "Animated"]	[1, 3]
["Empowered: Keys to Unlocking", "Community"]	[3, 2]
["Extreme Couponing", "Reality"]	[4, 1]
["Jiggijump", "Adventure"]	[20, 3]
["KUOW's Week In Review Summer Tour", "Community"]	[1, 1]
["Lord of the Rings: Fellowship of Ring", "Adventure"]	[1, 2]
["Love & Hip Hop: Hollywood", "Reality"]	[4, 1]
["Missouri Viewpoints", "Community"]	[1, 1]
["My Time With Jesus", "Animated"]	[3, 2]
["New Mexico True TV", "Community"]	[1, 1]
["Pok & Mok", "Animated"]	[1, 3]
["Pok\u00e9mon 4Ever", "Adventure", "Animated"]	[1, 3]
["Pok\u00e9mon: XY", "Animated"]	[16, 4]
["Pompeii", "Adventure"]	[6, 3]
["Semper Ride", "Community"]	[1, 1]
["Shipwrecked", "Adventure"]	[1, 1]
["Super Why!", "Animated"]	[119, 3]
["Super Wings", "Adventure", "Animated"]	[14, 4]
["Swim Week", "Community"]	[4, 3]
["The Powerpuff Girls", "Adventure", "Animated"]	[365, 4]
["The Sylvester & Tweety Mysteries", "Animated

**Question 2.**

In [26]:
# Install MRJob library
!pip install MRJob



In [27]:
# Save MRJob part A2 script to file
%%file hw1_mrA2_212536924_322836180.py

# Extends part A1 by adding a third step:
# After filtering and counting unique air dates, it selects the best program
# based on a custom score: (number of unique air dates + number of genres).

from mrjob.job import MRJob
from mrjob.step import MRStep
from datetime import time
import re
import csv
from io import StringIO

# Pattern matching runs of word characters and apostrophes.
# NOTE(review): nothing in the visible script references WORD_RE — confirm it
# is still needed before keeping it.
WORD_RE = re.compile(r"[\w']+")

class MRBigBrotherPrograms(MRJob):
    """Pick the single best qualifying show.

    An airing is kept only when all four conditions hold:

    1. its air time lies between ``START_TIME`` and ``END_TIME`` (inclusive);
    2. its genre field mentions at least one of ``GOOD_GENRES``;
    3. its title contains at least two different ``GOOD_LETTERS``;
    4. its title contains none of ``BAD_LETTERS``.

    Each remaining title is scored as
    ``unique air dates + number of genres`` and the highest-scoring title is
    emitted as ``title -> score``.
    """

    # Constants:
    START_TIME = time(13, 30, 0)
    END_TIME = time(16, 29, 59)
    GOOD_GENRES = ['Reality', 'Community', 'Adventure', 'Animated']
    GOOD_LETTERS = ['p', 'w', 'm']
    BAD_LETTERS = ['a', 'b']

    # Column positions inside one parsed CSV row
    SHOW_TITLE_INDEX = 0
    GENRE_INDEX = 2
    AIR_DATE_INDEX = 3
    AIR_TIME_INDEX = 4

    # Counter group used to report records that had to be skipped
    COUNTER_GROUP = 'skipped_records'

    def steps(self):
        """Return the three-step pipeline: filter rows, count dates, pick best."""
        return [
            # Step 1: Fulfill conditions 2,3,4
            MRStep(mapper=self.mapper_conditions_234),

            # Step 2: Fulfill condition 1 and format result
            MRStep(mapper=self.mapper_filter_irrelevant_dates,
                   reducer=self.reducer_unique_dates),

            # Step 3: Score every title and keep the best one
            MRStep(mapper=self.mapper_title_value,
                   reducer=self.reducer_best_program),
        ]

    def mapper_conditions_234(self, _, line):
        """Keep the rows of ``line`` that satisfy conditions 2, 3 and 4.

        Args:
            _: Ignored input key.
            line: One raw line of the CSV input.

        Yields:
            ``title, (air_date, air_time, [title, *matched_genres], genre_count)``
            where ``genre_count`` is the number of comma-separated entries in
            the genre field.
        """
        # A row must be at least this long for every index below to exist.
        min_columns = 1 + max(self.SHOW_TITLE_INDEX, self.GENRE_INDEX,
                              self.AIR_DATE_INDEX, self.AIR_TIME_INDEX)

        # csv.reader is used (rather than str.split) so quoted fields that
        # contain commas are parsed as a single column.
        for row in csv.reader(StringIO(line)):
            if len(row) < min_columns:
                # Truncated / malformed line: count it instead of crashing
                # the whole job with an IndexError.
                self.increment_counter(self.COUNTER_GROUP, 'short_row', 1)
                continue

            # Skip header row
            if row[self.AIR_TIME_INDEX] == "air_time":
                continue

            title = row[self.SHOW_TITLE_INDEX]
            genre_field = row[self.GENRE_INDEX]
            title_lower = title.lower()  # computed once for conditions 3 and 4

            # Condition 2: at least one favourite genre.
            # NOTE(review): this is a substring test on the raw genre field,
            # not an exact match on each comma-separated entry — confirm that
            # is the intended meaning of "contains".
            matched_genres = [g for g in self.GOOD_GENRES if g in genre_field]
            if not matched_genres:
                continue

            # Condition 3: at least two different good letters in the title.
            good_hits = sum(letter in title_lower for letter in self.GOOD_LETTERS)
            if good_hits < 2:
                continue

            # Condition 4: no bad letter anywhere in the title.
            if any(letter in title_lower for letter in self.BAD_LETTERS):
                continue

            total_genre_count = len(genre_field.split(','))

            yield title, (
                row[self.AIR_DATE_INDEX],
                row[self.AIR_TIME_INDEX],
                [title] + matched_genres,
                total_genre_count,
            )

    def mapper_filter_irrelevant_dates(self, title, values):
        """Drop airings whose time is outside ``START_TIME``..``END_TIME``.

        Args:
            title: Show title (key emitted by ``mapper_conditions_234``).
            values: ``(air_date, air_time, [title, *genres], genre_count)``;
                ``air_time`` is an ``HHMMSS`` number stored as text.

        Yields:
            ``title, (air_date, [title, *genres], genre_count)`` for airings
            inside the window.
        """
        date, air_time_field, good_genre_list, genre_count = values

        try:
            air_time_raw = int(air_time_field)
            air_time = time(air_time_raw // 10000,
                            (air_time_raw % 10000) // 100,
                            air_time_raw % 100)
        except ValueError:
            # Non-numeric or out-of-range time: it cannot be inside the
            # window, so count it and move on rather than abort the job.
            self.increment_counter(self.COUNTER_GROUP, 'bad_air_time', 1)
            return

        if self.START_TIME <= air_time <= self.END_TIME:
            # Set title as key so we can count dates after
            yield title, (date, good_genre_list, genre_count)

    def reducer_unique_dates(self, title, values):
        """Count the distinct air dates of one title.

        Args:
            title: Show title shared by all ``values``.
            values: Iterable of ``(air_date, [title, *genres], genre_count)``.

        Yields:
            ``[title, *genres], (unique_date_count, genre_count)``. The genre
            list and genre count are taken from the last value seen.
        """
        dates = set()
        output_key = None
        last_genre_count = None

        for air_date, good_genre_list, genre_count in values:
            dates.add(air_date)
            output_key = good_genre_list
            last_genre_count = genre_count

        if output_key is None:
            # No values at all: nothing to report for this key.
            return

        yield output_key, (len(dates), last_genre_count)

    def mapper_title_value(self, key, value):
        """Score one title and route it to a single shared reducer key.

        Args:
            key: ``[title, *genres]`` as emitted by ``reducer_unique_dates``.
            value: ``(unique_date_count, genre_count)``.

        Yields:
            ``None, (title, unique_date_count + genre_count)``; the ``None``
            key sends every candidate to the same reducer call.
        """
        title = key[0]
        unique_dates, genre_count = value
        yield None, (title, unique_dates + genre_count)

    def reducer_best_program(self, _, title_value_pairs):
        """Emit the title with the highest score.

        Args:
            _: Ignored (always ``None``).
            title_value_pairs: Iterable of ``(title, score)``.

        Yields:
            ``title, score`` of the best program. On a tie, the first
            maximal pair in iteration order wins.
        """
        best_title, best_score = max(title_value_pairs, key=lambda pair: pair[1])
        # Yield an explicit key/value pair instead of relying on the framework
        # unpacking a bare 2-element sequence.
        yield best_title, best_score


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
  MRBigBrotherPrograms.run()

Overwriting hw1_mrA2_212536924_322836180.py


In [28]:
# Run and save py file
!python hw1_mrA1_212536924_322836180.py "440k_data.csv" -q

["600 Pound Mom", "Reality"]	[1, 3]
["Computerwise", "Community"]	[2, 1]
["El Show de Tom y Jerry", "Animated"]	[1, 3]
["Empowered: Keys to Unlocking", "Community"]	[3, 2]
["Extreme Couponing", "Reality"]	[4, 1]
["Jiggijump", "Adventure"]	[20, 3]
["KUOW's Week In Review Summer Tour", "Community"]	[1, 1]
["Lord of the Rings: Fellowship of Ring", "Adventure"]	[1, 2]
["Love & Hip Hop: Hollywood", "Reality"]	[4, 1]
["Missouri Viewpoints", "Community"]	[1, 1]
["My Time With Jesus", "Animated"]	[3, 2]
["New Mexico True TV", "Community"]	[1, 1]
["Pok & Mok", "Animated"]	[1, 3]
["Pok\u00e9mon 4Ever", "Adventure", "Animated"]	[1, 3]
["Pok\u00e9mon: XY", "Animated"]	[16, 4]
["Pompeii", "Adventure"]	[6, 3]
["Semper Ride", "Community"]	[1, 1]
["Shipwrecked", "Adventure"]	[1, 1]
["Super Why!", "Animated"]	[119, 3]
["Super Wings", "Adventure", "Animated"]	[14, 4]
["Swim Week", "Community"]	[4, 3]
["The Powerpuff Girls", "Adventure", "Animated"]	[365, 4]
["The Sylvester & Tweety Mysteries", "Animated

# **Part B**

In [29]:
# Import required modules for date handling, CSV processing, and Spark DataFrame operations
from datetime import datetime, time, timedelta

import csv

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col, floor, lower, split, size, array_contains,
    to_date, expr, when, sum as _sum
)

In [30]:
# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName('HW1')
    .getOrCreate()
)
sc = spark.sparkContext

# Only surface error-level log messages
sc.setLogLevel("ERROR")

# Turn on eager evaluation of DataFrames in the REPL
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [31]:
# Defines a function to read a Big Brother CSV file into a Spark DataFrame using a predefined schema.

def read_from_file(filename):
    """Load the Big Brother CSV at ``filename`` into a Spark DataFrame.

    The file is read with a header row and a fixed schema: every column is
    a nullable string except ``air_time``, which is a nullable integer.
    """
    column_types = [
        ("title", StringType()),
        ("prog_code", StringType()),
        ("genre", StringType()),
        ("air_date", StringType()),
        ("air_time", IntegerType()),
        ("Duration", StringType()),
    ]
    schema = StructType(
        [StructField(name, dtype, True) for name, dtype in column_types]
    )

    reader = spark.read.format("csv").option("header", "true").schema(schema)
    return reader.load(filename)

In [32]:
# Filters out unwanted programs and scores the rest based on genre, duration, and title,
# returning a ranked list by total score.

# Filter out programs with at least one airing on Thursday between 13:30–15:30
def filter_thursday(df):
    """Remove every title that has a Thursday airing touching 13:30-15:30.

    An airing touches the window when it starts no later than 15:30 and
    ends (start + ``Duration`` minutes) no earlier than 13:30. All rows of
    such a title are removed, not just the offending airing.

    Returns the input rows of the surviving titles, with an extra
    ``air_date_parsed`` column.
    """
    # Window bounds, in minutes after midnight
    window_open = 13 * 60 + 30
    window_close = 15 * 60 + 30

    dated = df.withColumn("air_date_parsed", to_date(col("air_date"), "yyyyMMdd"))

    # air_time is decoded as HHMMSS; only hours and minutes are used
    hours = floor(col("air_time") / 10000)
    minutes = floor((col("air_time") % 10000) / 100)
    starts_at = hours * 60 + minutes
    ends_at = starts_at + col("Duration")

    # Spark's DAYOFWEEK numbering: 1 = Sunday, ..., 5 = Thursday
    is_thursday = expr("EXTRACT(DAYOFWEEK FROM air_date_parsed) = 5")
    touches_window = (starts_at <= window_close) & (ends_at >= window_open)

    # Titles with at least one offending airing
    offenders = (
        dated.filter(is_thursday)
             .filter(touches_window)
             .select("title")
             .distinct()
    )

    # The anti join keeps only rows whose title is NOT among the offenders
    return dated.join(offenders, on="title", how="left_anti")

# Filter out programs whose title contains certain keywords
def filter_names(df):
    """Drop rows whose title contains any banned keyword, ignoring case."""
    banned_words = ['friends', 'bang', 'breaking', 'montana', 'doctor', 'fox', 'news']

    # A temporary lowercase copy of the title makes the match case-insensitive
    with_lower = df.withColumn("title_lower", lower(col("title")))

    # OR together one "contains" test per banned word
    first_word, *other_words = banned_words
    is_banned = col("title_lower").contains(first_word)
    for word in other_words:
        is_banned = is_banned | col("title_lower").contains(word)

    # Keep the non-matching rows and discard the helper column
    return with_lower.filter(~is_banned).drop("title_lower")

# Wrapper function to apply all filtering steps
def filter_df(df):
    """Apply the Thursday-window filter, then the title-keyword filter."""
    return filter_names(filter_thursday(df))

# Compute total scores per program based on Big Data Brother's preferences
def add_scores(df):
    """Score every airing, then total the scores per (title, genre).

    Per-airing score:
      * +10 when the genre field holds exactly one genre
      * +90 when the genres include 'Adventure' or 'Animated'
      * + Duration / 5
      * +100 when the title contains 'girls' (case-insensitive)

    Returns a DataFrame with columns ``title``, ``genre`` and ``total_score``,
    ordered by ``total_score`` descending.
    """
    # The genre field is a comma-separated list
    genres = split(col("genre"), ",\\s*")

    likes_genre = array_contains(genres, "Adventure") | array_contains(genres, "Animated")
    title_has_girls = lower(col("title")).contains("girls")

    per_airing_score = (
        when(size(genres) == 1, 10).otherwise(0)
        + when(likes_genre, 90).otherwise(0)
        + col("Duration") / 5
        + when(title_has_girls, 100).otherwise(0)
    )

    totals = (
        df.withColumn("score", per_airing_score)
          .groupBy("title", "genre")
          .agg(_sum("score").alias("total_score"))
    )

    return totals.orderBy(col("total_score").desc())


In [33]:
# Load, clean, filter, score, and display the top 20 programs based on custom rules.

# Read the raw airing records into a Spark DataFrame
df = read_from_file("440k_data.csv")

# Normalise column types in one chained pass:
# - air_date: 'yyyyMMdd' string -> Spark DateType
# - Duration: cast to float first, then to int (handles values read as
#   strings like "10.0")
df = (
    df.withColumn("air_date", to_date(col("air_date"), "yyyyMMdd"))
      .withColumn("Duration", col("Duration").cast("float").cast("int"))
)

# Filtering:
# - drop shows with a Thursday airing between 13:30 and 15:30
# - drop titles containing any unwanted keyword
new_df = filter_df(df)

# Scoring of the remaining airings, based on genre count, favourite genres,
# duration and the title keyword "girls"
scored_df = add_scores(new_df)

# Print the 20 best-scoring programs without truncating column values
scored_df.show(20, truncate=False)

+-----------------------+-----------------------------------------+------------------+
|title                  |genre                                    |total_score       |
+-----------------------+-----------------------------------------+------------------+
|The Simpsons           |Sitcom,Animated                          |74880.4           |
|2 Broke Girls          |Sitcom                                   |41762.6           |
|Up to the Minute       |News                                     |23767.200000000103|
|Futurama               |Sitcom,Science fiction,Animated          |22672.2           |
|Mike & Molly           |Sitcom                                   |19903.799999999996|
|The Fairly OddParents  |Children,Comedy,Animated                 |19434.0           |
|Globe Trekker          |Travel,Adventure                         |17848.0           |
|Modern Family          |Sitcom                                   |17040.800000000003|
|Peppa Pig              |Children,Adventure