# Neon Films Audience Preference Analysis

## Data Collection

In [38]:
import os
import time

import requests

# OMDb API key. Prefer the OMDB_API_KEY environment variable so the credential
# does not have to live in the notebook.
# FIXME: the literal fallback below is a committed credential kept only so the
# notebook still runs without configuration; rotate the key and remove it.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY", "c0c5b16c")

# fetch movie data from omdb
# Fields copied from a successful OMDb response, in output-column order.
_OMDB_FIELDS = (
    "Title", "Year", "Rated", "Runtime", "imdbRating", "imdbVotes",
    "BoxOffice", "Released", "Genre", "Director", "Writer", "Actors",
    "Plot", "Language", "Country", "Awards", "Poster", "Metascore", "Ratings",
)


def fetch_omdb_data(title, timeout=10):
    """Look up a movie by title on the OMDb API.

    Args:
        title: Movie title to search for (sent as the ``t`` query parameter).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        On success, a dict with one entry per name in ``_OMDB_FIELDS``
        (missing fields default to ``""``). On any failure, a dict with a
        single ``"error"`` key describing what went wrong; this function does
        not raise for network or API errors.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
    }
    # Let requests build the query string so titles containing '&', '#', '+',
    # spaces or non-ASCII characters are percent-encoded correctly.
    params = {"t": title, "apikey": OMDB_API_KEY}
    try:
        # request data from the OMDb API; the timeout keeps one stalled
        # connection from hanging the whole fetch loop
        response = requests.get(
            "http://www.omdbapi.com/", params=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    # ValueError covers a non-JSON body on requests versions where the JSON
    # decode error is not a RequestException subclass
    except (requests.RequestException, ValueError) as e:
        return {"error": f"Request failed: {str(e)}"}

    # OMDb reports lookup failures in the payload via Response != "True"
    if data.get("Response") != "True":
        return {"error": data.get("Error", "Unknown error occurred")}
    return {field: data.get(field, "") for field in _OMDB_FIELDS}

In [39]:
from trendspy import Trends
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Shared trendspy client used by fetch_googletrends_data below.
# NOTE(review): request_delay is presumably the pause in seconds between
# requests, used to avoid rate limiting; confirm against the trendspy docs.
tr = Trends(request_delay=10.0)

# fetch movie data from google trends
def fetch_googletrends_data(movie):
    """Sum Google Trends interest for a movie over the year after its release.

    Args:
        movie: Mapping (dict or DataFrame row) with a ``"Title"`` string and a
            ``"Released"`` date string in ``"%d %b %Y"`` form, e.g.
            ``"08 Nov 2019"``.

    Returns:
        The sum of the interest-over-time column for the title, or ``None``
        when the release date cannot be parsed, no data is returned, or the
        lookup raises.
    """
    # colons are stripped from the title before it is used as the search term
    title = movie["Title"].replace(":", "")
    release_date = movie["Released"]

    # A missing or malformed date (e.g. "N/A" or NaN) must not abort the
    # caller's loop, so treat it like any other failed lookup.
    try:
        release_datetime = datetime.strptime(release_date, "%d %b %Y")
    except (TypeError, ValueError):
        print(f"Error fetching data for '{title}': unparseable release date {release_date!r}")
        return None

    # the window is anchored on the date one year after release ...
    one_year_later = release_datetime + relativedelta(years=1)
    one_year_later_strftime = one_year_later.strftime("%Y-%m-%d")

    # ... and expressed as "<date> 1-y"
    # NOTE(review): presumably trendspy reads this as the one year ending on
    # that date; confirm against the trendspy timeframe documentation.
    timeframe = f'{one_year_later_strftime} 1-y'

    try:
        interest_data = tr.interest_over_time(title, timeframe=timeframe)
        if interest_data.empty:
            return None
        # total interest across the timeframe
        return interest_data[title].sum()
    except Exception as e:
        print(f"Error fetching data for '{title}': {e}")
        return None

In [40]:
import pandas as pd
from tqdm import tqdm
import time

# Load the list of movies to look up. The fetch loop below reads the "title"
# column of this sheet and uses the row index as the movie ID.
movies_path = "data/movies.xlsx"
movies_list = pd.read_excel(movies_path)

# In-memory caches of fetched results, keyed by movie ID, so re-running the
# fetch cells does not repeat requests. Re-running THIS cell resets both.
omdb_cache = {}
googletrends_cache = {}

In [41]:
# Collect one OMDb record per movie. Failed lookups (records carrying an
# "error" key) are reported and skipped, so they are neither cached nor turned
# into rows of omdb_df.
omdb_data_list = []
for row_index, row in tqdm(movies_list.iterrows(), total=len(movies_list), desc="Fetching OMDB Data"):
    title = row["title"]
    movie_id = row_index
    omdb_data = omdb_cache.get(movie_id)
    # fetch when nothing is cached, or when a stale cache entry holds a failure
    if omdb_data is None or "error" in omdb_data:
        omdb_data = fetch_omdb_data(title)
        # pause after each real request to stay gentle on the API
        time.sleep(1)
    if not omdb_data or "error" in omdb_data:
        omdb_cache.pop(movie_id, None)
        reason = omdb_data.get("error", "no data") if omdb_data else "no data"
        # tqdm.write prints without breaking the progress bar
        tqdm.write(f"Skipping '{title}': {reason}")
        continue
    omdb_data["ID"] = movie_id
    omdb_cache[movie_id] = omdb_data
    omdb_data_list.append(omdb_data)
omdb_df = pd.DataFrame(omdb_data_list)

Fetching OMDB Data: 100%|█████████████████████| 150/150 [02:42<00:00,  1.08s/it]


In [43]:
# Keep only movies released in 2006 or later. Years that are missing or not a
# plain number are coerced to NaN and therefore excluded, instead of raising.
release_year = pd.to_numeric(omdb_df["Year"], errors="coerce")
omdb_df_after_2006 = omdb_df[release_year >= 2006]
# one {"ID", "GoogleSearches"} record per movie with usable trends data
googletrends_data_list = []
# fetch googletrends data for movie list
for row_index, row in tqdm(omdb_df_after_2006.iterrows(), total=len(omdb_df_after_2006), desc="Fetching Google Trends Data"):
    movie_id = row["ID"]
    # reuse a previously fetched total when available
    if movie_id in googletrends_cache:
        googletrends_data = googletrends_cache[movie_id]
    else:
        googletrends_data = fetch_googletrends_data(row)
    # None (failed lookup) and a total of 0 are both treated as "no data"
    if googletrends_data:
        googletrends_cache[movie_id] = googletrends_data
        googletrends_data_list.append({"ID": movie_id, "GoogleSearches": googletrends_data})
# Explicit columns keep "ID" present even when no lookup succeeded, so the
# later merge on "ID" still works.
googletrends_df = pd.DataFrame(googletrends_data_list, columns=["ID", "GoogleSearches"])

Fetching Google Trends Data: 100%|██████████████| 94/94 [00:10<00:00,  9.31it/s]


In [48]:
# Left-join the Google search totals onto the OMDb records, mark every missing
# value as "N/A", and write the combined table to disk.
merged_df = omdb_df.merge(googletrends_df, how="left", on="ID").fillna("N/A")
merged_df.to_csv("data/merged_movies.csv", index=False)

## Data Cleaning and Preparation

In [103]:
# Build a table listing the movie IDs that belong to each genre.
# "Genre" is a comma-separated string (e.g. "Drama, Thriller"); each movie is
# expanded to one row per genre.
df_genre_exploded = merged_df.assign(Genre=merged_df['Genre'].str.split(',')).explode('Genre')
# Strip the space left after each comma, otherwise " Thriller" and "Thriller"
# are grouped as two different genres.
df_genre_exploded['Genre'] = df_genre_exploded['Genre'].str.strip()
genre_df = df_genre_exploded.groupby('Genre')['ID'].apply(list).reset_index()
genre_df.to_csv("data/movies_bygenre.csv", index=False)

In [104]:
# Build a table listing the movie IDs that belong to each country.
# "Country" is a comma-separated string; each movie is expanded to one row per
# country.
df_country_exploded = merged_df.assign(Country=merged_df['Country'].str.split(',')).explode('Country')
# Strip the space left after each comma, otherwise " France" and "France" are
# grouped as two different countries.
df_country_exploded['Country'] = df_country_exploded['Country'].str.strip()
country_df = df_country_exploded.groupby('Country')['ID'].apply(list).reset_index()
country_df.to_csv("data/movies_bycountry.csv", index=False)

In [105]:
import re

def _extract_rt_score(ratings):
    """Return the Rotten Tomatoes 'Value' string from a Ratings list, else pd.NA."""
    # Ratings may be missing (pd.NA) or a plain string rather than a list
    if not isinstance(ratings, (list, tuple)):
        return pd.NA
    for item in ratings:
        if isinstance(item, dict) and item.get('Source') == 'Rotten Tomatoes':
            return item.get('Value', pd.NA)
    return pd.NA


def transform_data(df):
    """Convert the merged movie table into typed, analysis-ready columns.

    Args:
        df: DataFrame with the OMDb fields plus "ID" and "GoogleSearches",
            where missing values are marked with the string 'N/A'. "Ratings"
            is expected to hold a list of {"Source", "Value"} dicts.

    Returns:
        A new DataFrame (the input is not modified) in which:
        - Year, Runtime, imdbRating, imdbVotes, BoxOffice, Metascore and
          GoogleSearches are numeric (unparseable values become NaN);
        - Rated is collapsed to 'R', 'PG', 'G' or 'Not Rated' (labels not in
          the mapping become NaN);
        - Released is a datetime and Month is its month number;
        - Awards is the sum of every number found in the awards text;
        - RTScore is the Rotten Tomatoes percentage as a number;
        - AvgRating is the mean of imdbRating * 10, Metascore and RTScore,
          ignoring missing values;
        - Genre, Director, Writer, Actors, Plot, Language, Country, Poster and
          Ratings are dropped.
    """
    # replace() returns a copy, so the column assignments below never touch
    # the caller's frame
    df = df.replace('N/A', pd.NA)

    df["Title"] = df["Title"].str.title()
    df["Year"] = pd.to_numeric(df["Year"], errors='coerce')

    df["Rated"] = df["Rated"].fillna('Not Rated')
    rating_map = {
        'R': 'R', 'NC-17': 'R', 'TV-MA': 'R',
        'PG': 'PG', 'PG-13': 'PG', 'TV-PG': 'PG', 'TV-14': 'PG',
        'TV-G': 'G', 'Approved': 'G', 'G': 'G',
        'Not Rated': 'Not Rated', 'Unrated': 'Not Rated'}
    # NOTE(review): any label missing from rating_map maps to NaN here
    df["Rated"] = df["Rated"].map(rating_map)

    df["Runtime"] = df["Runtime"].fillna("")
    df["Runtime"] = df["Runtime"].str.replace(" min", "", regex=False)
    df["Runtime"] = pd.to_numeric(df["Runtime"], errors='coerce')

    df["imdbRating"] = pd.to_numeric(df["imdbRating"], errors='coerce')
    df["imdbVotes"] = pd.to_numeric(df["imdbVotes"].str.replace(',', '', regex=False), errors='coerce')
    # regex=False is required: on pandas versions where str.replace defaults
    # to regex, "$" is an end-of-string anchor and the dollar sign would be
    # left in place, turning every BoxOffice value into NaN.
    box_office_text = df["BoxOffice"].str.replace("$", "", regex=False).str.replace(',', '', regex=False)
    df["BoxOffice"] = pd.to_numeric(box_office_text, errors='coerce')
    df["Released"] = pd.to_datetime(df["Released"], format="%d %b %Y", errors='coerce')

    df = df.drop(columns=["Genre", "Director", "Writer", "Actors", "Plot", "Language", "Country"])

    # Awards text becomes the total of all numbers it mentions
    df["Awards"] = df["Awards"].apply(
        lambda x: sum(map(int, re.findall(r'\d+', x))) if isinstance(x, str) and re.search(r'\d', x) else pd.NA
    )

    df = df.drop("Poster", axis=1)

    df["Metascore"] = pd.to_numeric(df["Metascore"], errors='coerce')
    df["RTScore"] = df['Ratings'].apply(_extract_rt_score)
    df["RTScore"] = pd.to_numeric(df["RTScore"].str.replace('%', '', regex=False), errors='coerce')

    df = df.drop("Ratings", axis=1)

    df["GoogleSearches"] = pd.to_numeric(df["GoogleSearches"], errors='coerce')
    df["Month"] = df["Released"].dt.month

    # Row-wise mean on a common 0-100 scale; mean() skips NaN, so a movie with
    # only some scores still gets an average and one with none gets NaN.
    score_columns = pd.concat(
        [df["imdbRating"] * 10, df["Metascore"], df["RTScore"]], axis=1
    )
    df["AvgRating"] = score_columns.mean(axis=1)

    return df

In [106]:
movies_df = transform_data(merged_df)
movies_df.to_csv("data/transformed_movies.csv", index=False)