In [2]:
# %pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
import dask.dataframe as dd
import dask.array as da
from dask import delayed, compute

import pandas as pd
import numpy as np

# Import natural language library
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shadowclone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Suppress scientific notations for large float values
pd.options.display.float_format = '{:.0f}'.format

## Import our Data

In [4]:
movies_df = dd.read_csv(
    'movies.csv',
    blocksize = '25MB',
    sample = 500
)

movies_pd = movies_df.compute()

## Missing Value Calculation

In [5]:
task_missing = movies_df.isnull().sum()
task_missing.visualize(engine = "cytoscape")

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [6]:
task_missing.compute()

Unnamed: 0                 0
id                         0
title                      0
genres                     0
original_language          0
overview                 489
popularity                 0
production_companies    2197
release_date             266
budget                     0
revenue                    0
runtime                   81
status                     0
tagline                 5129
vote_average               0
vote_count                 0
credits                  324
keywords                4010
poster_path              785
backdrop_path           3385
recommendations         6623
release_date_new         266
dtype: int64

## Extracting Datetime features from Strings

In [13]:
movies_df['release_date'].head(5)
movies_df.reset_index()
movies_df['release_date_dt'] = dd.to_datetime(
    movies_df['release_date'], errors = 'coerce',
    exact = False, format = '%Y-%m-%d'
)

In [None]:
movies_df.head(10)

## Engineer Year, Month and Day of the week String features

In [17]:
movies_df['year'] = movies_df['release_date_dt'].dt.strftime('%Y')  # year
movies_df['month'] = movies_df['release_date_dt'].dt.strftime('%b') # Abbreviated name of month
movies_df['day_of_week'] = movies_df['release_date_dt'].dt.strftime('%a')   # Abbreviated day of the week

## Filter by Datetime intervals & Sort the data

In [24]:
%%time

# Filter by time index between 2016 and 2024
filtered_df = movies_df.loc[
    (movies_df['release_date_dt'] > '2015-12-31') & (movies_df['release_date_dt'] < '2024'), :
].compute().sort_values(['budget', 'release_date_dt'], ascending=False)

print(f'Number of matching observations - {len(filtered_df)}')
filtered_df.head(5)

Number of matching observations - 2539
CPU times: user 257 ms, sys: 18.1 ms, total: 275 ms
Wall time: 332 ms


Unnamed: 0.1,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
6,77,616037,Thor: Love and Thunder,fantasy-action-comedy,en,After his retirement is interrupted by Gorr th...,394.087,Marvel Studios-Kevin Feige Productions,2022-07-06,250000000.0,...,Chris Hemsworth-Natalie Portman-Christian Bale...,ex-girlfriend-hero-greek mythology-sequel-supe...,/pIkRyD18kl4FhoCNQuWxWu5cBLM.jpg,/jsoz1HlxczSuTx0mDl2h0lxy36l.jpg,539681-610150-985939-629176-2-45920-438148-782...,2022-07-06,2022-07-06,2022,Jul,Wed
27,335,508943,Luca,animation-comedy-family-fantasy-action-adventure,en,Luca and his best friend Alberto experience an...,132.456,Pixar-Walt Disney Pictures,2021-06-17,200000000.0,...,Jacob Tremblay-Jack Dylan Grazer-Emma Berman-S...,italy-monster-friendship-friends-coming of age...,/8tABCBpzu3mZbzMB3sRzMEHEvJi.jpg,/620hnMVLu6RSZW6a5rwO8gqpt0t.jpg,527774-77742-79233-337404-800409-497698-400216...,2021-06-17,2021-06-17,2021,Jun,Thu
83,1491,508439,Onward,family-animation-adventure-comedy-fantasy-action,en,In a suburban fantasy world two teenage elf br...,47.176,Walt Disney Pictures-Pixar,2020-02-29,200000000.0,...,Tom Holland-Chris Pratt-Julia Louis-Dreyfus-Oc...,elves-magic-dead father-dead parent-fantasy wo...,/f4aul3FyD3jv3v4bul1IrkWZvzq.jpg,/xFxk4vnirOtUxpOEWgA1MCRfy6J.jpg,726166-662018-872325-611059-1038789-579955-454...,2020-02-29,2020-02-29,2020,Feb,Sat
28,341,384018,Fast & Furious Presents: Hobbs & Shaw,action-adventure-comedy,en,Ever since US Diplomatic Security Service Agen...,130.097,Universal Pictures-Chris Morgan Productions-Se...,2019-08-01,200000000.0,...,Dwayne Johnson-Jason Statham-Idris Elba-Vaness...,london england-biological weapon-secret organi...,/qRyy2UmjC5ur9bDi3kpNNRCc5nc.jpg,/hpgda6P9GutvdkDX5MUJ92QG9aj.jpg,337339-429617-458156-423204-420818-168259-9615...,2019-08-01,2019-08-01,2019,Aug,Thu
25,296,436969,The Suicide Squad,action-comedy-adventure,en,Supervillains Harley Quinn Bloodsport Peacemak...,144.638,DC Films-Atlas Entertainment-The Safran Compan...,2021-07-28,185000000.0,...,Margot Robbie-Idris Elba-John Cena-Joel Kinnam...,monster-anti hero-secret mission-superhero-bas...,/kb4s0ML0iVZlG6wAKbbs9NAm6X.jpg,/jlGmlFOcfo8n5tURmhC7YVd4Iyy.jpg,451048-497698-385128-550988-193414-566525-7913...,2021-07-28,2021-07-28,2021,Jul,Wed


## Aggregate Total Budget by Year

In [34]:
%%time

# Aggregation by year to find the total sum of movie budgets and sorted the pandas series
budget_summation = movies_df.groupby('year').agg({'budget': 'sum'}).sort_values(['budget'], ascending = False)

budget_summation.head(10)

CPU times: user 245 ms, sys: 15.3 ms, total: 261 ms
Wall time: 267 ms


Unnamed: 0_level_0,budget
year,Unnamed: 1_level_1
2013,1141692969
2016,1126213907
2022,1108167602
2017,1098264623
2010,1097269021
2019,1031065618
2003,929983303
2014,859188496
2002,857081750
2012,794552769


## Text Mining for Movie Descriptions

In [37]:
def find_synonyms(word_list: list) :
    all_matches = []

    for word in word_list :
        synonyms = []
        word_nltk = wordnet.synsets(word)
        for synonym in word_nltk :
            for lemma in synonym.lemmas():
                name = str(lemma.name())
                name = name.replace('-', '')
                synonyms.append(name)

        synonym_set = list(set(synonyms))
        all_matches.extend(synonym_set)

    # flat_list = []
    # for sublist in all_matches:
    #     for element in sublist:
    #         flat_list.append(element)

    return all_matches

In [43]:
synonym_list = find_synonyms(['happy', 'joy', 'nice', 'good'])

## Search for matching movie generes

In [45]:
%%time

# Converting strings to all lower case
movies_df['genres'] = movies_df['genres'].str.lower()

# Concat list of synanyms into one string, seperated by '|'
synonym_string = '|'.join('synonym_list')

# String based matching search
feel_good_df = movies_df.loc[movies_df['genres'].str.contains(synonym_string), :]

filtered_movies_df = feel_good_df.compute().sort_values(['popularity'], ascending=False)
filtered_movies_df

CPU times: user 263 ms, sys: 13.3 ms, total: 276 ms
Wall time: 277 ms


Unnamed: 0.1,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
0,12,594767,Shazam! Fury of the Gods,action-comedy-fantasy-adventure,en,Billy Batson and his foster siblings who trans...,2011,New Line Cinema-The Safran Company-DC Films,2023-03-15,125000000,...,Zachary Levi-Asher Angel-Jack Dylan Grazer-Rac...,superhero-end of the world-super power-aftercr...,/A3ZbZsmsvNGdprRi2lKgGEeVLEH.jpg,/nDxJJyA5giRhXx96q1sWbOUjMBI.jpg,868759-994751-700391-948713-502356-938992-7660...,2023-03-15,2023-03-15,2023,Mar,Wed
1,18,615656,Meg 2: The Trench,action-science fiction-horror-comedy,en,An exploratory dive into the deepest depths of...,1321,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000,...,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,based on novel or book-sequel-kaiju,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,/Aukfa8dk6B5OxuelbaPBOJYXaBI.jpg,447277-872585-346698-1083862-496450-457332-114...,2023-08-02,2023-08-02,2023,Aug,Wed
2,19,868759,Ghosted,romance-action-comedy,en,Salt-of-the-earth Cole falls head over heels f...,1215,Skydance Media-Apple Studios,2023-04-18,0,...,Chris Evans-Ana de Armas-Adrien Brody-Mike Moh...,secret agent,/liLN69YgoovHVgmlHJ876PKi5Yi.jpg,/b9UCfDzwiWw7mIFsIQR9ZJUeh7q.jpg,640146-726759,2023-04-18,2023-04-18,2023,Apr,Tue
3,22,758009,Shotgun Wedding,action-romance-comedy,en,Darcy and Tom gather their families for the ul...,1043,Lionsgate-Mandeville Films-Nuyorican Productio...,2022-12-28,0,...,Jennifer Lopez-Josh Duhamel-Jennifer Coolidge-...,wedding-hostage situation,/t79ozwWnwekO0ADIzsFP1E5SkvR.jpg,/zGoZB4CboMzY1z4G3nU6BWnMDB2.jpg,702432-1064489-1013870-953734-805307-753965-84...,2022-12-28,2022-12-28,2022,Dec,Wed
4,48,587092,Unicorn Wars,action-animation-comedy-fantasy-horror-war,es,An army of bear cubs train and indoctrinate yo...,536,UniKo-Schmuby Productions-Autour de Minuit-Pan...,2022-10-21,0,...,Jon Goiri-Jaione Insausti-Ramón Barea-Txema Re...,gore-bear-unicorn-war-animation,/8KBj11zBaRdhoeq1q9jcAwKmDSk.jpg,/rbUPJoJJquPbX1AiV6GzOqcmJME.jpg,852046-601796,2022-10-21,2022-10-21,2022,Oct,Fri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6836,101279,786413,Family 420,action-comedy-music,pa,A thriving indian family smokes a bunch of the...,1,T-Series,2004-09-21,0,...,Dilawar Sidhu-Parvaz Kaur-Sabbu Deol,,/gbCp1EJDbKwH9N4L8mkE8GGJv5d.jpg,,,2004-09-21,2004-09-21,2004,Sep,Tue
6835,101000,781582,13rd Sister: Three Deadly Days,comedy-action,vi,She thought she had overthrown the patron Hac ...,1,,2020-12-25,0,...,Thu Trang-Tiến Luật-Kiều Minh Tuấn-Trương Minh...,,/faqGzwnN6OWXDSiQQofFmMs2SBg.jpg,,,2020-12-25,2020-12-25,2020,Dec,Fri
6834,100837,793730,Futari wa Precure Splash Star: Maji Doki Theater,animation-action-comedy-fantasy-family,ja,A seed of hope descends from the sky landing a...,1,Toei Animation,2008-01-01,0,...,Orie Kimoto-Atsuko Enomoto-Kappei Yamaguchi-Mi...,precure-anime,/t2v4pA1klWMrOBrexO2Vk0PyvnL.jpg,,,2008-01-01,2008-01-01,2008,Jan,Tue
6833,100698,728503,Tribe: The Untold Story of the Making of Vice ...,comedy-action-drama,en,Two brave young men set out to create the grea...,1,Reel Green Pictures-Manuel Alejandro Films,2020-02-20,8000,...,Thomas Burke-Xavier Alvarado-Manuel Alejandro ...,movie business-gun-writing-satire-tribe-bloody...,/9VmT16H0Qsswjr8vmjcRMM8l40z.jpg,/y6oFB4HfJTJg3CCBmvKDjTc6lgL.jpg,,2020-02-20,2020-02-20,2020,Feb,Thu
