In [2]:
# %pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
import dask.dataframe as dd
import dask.array as da
from dask import delayed, compute

import pandas as pd
import numpy as np

# Import natural language library
import nltk
from nltk.corpus import wordnet

# Make sure the WordNet corpus backing `nltk.corpus.wordnet` is available locally.
# nltk logs the download status and the call evaluates to True on success.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shadowclone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import our Data

In [4]:
# Lazily open the movies CSV as a dask DataFrame split into ~25MB partitions.
# NOTE(review): `sample` is the number of bytes dask reads up front to infer
# column dtypes; 500 bytes covers very few rows, so inference may be fragile
# on other versions of this file -- confirm this value is intentional.
movies_df = dd.read_csv('movies.csv', blocksize='25MB', sample=500)

# Materialise the full dataset as an in-memory pandas DataFrame as well.
movies_pd = movies_df.compute()

## Missing Value Calculation

In [5]:
# Build (but do not yet run) the per-column count of missing values.
missing_mask = movies_df.isnull()
task_missing = missing_mask.sum()

# Show the task graph dask will execute for this reduction.
task_missing.visualize(engine="cytoscape")

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [6]:
# Run the lazy missing-value graph and display the null count for every column.
task_missing.compute()

Unnamed: 0                 0
id                         0
title                      0
genres                     0
original_language          0
overview                 489
popularity                 0
production_companies    2197
release_date             266
budget                     0
revenue                    0
runtime                   81
status                     0
tagline                 5129
vote_average               0
vote_count                 0
credits                  324
keywords                4010
poster_path              785
backdrop_path           3385
recommendations         6623
release_date_new         266
dtype: int64

## Extracting Datetime features from Strings

In [13]:
# Parse the raw `release_date` strings into a proper datetime column.
#   * format='%Y-%m-%d' is the expected layout of the source strings.
#   * exact=False allows the pattern to match anywhere inside the string.
#   * errors='coerce' turns missing or unparseable values into NaT instead of
#     raising, so the rows with no release date survive the conversion.
#
# Two statements that used to precede this assignment were removed because
# they had no effect: a bare `movies_df['release_date'].head(5)` (not the last
# expression of the cell, so never displayed, yet it still triggered a
# compute) and `movies_df.reset_index()` (returns a new frame that was
# discarded, leaving `movies_df` unchanged).
movies_df['release_date_dt'] = dd.to_datetime(
    movies_df['release_date'],
    format='%Y-%m-%d',
    exact=False,
    errors='coerce',
)

In [18]:
# Preview the first ten rows of the frame, including the engineered columns.
# NOTE(review): the stored output shows `year`, `month` and `day_of_week`, but
# those columns are created by a cell placed further down the notebook (and
# executed earlier, per the execution counts). On a fresh top-to-bottom run
# this preview will not contain them -- consider moving this cell below the
# feature-engineering cell.
movies_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,credits,keywords,poster_path,backdrop_path,recommendations,release_date_new,release_date_dt,year,month,day_of_week
0,12,594767,Shazam! Fury of the Gods,action-comedy-fantasy-adventure,en,Billy Batson and his foster siblings who trans...,2010.98,New Line Cinema-The Safran Company-DC Films,2023-03-15,125000000.0,...,Zachary Levi-Asher Angel-Jack Dylan Grazer-Rac...,superhero-end of the world-super power-aftercr...,/A3ZbZsmsvNGdprRi2lKgGEeVLEH.jpg,/nDxJJyA5giRhXx96q1sWbOUjMBI.jpg,868759-994751-700391-948713-502356-938992-7660...,2023-03-15,2023-03-15,2023,Mar,Wed
1,18,615656,Meg 2: The Trench,action-science fiction-horror-comedy,en,An exploratory dive into the deepest depths of...,1321.17,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,...,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,based on novel or book-sequel-kaiju,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,/Aukfa8dk6B5OxuelbaPBOJYXaBI.jpg,447277-872585-346698-1083862-496450-457332-114...,2023-08-02,2023-08-02,2023,Aug,Wed
2,19,868759,Ghosted,romance-action-comedy,en,Salt-of-the-earth Cole falls head over heels f...,1214.867,Skydance Media-Apple Studios,2023-04-18,0.0,...,Chris Evans-Ana de Armas-Adrien Brody-Mike Moh...,secret agent,/liLN69YgoovHVgmlHJ876PKi5Yi.jpg,/b9UCfDzwiWw7mIFsIQR9ZJUeh7q.jpg,640146-726759,2023-04-18,2023-04-18,2023,Apr,Tue
3,22,758009,Shotgun Wedding,action-romance-comedy,en,Darcy and Tom gather their families for the ul...,1043.225,Lionsgate-Mandeville Films-Nuyorican Productio...,2022-12-28,0.0,...,Jennifer Lopez-Josh Duhamel-Jennifer Coolidge-...,wedding-hostage situation,/t79ozwWnwekO0ADIzsFP1E5SkvR.jpg,/zGoZB4CboMzY1z4G3nU6BWnMDB2.jpg,702432-1064489-1013870-953734-805307-753965-84...,2022-12-28,2022-12-28,2022,Dec,Wed
4,48,587092,Unicorn Wars,action-animation-comedy-fantasy-horror-war,es,An army of bear cubs train and indoctrinate yo...,535.524,UniKo-Schmuby Productions-Autour de Minuit-Pan...,2022-10-21,0.0,...,Jon Goiri-Jaione Insausti-Ramón Barea-Txema Re...,gore-bear-unicorn-war-animation,/8KBj11zBaRdhoeq1q9jcAwKmDSk.jpg,/rbUPJoJJquPbX1AiV6GzOqcmJME.jpg,852046-601796,2022-10-21,2022-10-21,2022,Oct,Fri
5,73,899112,Violent Night,action-comedy-crime-thriller,en,When a team of mercenaries breaks into a wealt...,407.163,87North Productions-Universal Pictures,2022-11-30,20000000.0,...,David Harbour-John Leguizamo-Beverly D'Angelo-...,holiday-santa claus-mercenary-saving christmas...,/1XSYOP0JjjyMz1irihvWywro82r.jpg,/sBOenwOZGRN5nZZGw4TxwtnfrEf.jpg,860155-812025-505642-661374-315162-76600-87726...,2022-11-30,2022-11-30,2022,Nov,Wed
6,77,616037,Thor: Love and Thunder,fantasy-action-comedy,en,After his retirement is interrupted by Gorr th...,394.087,Marvel Studios-Kevin Feige Productions,2022-07-06,250000000.0,...,Chris Hemsworth-Natalie Portman-Christian Bale...,ex-girlfriend-hero-greek mythology-sequel-supe...,/pIkRyD18kl4FhoCNQuWxWu5cBLM.jpg,/jsoz1HlxczSuTx0mDl2h0lxy36l.jpg,539681-610150-985939-629176-2-45920-438148-782...,2022-07-06,2022-07-06,2022,Jul,Wed
7,82,325358,Superfast!,comedy-action,en,Undercover cop Lucas White joins Vin Serento's...,369.009,The Safran Company-3 in the Box-Ketchup Entert...,2015-03-05,20000000.0,...,Alex Ashbaugh-Dale Pavinski-Andrea Navedo-Lili...,car race-parody-spoof,/iuIWl90qCpoxv6g775JB6Kg0m86.jpg,/bWZ4ge00FlDyHcOQZl7AzbZcrxT.jpg,502356-868985-225703-168259-23988-13805-385687...,2015-03-05,2015-03-05,2015,Mar,Thu
8,94,833950,Little Lies,comedy-romance-thriller,es,Amidst the problems in their childless marriag...,334.664,Magoya Films-Aliwen Entertainment,2022-09-21,0.0,...,Lucas Akoskin-Leonor Varela-Florencia Peña-Ben...,,/s3NoPF6LAKDl3KUGkZsYPX9ionc.jpg,/A3mIOnYBOGKpqpfHuoES6ZIP9HP.jpg,,2022-09-21,2022-09-21,2022,Sep,Wed
9,111,718930,Bullet Train,action-comedy-thriller,en,Unlucky assassin Ladybug is determined to do h...,302.905,87North Productions-Columbia Pictures,2022-07-03,90000000.0,...,Brad Pitt-Joey King-Aaron Taylor-Johnson-Brian...,japan-assassin-based on novel or book-mission-...,/tVxDe01Zy3kZqaZRNiXFGDICdZk.jpg,/y2Ca1neKke2mGPMaHzlCNDVZqsK.jpg,760161-429473-960170-843380-597922-642885-1027...,2022-07-03,2022-07-03,2022,Jul,Sun


## Engineer Year, Month and Day of the week String features

In [17]:
# Derive string-valued date parts from the parsed release date.
# Keys are the new column names (added in this order); values are the
# strftime codes used to render each one.
DATE_PART_FORMATS = {
    'year': '%Y',         # four-digit year
    'month': '%b',        # abbreviated month name
    'day_of_week': '%a',  # abbreviated weekday name
}

for feature_name, strftime_code in DATE_PART_FORMATS.items():
    movies_df[feature_name] = movies_df['release_date_dt'].dt.strftime(strftime_code)