In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the data
df = pd.read_csv('data/wiki_movie_plots_deduped.csv')


In [None]:
# Displaying 10 first columns
df.head(10) # df.head() without argument will display the first 5 rows

In [None]:
# Displaying the columns
df.columns

In [None]:
# Displaying the shape of the data (number of rows, number of columns)
df.shape

In [None]:
# Displaying the data types of the columns
df.dtypes

In [None]:
# Displaying the number of unique values in each column
df.nunique()

In [None]:
# Displaying the number of missing values in each column
df.isnull().sum()

In [None]:
# How many unknown director
unknown_director = df['Director'].value_counts()['Unknown']
print(f"There are {unknown_director}/{len(df)} unknown directors.")

In [None]:
# Most prolific directors
df['Director'].value_counts()

In [None]:
# Distribution by year
df['Release Year'].value_counts().sort_index().plot(kind="bar")

In [None]:
# Distribution by decade
df['Decade'] = df['Release Year'] // 10 * 10
df['Decade'].value_counts().sort_index().plot(kind="bar")

In [None]:
# Most prolific comedy director between 2000 and 2010

# Create a condition to filter movies released between the years 2000 and 2010 (inclusive)
condition_year = df['Release Year'].between(2000, 2010)

# Create a condition to filter movies that have 'comedy' in their genre, case insensitive
condition_comedy = df['Genre'].str.contains('comedy', case=False)

# Apply both conditions to the DataFrame to get movies that are comedies released between 2000 and 2010
comedy_2000_2010 = df[condition_year & condition_comedy]
comedy_2000_2010

In [None]:
comedy_2000_2010['Director'].value_counts()

In [None]:
# CAST
# Remove null values
print(f"The length of the dataset before removing null values: {len(df)}")
df = df.dropna(subset=['Cast'])
print(f"The length of the dataset after removing null values: {len(df)}")
df['Cast']

In [None]:
# Remove rows where the 'Cast' column contains 'Unknown'
print(f"The length of the dataset before removing 'Unknown' values: {len(df)}")

# Create a condition to identify rows where the 'Cast' column contains the string 'Unknown', case insensitive
condition = df['Cast'].str.contains('Unknown', case=False)

# Filter the DataFrame to exclude rows where the 'Cast' column contains 'Unknown'
df = df[~condition]
print(f"The length of the dataset after removing 'Unknown' values: {len(df)}")

# Display the 'Cast' column of the filtered DataFrame
df['Cast']

In [None]:
# Creating a special function that counts the number of actors in the 'Cast' column
def count_actor(cast):
    return len(cast.split(','))
# Applying the function to the 'Cast' column to create a new column 'nb_actor'
df['nb_actor'] = df['Cast'].apply(count_actor)
df['nb_actor']

In [None]:
# Checking title duplicates
print(f"Number of duplicate titles: {df['Title'].duplicated().sum()}")
# Removing duplicates: 3 methods :
# keep='first': keep the first occurence
# keep='last': keep the last occurence
# keep=False : remove all duplicates

# We remove the duplicates, that is we remove hte rows where the 'Title' column is duplicated, we keep only the first occurence of each title
df = df.drop_duplicates(subset='Title', keep="first")


In [65]:
# Making a dataframe for the year 1976
df_year_2015 = df[df['Release Year'] == 2015]
print(len(df_year_2015))
df_year_2015

652


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Decade,nb_actor
16910,2015,The Woman in Black 2: Angel of Death,American,Tom Harper,Phoebe Fox\r\nJeremy Irvine\r\nHelen McCrory\r...,horror,https://en.wikipedia.org/wiki/The_Woman_in_Bla...,Forty years after the events of the first film...,2010,1
16911,2015,Taken 3,American,Olivier Megaton,Liam Neeson\r\nForest Whitaker\r\nFamke Jansse...,action,https://en.wikipedia.org/wiki/Taken_3,Former covert operative Bryan Mills (Liam Nees...,2010,1
16912,2015,Match,American,Stephen Belber,Patrick Stewart\r\nCarla Gugino\r\nMatthew Lil...,drama,https://en.wikipedia.org/wiki/Match_(film),"The film revolves around Tobi, a middle-aged e...",2010,1
16913,2015,Blackhat,American,Michael Mann,Chris Hemsworth\r\nViola Davis\r\nManny Montan...,action,https://en.wikipedia.org/wiki/Blackhat_(film),"At a nuclear plant in Chai Wan, Hong Kong, a h...",2010,1
16914,2015,Little Accidents,American,Sarah Colangelo,Elizabeth Banks\r\nBoyd Holbrook\r\nChloë Sevi...,drama,https://en.wikipedia.org/wiki/Little_Accidents,In a small American town still living in the s...,2010,1
...,...,...,...,...,...,...,...,...,...,...
34739,2015,The Advocate: A Missing Body,South_Korean,Heo Jong-ho,"Lee Sun-kyun, Kim Go-eun",unknown,https://en.wikipedia.org/wiki/The_Advocate:_A_...,Byeon Ho-sung is a hotshot lawyer at a big law...,2010,2
34740,2015,Collective Invention,South_Korean,Kwon Oh-kwang,"Lee Kwang-soo, Park Bo-young, Lee Chun-hee",unknown,https://en.wikipedia.org/wiki/Collective_Inven...,A medical experiment gone wrong and a man muta...,2010,3
34741,2015,The Priests,South_Korean,Jang Jae-hyun,"Kim Yoon-seok, Kang Dong-won",unknown,https://en.wikipedia.org/wiki/The_Priests_(film),A young girl who belongs to Father Kim's paris...,2010,2
34742,2015,Inside Men,South_Korean,Woo Min-ho,"Lee Byung-hun, Jo Seung-woo",unknown,https://en.wikipedia.org/wiki/Inside_Men_(film),"Lee Kang-hee, an editor at an influential cons...",2010,2


In [68]:
# Selecting the 6th row with column ['wiki page']
url_paddington = df_2000_2010.iloc[5]['Wiki Page']
url_paddington

'https://en.wikipedia.org/wiki/Paddington_(film)'

In [71]:
import requests
r = requests.get(url_paddington)
r.status_code


200

In [83]:
# Using BeautifulSoup to parse the HTML content
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.content)

# We want the budget and the box office
# We can see that the budget and box office are in the same table
# We can use the 'table' tag to get the table

th_budget = soup.find("table").find('th', class_='infobox-label', string="Budget")
budget_value = th_budget.find_next('td', class_='infobox-data')
print(budget_value.text)

th_box_office = soup.find("table").find('th', class_='infobox-label', string="Box office")
box_office_value = th_box_office.find_next('td', class_='infobox-data')
print(box_office_value.text)

$65 million[5]
$282.8 million[6]


In [93]:
# Now we generalize the process to get the budget and box office for all movies 


def get_budget_box_office(url):
    r = requests.get(url)
    if r.status_code != 200:
        return np.nan, np.nan
    soup = BeautifulSoup(r.content)
    try:
        th_budget = soup.find("table").find('th', class_='infobox-label', string="Budget")
    except AttributeError:
        return np.nan, np.nan
    if th_budget is None:
        budget = np.nan
    else:
        budget_value = th_budget.find_next('td', class_='infobox-data')
        budget = budget_value.text
    
    try:
        th_box_office = soup.find("table").find('th', class_='infobox-label', string="Box office")
    except AttributeError:
        return np.nan, np.nan
    if th_box_office is None:
        box_office = np.nan
    else:
        box_office_value = th_box_office.find_next('td', class_='infobox-data')
        box_office = box_office_value.text
    return budget, box_office


In [107]:
# We do it 50 random movies from 2015
df_2015 = df[
    (df['Release Year'] == 2015) & 
    (df['Origin/Ethnicity'] == 'American')
]
df_2015_random = df_2015.sample(50)
# Apply the get_budget_box_office function and expand the result into separate columns
df_2015_random['Budget'], df_2015_random['Box Office'] = zip(*df_2015_random['Wiki Page'].apply(get_budget_box_office))

In [108]:
df_2015_random

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Decade,nb_actor,Budget,Box Office
16911,2015,Taken 3,American,Olivier Megaton,Liam Neeson\r\nForest Whitaker\r\nFamke Jansse...,action,https://en.wikipedia.org/wiki/Taken_3,Former covert operative Bryan Mills (Liam Nees...,2010,1,$48 million[2],$326.4 million[2]
17014,2015,The Martian,American,Ridley Scott,Matt Damon,science fiction,https://en.wikipedia.org/wiki/The_Martian_(film),"In 2035, the crew of the Ares III mission to M...",2010,1,$108 million[3],$630.6 million[4]
16990,2015,Tomorrowland,American,Brad Bird,George Clooney\r\nHugh Laurie\r\nBritt Roberts...,science fiction mystery,https://en.wikipedia.org/wiki/Tomorrowland_(film),An adult Frank Walker talks to an unseen group...,2010,1,$180–190 million[4],$209 million[5]
16980,2015,Blackbird,American,Patrik-Ian Polk,Julian Walker\r\nMo'Nique\r\nIsaiah Washington...,drama,https://en.wikipedia.org/wiki/Blackbird_(2014_...,The story of a gay teen in high school in a sm...,2010,1,,
16945,2015,Maps to the Stars,American,David Cronenberg,Julianne Moore\r\nMia Wasikowska\r\nJohn Cusac...,drama,https://en.wikipedia.org/wiki/Maps_to_the_Stars,Agatha Weiss arrives in Los Angeles and employ...,2010,1,$13 million[6],$4.5 million[7][8]
17002,2015,Max,American,Boaz Yakin,Josh Wiggins\r\nDejon LaQuake\r\nThomas Haden ...,adventure,https://en.wikipedia.org/wiki/Max_(2015_film),"Max, a Malinois[4] used to help U.S. Marines i...",2010,1,$20 million[2],$44 million[3]
16927,2015,The Loft,American,Erik Van Looy,Karl Urban\r\nWentworth Miller\r\nJames Marsde...,mystery,https://en.wikipedia.org/wiki/The_Loft_(film),Five married men share ownership of an upmarke...,2010,1,$13 million[1],$11 million[2]
16943,2015,Focus,American,Glenn Ficarra John Requa,Will Smith\r\nMargot Robbie\r\nRodrigo Santoro,romantic comedy-drama,https://en.wikipedia.org/wiki/Focus_(2015_film),Seasoned con-man Nicky Spurgeon (Will Smith) g...,2010,1,$50.1–65 million[3][4],$158.8 million[5]
16988,2015,Pitch Perfect 2,American,Elizabeth Banks,Anna Kendrick\r\nSkylar Astin\r\nRebel Wilson\...,comedy,https://en.wikipedia.org/wiki/Pitch_Perfect_2,Three years after winning the previous competi...,2010,1,$29–31 million[2][3],$287.5 million[4]
16925,2015,We'll Never Have Paris,American,Simon Helberg,Simon Helberg\r\nMelanie Lynskey\r\nMaggie Gra...,romantic comedy,https://en.wikipedia.org/wiki/We%27ll_Never_Ha...,Quinn (Simon Helberg) and Devon (Melanie Lynsk...,2010,1,,


In [None]:
# Exercice
# Transform the columns 'Budget' and 'Box Office' into numerical values
# Using regex 
# Remove the $ sign

# Questions : 
# 1. What is the average budget of the movies in 2015?
# 2. What is the average box office of the movies in 2015?
# 3. Most cheap movies ?
# 4. Most expensive movies ?
