In [3]:
import numpy as np
import pandas as pd

# Load IMDB Data

A data set of 1,000 most popular movies on IMDB from 2006 - 2016. The data fields included are:

Title, Genre, Description, Director, Actors, Year, Runtime, Rating, Votes, Revenue, Metascrore

1. Download the dataset from [Kaggle](https://www.kaggle.com/datasets/PromptCloudHQ/imdb-data?resource=download) - you may need to create a free account with your google account
2. Upload the file to your collab session through clicking on the Files menu in the left toolbar (see the image for where to find that)

![img](https://drive.google.com/uc?export=view&id=1P4YcZK7g_1gl5XyLClD0StYLo9u9w2w4)

3. Read into a pandas dataframe as below



In [21]:
df = pd.read_csv('D:\ReDi Data Analytics\IMDB-Movie-Data.csv')

In [5]:
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


## Excercises

In [11]:
# Check how many missing values there are in each column of the data set
display(df.info(),df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


None

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

In [22]:
# Check how many unique directors are in the list of top 1000 movies
df['Director'].nunique()

644

In [23]:
#Drop all rows where the Metascore (Average Critic's score) is missing
df_with_dropped_mts = df.dropna(subset = ['Metascore'])
display(df_with_dropped_mts.info())

<class 'pandas.core.frame.DataFrame'>
Index: 936 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                936 non-null    int64  
 1   Title               936 non-null    object 
 2   Genre               936 non-null    object 
 3   Description         936 non-null    object 
 4   Director            936 non-null    object 
 5   Actors              936 non-null    object 
 6   Year                936 non-null    int64  
 7   Runtime (Minutes)   936 non-null    int64  
 8   Rating              936 non-null    float64
 9   Votes               936 non-null    int64  
 10  Revenue (Millions)  838 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 95.1+ KB


None

In [56]:
#Replace the missing values in the `Revenue (Millions)` column with the average Revenue for other movies in the list for the same `Year`
df['Revenue (Millions)'].isna().sum() #128values
average_revenue_by_year = df.groupby('Year')['Revenue (Millions)'].transform('mean')
df['Revenue (Millions)'].fillna(average_revenue_by_year, inplace=True) #i saved  the new value
#display(average_revenue_by_year)
#display(df.info())

0       85.078723
1      107.973281
2       54.690976
3       54.690976
4       54.690976
          ...    
995     78.355044
996     87.882245
997     99.082745
998     85.078723
999     54.690976
Name: Revenue (Millions), Length: 1000, dtype: float64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  1000 non-null   float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


None

In [38]:
#Find all he movies where Leonardo DiCaprio is listed as actor.
leo_movies = df[df['Actors'].str.contains('Leonardo DiCaprio')]
leo_movies.head(20)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
80,81,Inception,"Action,Adventure,Sci-Fi","A thief, who steals corporate secrets through ...",Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...",2010,148,8.8,1583625,292.57,74.0
82,83,The Wolf of Wall Street,"Biography,Comedy,Crime","Based on the true story of Jordan Belfort, fro...",Martin Scorsese,"Leonardo DiCaprio, Jonah Hill, Margot Robbie,M...",2013,180,8.2,865134,116.87,75.0
99,100,The Departed,"Crime,Drama,Thriller",An undercover cop and a mole in the police att...,Martin Scorsese,"Leonardo DiCaprio, Matt Damon, Jack Nicholson,...",2006,151,8.5,937414,132.37,85.0
129,130,The Revenant,"Adventure,Drama,Thriller",A frontiersman on a fur trading expedition in ...,Alejandro González Iñárritu,"Leonardo DiCaprio, Tom Hardy, Will Poulter, Do...",2015,156,8.0,499424,183.64,76.0
137,138,The Great Gatsby,"Drama,Romance","A writer and wall street trader, Nick, finds h...",Baz Luhrmann,"Leonardo DiCaprio, Carey Mulligan, Joel Edgert...",2013,143,7.3,386102,144.81,55.0
138,139,Shutter Island,"Mystery,Thriller","In 1954, a U.S. marshal investigates the disap...",Martin Scorsese,"Leonardo DiCaprio, Emily Mortimer, Mark Ruffal...",2010,138,8.1,855604,127.97,63.0
144,145,Django Unchained,"Drama,Western","With the help of a German bounty hunter , a fr...",Quentin Tarantino,"Jamie Foxx, Christoph Waltz, Leonardo DiCaprio...",2012,165,8.4,1039115,162.8,81.0
459,460,Revolutionary Road,"Drama,Romance",A young couple living in a Connecticut suburb ...,Sam Mendes,"Leonardo DiCaprio, Kate Winslet, Christopher F...",2008,119,7.3,159736,22.88,69.0
669,670,Blood Diamond,"Adventure,Drama,Thriller","A fisherman, a smuggler, and a syndicate of bu...",Edward Zwick,"Leonardo DiCaprio, Djimon Hounsou, Jennifer Co...",2006,143,8.0,422014,57.37,64.0
737,738,Body of Lies,"Action,Drama,Romance",A CIA agent on the ground in Jordan hunts down...,Ridley Scott,"Leonardo DiCaprio, Russell Crowe, Mark Strong,...",2008,128,7.1,182305,39.38,57.0


In [40]:
#In which genres did he play mostly?
pd.DataFrame(leo_movies.groupby('Genre').size())

Unnamed: 0_level_0,0
Genre,Unnamed: 1_level_1
"Action,Adventure,Sci-Fi",1
"Action,Drama,Romance",1
"Adventure,Drama,Thriller",2
"Biography,Comedy,Crime",1
"Crime,Drama,Thriller",1
"Drama,Romance",2
"Drama,Western",1
"Mystery,Thriller",1


In [45]:
# And what is the average rating of his movies? Is it higher or lower than the average rating of all movies?
display(df['Rating'].mean(),
        leo_movies['Rating'].mean()
)

6.723199999999999

7.969999999999999