### Importing Libraries and Loading the Dataset

In [8]:
# Importing Libraries

import numpy as np
import pandas as pd
from skimpy import skim  # Used for quick summarization and inspection of data

In [4]:
# Read the movie CSV; the "Title" column becomes the row index
df_original = pd.read_csv(
    filepath_or_buffer="imdb_movie_dataset.csv",
    index_col="Title",
)

# Independent working copy, so the frame as loaded stays available unchanged
df = df_original.copy(deep=True)

In [6]:
# Preview five randomly chosen rows to sanity-check the load.
# A fixed random_state keeps the preview identical across
# Restart & Run All, so the displayed rows are reproducible.
df.sample(n=5, random_state=42)

Unnamed: 0_level_0,Rank,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
No Strings Attached,796,"Comedy,Romance",A guy and girl try to keep their relationship ...,Ivan Reitman,"Natalie Portman, Ashton Kutcher, Kevin Kline, ...",2011,108,6.2,178243,70.63,50.0
The Lobster,227,"Comedy,Drama,Romance","In a dystopian near future, single people, acc...",Yorgos Lanthimos,"Colin Farrell, Rachel Weisz, Jessica Barden,Ol...",2015,119,7.1,121313,8.7,82.0
The Hunger Games: Mockingjay - Part 1,681,"Action,Adventure,Sci-Fi",Katniss Everdeen is in District 13 after she s...,Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsw...",2014,123,6.7,331902,337.1,64.0
The Other Woman,571,"Comedy,Romance","After discovering her boyfriend is married, Ca...",Nick Cassavetes,"Cameron Diaz, Leslie Mann, Kate Upton, Nikolaj...",2014,109,6.0,110825,83.91,39.0
The Prestige,65,"Drama,Mystery,Sci-Fi",Two stage magicians engage in competitive one-...,Christopher Nolan,"Christian Bale, Hugh Jackman, Scarlett Johanss...",2006,130,8.5,913152,53.08,66.0


### Initial Exploration and Analyzing Values

In [9]:
# Quick first-look summary of the whole DataFrame using skimpy's `skim`
skim(df)

In [11]:
# Column dtypes, non-null counts and memory usage (reveals which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, Guardians of the Galaxy to Nine Lives
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Genre               1000 non-null   object 
 2   Description         1000 non-null   object 
 3   Director            1000 non-null   object 
 4   Actors              1000 non-null   object 
 5   Year                1000 non-null   int64  
 6   Runtime (Minutes)   1000 non-null   int64  
 7   Rating              1000 non-null   float64
 8   Votes               1000 non-null   int64  
 9   Revenue (Millions)  872 non-null    float64
 10  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 126.0+ KB


In [12]:
# Descriptive statistics of the numeric columns, transposed so each column is one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rank,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
Year,1000.0,2012.783,3.205962,2006.0,2010.0,2014.0,2016.0,2016.0
Runtime (Minutes),1000.0,113.172,18.810908,66.0,100.0,111.0,123.0,191.0
Rating,1000.0,6.7232,0.945429,1.9,6.2,6.8,7.4,9.0
Votes,1000.0,169808.255,188762.647518,61.0,36309.0,110799.0,239909.75,1791916.0
Revenue (Millions),872.0,82.956376,103.25354,0.0,13.27,47.985,113.715,936.63
Metascore,936.0,58.985043,17.194757,11.0,47.0,59.5,72.0,100.0


In [13]:
# Same numeric statistics as `df.describe().T`, in the default layout (one statistic per row)
df.describe()

Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,872.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,103.25354,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,13.27,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,47.985,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,113.715,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


In [14]:
# Summary of the text (object-dtype) columns: count, unique values, most frequent value and its frequency
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Genre,1000,207,"Action,Adventure,Sci-Fi",50
Description,1000,1000,A stuffy businessman finds himself trapped ins...,1
Director,1000,644,Ridley Scott,8
Actors,1000,996,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsw...",2


In [15]:
# Viewing Null, Unique and Duplicated Values

n_rows, n_cols = df.shape
null_mask = df.isnull()

# 'Count', 'Column', 'Size' and 'Duplicated' are frame-wide scalars, so pandas
# repeats the same value on every row; the other entries are per-column Series.
pd.DataFrame({
    'Count': n_rows,
    'Column': n_cols,
    'Size': df.size,
    'Null': null_mask.sum(),
    'Null %': null_mask.mean() * 100,
    'Not-Null': df.notnull().sum(),
    'Unique': df.nunique(),
    'Duplicated': df.duplicated().sum(),
})

Unnamed: 0,Count,Column,Size,Null,Null %,Not-Null,Unique,Duplicated
Rank,1000,11,11000,0,0.0,1000,1000,0
Genre,1000,11,11000,0,0.0,1000,207,0
Description,1000,11,11000,0,0.0,1000,1000,0
Director,1000,11,11000,0,0.0,1000,644,0
Actors,1000,11,11000,0,0.0,1000,996,0
Year,1000,11,11000,0,0.0,1000,11,0
Runtime (Minutes),1000,11,11000,0,0.0,1000,94,0
Rating,1000,11,11000,0,0.0,1000,55,0
Votes,1000,11,11000,0,0.0,1000,997,0
Revenue (Millions),1000,11,11000,128,12.8,872,814,0
