# Insert Libraries

In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset

In [165]:
df = pd.read_csv('imdb_top_1000.csv')

## Preview

In [167]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [168]:
df.shape

(1000, 16)

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [170]:
df.describe()

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes
count,1000.0,843.0,1000.0
mean,7.9493,77.97153,273692.9
std,0.275491,12.376099,327372.7
min,7.6,28.0,25088.0
25%,7.7,70.0,55526.25
50%,7.9,79.0,138548.5
75%,8.1,87.0,374161.2
max,9.3,100.0,2343110.0


# Cleaning

In [172]:
# This is a movie dataset so I will change the title.
df.rename(columns={"Series_Title": "Movie_Title"}, inplace=True)

In [173]:
df.isnull().sum()

Poster_Link        0
Movie_Title        0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [174]:
df['Certificate'].unique()

array(['A', 'UA', 'U', 'PG-13', 'R', nan, 'PG', 'G', 'Passed', 'TV-14',
       '16', 'TV-MA', 'Unrated', 'GP', 'Approved', 'TV-PG', 'U/A'],
      dtype=object)

In [175]:
# Gross, certificate and meta score interest us for our analysis so I will drop out all the nulls.
df.dropna(inplace=True)


In [176]:
df.isnull().sum()

Poster_Link      0
Movie_Title      0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

In [177]:
df.shape

(714, 16)

In [178]:
df.reset_index(inplace=True)

In [179]:
# Any duplicates?
df.duplicated().sum()

0

In [180]:
# For my analysis I can't use the poster link or the overview so I will drop them out of my dataframe.

df.drop(['Poster_Link','Overview'], axis=1, inplace=True)

In [181]:
# Gross column should have numeric values.
df["Gross"] = df["Gross"].str.replace(",","")
df['Gross'] = df['Gross'].astype('float64')

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          714 non-null    int64  
 1   Movie_Title    714 non-null    object 
 2   Released_Year  714 non-null    object 
 3   Certificate    714 non-null    object 
 4   Runtime        714 non-null    object 
 5   Genre          714 non-null    object 
 6   IMDB_Rating    714 non-null    float64
 7   Meta_score     714 non-null    float64
 8   Director       714 non-null    object 
 9   Star1          714 non-null    object 
 10  Star2          714 non-null    object 
 11  Star3          714 non-null    object 
 12  Star4          714 non-null    object 
 13  No_of_Votes    714 non-null    int64  
 14  Gross          714 non-null    float64
dtypes: float64(3), int64(2), object(10)
memory usage: 83.8+ KB


In [183]:
# Runtime should also be numeric.
df["Runtime"] = df["Runtime"].str.replace("min","")
df['Runtime'] = df['Runtime'].astype('float64')

In [184]:
df.head()

Unnamed: 0,index,Movie_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,0,The Shawshank Redemption,1994,A,142.0,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1,The Godfather,1972,A,175.0,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,3,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,4,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [185]:
#release year should be numeric.
df['Released_Year'].unique()

array(['1994', '1972', '2008', '1974', '1957', '2003', '1993', '2010',
       '1999', '2001', '1966', '2002', '1990', '1980', '1975', '2019',
       '2014', '1998', '1997', '1995', '1991', '1977', '1954', '2011',
       '2006', '2000', '1988', '1985', '1968', '1960', '1942', '1936',
       '1931', '2018', '2016', '2017', '2012', '2009', '1981', '1979',
       '1964', '2004', '1992', '1987', '1986', '1984', '1983', '1976',
       '1973', '1971', '1965', '1962', '1959', '1958', '1952', '1944',
       '1941', '2013', '2007', '2005', '1989', '1963', '1950', '1948',
       '2015', '1996', '1982', '1978', '1967', '1951', '1949', '1940',
       '1939', '1934', '1970', '1969', '1961', '1946', '1930', '1938',
       '1933', 'PG', '1953'], dtype=object)

In [186]:
#There is a 'PG' value that needs to be dropped first.
mask = df['Released_Year'] == 'PG'

In [187]:
df = df[~mask]

In [188]:
#There is a 'PG' value that needs to be dropped first.

df['Released_Year']=df['Released_Year'].astype(int)

In [189]:
df.head()

Unnamed: 0,index,Movie_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,0,The Shawshank Redemption,1994,A,142.0,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1,The Godfather,1972,A,175.0,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,3,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,4,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 713 entries, 0 to 713
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          713 non-null    int64  
 1   Movie_Title    713 non-null    object 
 2   Released_Year  713 non-null    int32  
 3   Certificate    713 non-null    object 
 4   Runtime        713 non-null    float64
 5   Genre          713 non-null    object 
 6   IMDB_Rating    713 non-null    float64
 7   Meta_score     713 non-null    float64
 8   Director       713 non-null    object 
 9   Star1          713 non-null    object 
 10  Star2          713 non-null    object 
 11  Star3          713 non-null    object 
 12  Star4          713 non-null    object 
 13  No_of_Votes    713 non-null    int64  
 14  Gross          713 non-null    float64
dtypes: float64(4), int32(1), int64(2), object(8)
memory usage: 86.3+ KB


In [225]:
# Cleaned dataset is ready!
df.to_csv('cleaned.csv')