# Top 1000 Films of All Time:

List of top 1000 movies, ranked starting with #1. Rankings are arrived at by combining ratings and percentages from five prominent movie databases: Metacritic, Rotten Tomatoes, IMDb, TCM, and Sight & Sound. Additional points are given to films repeatedly appearing on lists (created by respected critics, film directors, and various publications) of top films.

In [233]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [234]:
df= pd.read_csv('movie_dataset.csv')
df

Unnamed: 0,MovieTitle,ReleaseYear,Duration,Genre,Rating,Metascore,Votes,Gross
0,Bố Già,1972,175 min,"Crime, Drama",9.2,100,1987643,134966411
1,Câu Chuyện Tokyo,1953,136 min,Drama,8.1,100,67476,
2,Bảy Võ Sĩ Đạo,1954,207 min,"Action, Drama",8.6,100,363320,269061
3,Bố Già Phần II,1974,202 min,"Crime, Drama",9.0,100,1348889,57300000
4,Chuyện Tình Thế Chiến,1942,102 min,"Drama, Romance, War",8.5,100,600392,1024560
...,...,...,...,...,...,...,...,...
995,L'eau froide,1994,95 min,Drama,7.0,89,2188,30209
996,We the Animals,2018,94 min,Drama,6.9,89,4595,400961
997,David Holzman's Diary,1967,74 min,"Comedy, Drama",6.5,89,1823,
998,Moonrise,1948,90 min,"Crime, Drama, Film-Noir",7.0,89,2938,


### Data Cleaning and Preparation: 

In [235]:
#Checking for duplicates
df. loc[df.duplicated()]

Unnamed: 0,MovieTitle,ReleaseYear,Duration,Genre,Rating,Metascore,Votes,Gross


=> No duplicated data

In [236]:
df.columns.value_counts()

MovieTitle     1
ReleaseYear    1
Duration       1
Genre          1
Rating         1
Metascore      1
Votes          1
Gross          1
dtype: int64

In [237]:
df[df.isna()].count()   #count the number of missing (NaN) values in each column 

MovieTitle     0
ReleaseYear    0
Duration       0
Genre          0
Rating         0
Metascore      0
Votes          0
Gross          0
dtype: int64

In [238]:
(df['ReleaseYear'].unique())


array(['1972', '1953', '1954', '1974', '1942', '1941', '1962', '1957',
       '1950', '1985', '1955', '1989–1990', '1994', '1931', '1952',
       '1993', '1959', '1990', '1945', '1936', '1960', '1975', '1949',
       '1988', '1964', '1979', '1956', '1928', '1939', '1948', '1966',
       '1924', '1976', '1969', '1927', '1937', '1982', '2001', '1958',
       '1991', '1989', '1944', '1920', '1973', '1984', '2016', '1902',
       '2010', '2011', '1946', '1963', '1995–1998', '1961', '1925',
       '1967', 'I 2019', '1940', '1943', '1965', '1929', '1981',
       'II 2018', '2012', '1970', 'I 2015', '2018', '1987', '1971',
       '1921', '1926', '1977', '2008', '1935', '1968', '2003', '1930',
       '1934', '1951', '2013', '1922', 'I 2014', '1984 TV Movie', '2014',
       '2006', '1938', '2000', '1974 TV Movie', '1933', '1983', '2019',
       '1978', '1999', '1947', '2017', '2007 TV Movie', '1923', '2015',
       '1980', '2007', '1932', '2002', 'I 2017', '1995', 'I 2011', '1997',
       '1992

=> There are some errors:  '1989–1990', '1995–1998', 'I 2019', 'II 2018', 'I 2015', 'I 2014', '1984 TV Movie', '1974 TV Movie', '2007 TV Movie', 'I 2017', 'I 2011', 'I 2016', '1977 TV Movie', '1985– ', '2002 TV Special', 'II 2016', '1989–1999',
       '1992 TV Movie', 'III 2015', 'I 2007', '1966 TV Movie', 'II 2011', 'II 2015', 'II 2014', 'I 2008', 'II 2017',
       '2006 TV Movie', 'I 2002', '1988 TV Movie'
- Solution: 
- Convert the extracted year to datetime objects.
- Handle cases where multiple years are provided as a range (e.g., '1989–1990').
- Handle cases where the year is provided with additional text (e.g., '1984 TV Movie').   => Replace name


In [239]:
# Handle cases where the year is provided with additional text (e.g., '1984 TV Movie').
# Take only Year from string: 
def take_only_year(input_strings):
    years= []
    for input_string in input_strings:
   # Check if input_string is a string
          parts= input_string.split()
          for part in parts:
             if part.isdigit() and len(part)==4:
                 years.append(int(part))
                 break
    return years

input_strings = ['I 2019', 'II 2018', 'I 2015', 'I 2014', '1984 TV Movie', '1974 TV Movie', '2007 TV Movie', 'I 2017', 'I 2011', 'I 2016', '1977 TV Movie', '2002 TV Special', 'II 2016', '1992 TV Movie', 'III 2015', 'I 2007', '1966 TV Movie', 'II 2011', 'II 2015', 'II 2014', 'I 2008', 'II 2017',
       '2006 TV Movie', 'I 2002', '1988 TV Movie']
new_strings= take_only_year(input_strings)
new_strings

[2019,
 2018,
 2015,
 2014,
 1984,
 1974,
 2007,
 2017,
 2011,
 2016,
 1977,
 2002,
 2016,
 1992,
 2015,
 2007,
 1966,
 2011,
 2015,
 2014,
 2008,
 2017,
 2006,
 2002,
 1988]

In [240]:
# Handle cases where multiple years are provided as a range (e.g., '1989–1990'). 
# Solution: tính trung bình của năm bắt đầu và năm kết thúc trong một dãy năm.
def clean_release_years(release_years):
    cleaned_years= []
    for year_range in release_years:
            start_year, end_year = map(int, year_range.split('–'))
            average_year= start_year + ((end_year- start_year)/2)
            cleaned_years.append(int(average_year))
    return cleaned_years

release_years = ['1989–1990', '1995–1998', '1989–1999']
new_release_year = clean_release_years(release_years)
new_release_year




[1989, 1996, 1994]

In [241]:
def replace_name(a,b):
    df.ReleaseYear.replace(a,b,inplace=True)

replace_name(input_strings,new_strings)
replace_name('1985– ', '1985')
replace_name(release_years, new_release_year)


df['ReleaseYear'].unique()


array(['1972', '1953', '1954', '1974', '1942', '1941', '1962', '1957',
       '1950', '1985', '1955', 1989, '1994', '1931', '1952', '1993',
       '1959', '1990', '1945', '1936', '1960', '1975', '1949', '1988',
       '1964', '1979', '1956', '1928', '1939', '1948', '1966', '1924',
       '1976', '1969', '1927', '1937', '1982', '2001', '1958', '1991',
       '1989', '1944', '1920', '1973', '1984', '2016', '1902', '2010',
       '2011', '1946', '1963', 1996, '1961', '1925', '1967', 2019, '1940',
       '1943', '1965', '1929', '1981', 2018, '2012', '1970', 2015, '2018',
       '1987', '1971', '1921', '1926', '1977', '2008', '1935', '1968',
       '2003', '1930', '1934', '1951', '2013', '1922', 2014, 1984, '2014',
       '2006', '1938', '2000', 1974, '1933', '1983', '2019', '1978',
       '1999', '1947', '2017', 2007, '1923', '2015', '1980', '2007',
       '1932', '2002', 2017, '1995', 2011, '1997', '1992', 2016, 1977,
       '1998', '1986', '2004', '1996', 2002, '2009', 1994, 1992, 1966,


In [242]:
print(len(df['ReleaseYear'].unique()))

122


In [243]:
print(df['Duration'].dtype)

object


In [244]:
# We need to remove min, change the datatype of duration to minutes (interger)
df['Duration'] = df['Duration'].str.replace('min', '').astype(int)
df

Unnamed: 0,MovieTitle,ReleaseYear,Duration,Genre,Rating,Metascore,Votes,Gross
0,Bố Già,1972,175,"Crime, Drama",9.2,100,1987643,134966411
1,Câu Chuyện Tokyo,1953,136,Drama,8.1,100,67476,
2,Bảy Võ Sĩ Đạo,1954,207,"Action, Drama",8.6,100,363320,269061
3,Bố Già Phần II,1974,202,"Crime, Drama",9.0,100,1348889,57300000
4,Chuyện Tình Thế Chiến,1942,102,"Drama, Romance, War",8.5,100,600392,1024560
...,...,...,...,...,...,...,...,...
995,L'eau froide,1994,95,Drama,7.0,89,2188,30209
996,We the Animals,2018,94,Drama,6.9,89,4595,400961
997,David Holzman's Diary,1967,74,"Comedy, Drama",6.5,89,1823,
998,Moonrise,1948,90,"Crime, Drama, Film-Noir",7.0,89,2938,


Solve NaN values in Gross column:  

In [247]:
df['Gross'].isna().value_counts()

False    576
True     424
Name: Gross, dtype: int64

In [248]:
# remove rows having NaN values
df.dropna(subset= ['Gross'], inplace= True)

In [250]:
df

Unnamed: 0,MovieTitle,ReleaseYear,Duration,Genre,Rating,Metascore,Votes,Gross
0,Bố Già,1972,175,"Crime, Drama",9.2,100,1987643,134966411
2,Bảy Võ Sĩ Đạo,1954,207,"Action, Drama",8.6,100,363320,269061
3,Bố Già Phần II,1974,202,"Crime, Drama",9.0,100,1348889,57300000
4,Chuyện Tình Thế Chiến,1942,102,"Drama, Romance, War",8.5,100,600392,1024560
5,Công Dân Kane,1941,119,"Drama, Mystery",8.3,100,462045,1585634
...,...,...,...,...,...,...,...,...
988,Ilo Ilo,2013,99,Drama,7.2,89,4968,54071
990,Shazam!,2019,132,"Action, Adventure, Comedy",7.0,89,380610,140371656
992,Heart of a Dog,2015,75,Documentary,7.0,89,2530,418571
995,L'eau froide,1994,95,Drama,7.0,89,2188,30209
