In [2]:
import numpy as np
import pandas as pd
import random

df = pd.read_csv('Sprint 1 - Movies Data Set Analysis/movies.csv') # Gathering data
df = df.drop(columns=['Summary']) # remove the column "Summary"
df.columns = df.columns.str.lower().str.replace(' ', '_') # Make all the column names lowercase and replace spaces with underscores
df.drop(df.tail(1).index, inplace=True) # Remove the last row of the DF (containing NaNs)

In [3]:
display(df.tail(5)) # See the first five rows of the Data Frame

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
610,611,Godzilla: King of the Monsters,PG-13,200000000.0,385900000.0,2019-05-13,Adventure,132.0,,
611,612,Toy Story 4,G,200000000.0,1062000000.0,2019-06-11,Animation,100.0,,
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,759400000.0,2019-07-13,Thriller,136.0,,
613,614,The Lion King,PG,250000000.0,1632000000.0,2019-07-09,Drama,118.0,,
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,,


In [4]:
# Fill NaNs with a random value between (mean — 2 * std) & (mean + 2 * std)

def nan_rand_fill(data, key):
    mn = data[key].mean()
    sd = data[key].std()
    low_l = mn - 2 * sd
    high_l = mn + 2 * sd
    nans = data[key].isna()
    df.loc[nans, key] = [abs(np.floor(random.uniform(low_l, high_l))) if 'count' in key 
    else abs(round(random.uniform(low_l, high_l), 1)) for i in range(nans.sum())]

nan_rand_fill(df, 'rating')
nan_rand_fill(df, 'rating_count')

In [5]:
display(df.tail(5)) # See the first five rows of the Data Frame

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
610,611,Godzilla: King of the Monsters,PG-13,200000000.0,385900000.0,2019-05-13,Adventure,132.0,5.2,239755.0
611,612,Toy Story 4,G,200000000.0,1062000000.0,2019-06-11,Animation,100.0,8.6,527766.0
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,759400000.0,2019-07-13,Thriller,136.0,6.0,798036.0
613,614,The Lion King,PG,250000000.0,1632000000.0,2019-07-09,Drama,118.0,8.1,453481.0
614,615,Avengers: Endgame,PG-13,356000000.0,2796000000.0,2019-04-22,Action,181.0,6.3,307241.0


In [6]:
print(df.shape) # Shape of the Data Frame

(615, 10)


In [7]:
print(df.dtypes) # Column names with their data types

movieid           int64
title            object
mpaa_rating      object
budget          float64
gross           float64
release_date     object
genre            object
runtime         float64
rating          float64
rating_count    float64
dtype: object


In [8]:
print(df.columns) # Column names

Index(['movieid', 'title', 'mpaa_rating', 'budget', 'gross', 'release_date',
       'genre', 'runtime', 'rating', 'rating_count'],
      dtype='object')


In [9]:
print(df.isna().sum()) # Column names with their NaN count

movieid         0
title           0
mpaa_rating     0
budget          0
gross           0
release_date    0
genre           0
runtime         0
rating          0
rating_count    0
dtype: int64


In [10]:
df.drop_duplicates(subset=['title', 'release_date']) # Remove row duplicates by title and release date

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
0,1,Look Who's Talking,PG-13,7500000.0,2.960000e+08,1989-10-12,Romance,93.0,5.9,73638.0
1,2,Driving Miss Daisy,PG,7500000.0,1.457933e+08,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000.0,7.107992e+07,1989-07-28,Crime,100.0,7.2,91415.0
3,4,Born on the Fourth of July,R,14000000.0,1.610017e+08,1989-12-20,War,145.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000.0,8.443162e+07,1989-04-21,Drama,107.0,7.5,101702.0
...,...,...,...,...,...,...,...,...,...,...
610,611,Godzilla: King of the Monsters,PG-13,200000000.0,3.859000e+08,2019-05-13,Adventure,132.0,5.2,239755.0
611,612,Toy Story 4,G,200000000.0,1.062000e+09,2019-06-11,Animation,100.0,8.6,527766.0
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,7.594000e+08,2019-07-13,Thriller,136.0,6.0,798036.0
613,614,The Lion King,PG,250000000.0,1.632000e+09,2019-07-09,Drama,118.0,8.1,453481.0


In [11]:
# Convert data types

df['budget'] = df['budget'].astype(int)
df['gross'] = df['gross'].astype(int, errors='raise')
df['release_date'] = pd.to_datetime(df['release_date'])

  return values.astype(dtype, copy=copy)


In [12]:
print(df.dtypes)

movieid                  int64
title                   object
mpaa_rating             object
budget                   int32
gross                    int32
release_date    datetime64[ns]
genre                   object
runtime                float64
rating                 float64
rating_count           float64
dtype: object


In [13]:
df.tail(10) # Problem! Getting negative gross for some titles

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
605,606,Pokemon Detective Pikachu,PG,150000000,431600000,2019-05-03,Mystery,104.0,7.0,933580.0
606,607,Spider-Man: Far from Home,PG-13,160000000,1131000000,2019-06-26,Action,129.0,8.3,156606.0
607,608,Dumbo,PG,170000000,353000000,2019-03-11,Adventure,112.0,6.7,521848.0
608,609,Captain Marvel,PG-13,175000000,1128000000,2019-02-27,Action,124.0,8.1,517524.0
609,610,Aladdin,PG,183000000,1049000000,2019-05-08,Fantasy,128.0,5.6,398571.0
610,611,Godzilla: King of the Monsters,PG-13,200000000,385900000,2019-05-13,Adventure,132.0,5.2,239755.0
611,612,Toy Story 4,G,200000000,1062000000,2019-06-11,Animation,100.0,8.6,527766.0
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000,759400000,2019-07-13,Thriller,136.0,6.0,798036.0
613,614,The Lion King,PG,250000000,1632000000,2019-07-09,Drama,118.0,8.1,453481.0
614,615,Avengers: Endgame,PG-13,356000000,-2147483648,2019-04-22,Action,181.0,6.3,307241.0


In [14]:
# Show the movies with more than 7 in Rating & greater than 50 million Gross
display(df[(df['rating'] > 7) & (df['gross'] > 50000000)])

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
1,2,Driving Miss Daisy,PG,7500000,145793296,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000,71079915,1989-07-28,Crime,100.0,7.2,91415.0
3,4,Born on the Fourth of July,R,14000000,161001698,1989-12-20,War,145.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000,84431625,1989-04-21,Drama,107.0,7.5,101702.0
6,7,When Harry Met Sally...,R,16000000,92800000,1989-07-21,Romance,96.0,7.6,180871.0
...,...,...,...,...,...,...,...,...,...,...
604,605,How to Train Your Dragon: The Hidden World,PG,129000000,519900000,2019-01-03,Animation,104.0,7.1,119080.0
606,607,Spider-Man: Far from Home,PG-13,160000000,1131000000,2019-06-26,Action,129.0,8.3,156606.0
608,609,Captain Marvel,PG-13,175000000,1128000000,2019-02-27,Action,124.0,8.1,517524.0
611,612,Toy Story 4,G,200000000,1062000000,2019-06-11,Animation,100.0,8.6,527766.0


In [15]:
# Show the movies with more than 7 in Rating & greater than 50 million Gross & with Parental guidance as MPAA Rating
display(df[(df['rating'] > 7) & (df['gross'] > 50000000) & (df['mpaa_rating'] == 'PG')])

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
1,2,Driving Miss Daisy,PG,7500000,145793296,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000,71079915,1989-07-28,Crime,100.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000,84431625,1989-04-21,Drama,107.0,7.5,101702.0
7,8,Dead Poets Society,PG,16400000,235860116,1989-06-02,Drama,129.0,8.1,382002.0
13,14,Batman,PG,35000000,411348924,1989-06-23,Action,126.0,7.5,319517.0
...,...,...,...,...,...,...,...,...,...,...
587,588,Ralph Breaks the Internet,PG,175000000,529300000,2018-11-05,Animation,112.0,7.6,210176.0
591,592,Incredibles 2,PG,200000000,1242000000,2018-06-05,Animation,118.0,8.1,947673.0
602,603,The LEGO Movie 2: The Second Part,PG,99000000,191300000,2019-02-07,Animation,107.0,7.8,506522.0
604,605,How to Train Your Dragon: The Hidden World,PG,129000000,519900000,2019-01-03,Animation,104.0,7.1,119080.0


In [16]:
# Count of Animation movies with more than 7 in Rating (use the shape() function)
print(df[(df['genre'] == 'Animation') & (df['rating'] > 7)].shape[0])

50


In [17]:
# Show the list of top 5 movies based on Budget
display(df.sort_values('budget', ascending=False).head(5))

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
594,595,Avengers: Infinity War,PG-13,400000000,2048000000,2018-04-23,Action,149.0,5.5,549778.0
454,455,Pirates of the Caribbean: On Stranger Tides,PG-13,380000000,1045713802,2011-05-14,Action,136.0,6.6,455211.0
614,615,Avengers: Endgame,PG-13,356000000,-2147483648,2019-04-22,Action,181.0,6.3,307241.0
574,575,Star Wars: The Last Jedi,PG-13,317000000,1333000000,2019-12-09,Science Fiction,152.0,5.4,285629.0
375,376,Pirates of the Caribbean: At World's End,PG-13,300000000,961000000,2007-05-19,Adventure,169.0,7.1,565402.0


In [18]:
# Show the top 5 Comedy movies by Rating
display(df[df['genre'] == 'Comedy'].sort_values('rating', ascending=False).head(5))

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
111,112,Forrest Gump,PG-13,55000000,677945399,1994-07-06,Comedy,142.0,8.8,1657851.0
582,583,Deadpool 2,R,110000000,785000000,2018-05-10,Comedy,119.0,8.4,50003.0
537,538,Deadpool,R,58000000,783112979,2016-02-09,Comedy,108.0,8.1,430268.0
185,186,The Truman Show,PG,60000000,264118201,1998-06-04,Comedy,103.0,8.1,859224.0
254,255,"Monsters, Inc.",G,115000000,562816256,2001-11-01,Comedy,92.0,8.0,758349.0


In [19]:
# Top 5 movie names by Rating
print(df.loc[df.sort_values('rating', ascending=False).head(5).index, 'title'])

393                                  The Dark Knight
287    The Lord of the Rings: The Return of the King
99                                      Pulp Fiction
98                                 Jurassic Park III
83                                  Schindler's List
Name: title, dtype: object


In [20]:
# Top 3 high Gross Romance movies released after 1999, not included (typecast it to datetime)
print(df.loc[df[(df['genre'] == 'Romance') & (df['release_date'] > pd.Timestamp('1999-12-31'))].sort_values('gross', ascending=False).head(3).index, 'title'])

464    The Twilight Saga: Breaking Dawn - Part 2
442    The Twilight Saga: Breaking Dawn - Part 1
401                  The Twilight Saga: New Moon
Name: title, dtype: object


In [21]:
# How many Genres are present in the dataframe? (use the function value_counts() which applies to Series, not Dataframe)
print(df['genre'].value_counts())

Action             110
Comedy              99
Animation           87
Drama               66
Thriller            41
Science Fiction     37
Adventure           30
Family              29
Romance             28
Fantasy             27
Crime               17
Horror              14
Mystery             11
War                  9
Western              6
History              4
Name: genre, dtype: int64


In [22]:
# Top 5 expensive movies released after 1999, not included (measured by Budget)
print(df.loc[df[df['release_date'] > pd.Timestamp('1999-12-31')].sort_values('budget', ascending=False).head(5).index, 'title'])

594                         Avengers: Infinity War
454    Pirates of the Caribbean: On Stranger Tides
614                              Avengers: Endgame
574                       Star Wars: The Last Jedi
573                                 Justice League
Name: title, dtype: object

In [23]:
# Most & least frequent MPAA Rating in the dataset in terms of occurrences
# print(df['mpaa_rating'].value_counts())
print(f"The most frequent MPAA Rating is {df['mpaa_rating'].value_counts().index[0]}")
print(f"The most frequent MPAA Rating is {df['mpaa_rating'].mode()[0]}")
print(f"The least frequent MPAA Rating is {df['mpaa_rating'].value_counts().index[-1]}")

The most frequent MPAA Rating is PG-13
The most frequent MPAA Rating is PG-13
The least frequent MPAA Rating is G


In [24]:
# Most and least expensive Genre (take an average of all Budget measures grouped by Genre - use groupBy() method)
print(f"Most expensive genre is {df.groupby('genre')['budget'].mean().sort_values(ascending=False).index[0]}")
print(f"Least expensiv genre is {df.groupby('genre')['budget'].mean().sort_values(ascending=False).index[-1]}")

Most expensive genre is Fantasy
Least expensiv genre is Horror


In [25]:
# Which Genre is favored the most by the people?
# print(df.groupby('genre')['rating'].mean().sort_values(ascending=False))
print(f"The most favored genre is {df.groupby('genre')['rating'].mean().sort_values(ascending=False).index[0]}")

The most favored genre is History
