In [63]:
import numpy as np
import pandas as pd
import random

df = pd.read_csv('Sprint 1 - Movies Data Set Analysis/movies.csv') # Gathering data
df = df.drop(columns=['Summary']) # remove the column "Summary"
df.columns = df.columns.str.lower().str.replace(' ', '_') # Make all the column names lowercase and replace spaces with underscores
df.drop(df.tail(1).index, inplace=True) # Remove the last row of the DF (containing NaNs)

In [64]:
print(df.tail(5)) # See the first five rows of the Data Frame

     movieid                                  title mpaa_rating       budget  \
610      611         Godzilla: King of the Monsters       PG-13  200000000.0   
611      612                            Toy Story 4           G  200000000.0   
612      613  Fast & Furious Presents: Hobbs & Shaw       PG-13  200000000.0   
613      614                          The Lion King          PG  250000000.0   
614      615                      Avengers: Endgame       PG-13  356000000.0   

            gross release_date      genre  runtime  rating  rating_count  
610  3.859000e+08   2019-05-13  Adventure    132.0     NaN           NaN  
611  1.062000e+09   2019-06-11  Animation    100.0     NaN           NaN  
612  7.594000e+08   2019-07-13   Thriller    136.0     NaN           NaN  
613  1.632000e+09   2019-07-09      Drama    118.0     NaN           NaN  
614  2.796000e+09   2019-04-22     Action    181.0     NaN           NaN  


In [65]:
# Fill NaNs with a random value between (mean — 2 * std) & (mean + 2 * std)

def nan_rand_fill(data, key):
    mn = data[key].mean()
    sd = data[key].std()
    low_l = mn - 2 * sd
    high_l = mn + 2 * sd
    nans = data[key].isna()
    df.loc[nans, key] = [abs(np.floor(random.uniform(low_l, high_l))) if 'count' in key 
    else abs(round(random.uniform(low_l, high_l), 1)) for i in range(nans.sum())]

nan_rand_fill(df, 'rating')
nan_rand_fill(df, 'rating_count')

In [66]:
print(df.tail(5)) # See the first five rows of the Data Frame

     movieid                                  title mpaa_rating       budget  \
610      611         Godzilla: King of the Monsters       PG-13  200000000.0   
611      612                            Toy Story 4           G  200000000.0   
612      613  Fast & Furious Presents: Hobbs & Shaw       PG-13  200000000.0   
613      614                          The Lion King          PG  250000000.0   
614      615                      Avengers: Endgame       PG-13  356000000.0   

            gross release_date      genre  runtime  rating  rating_count  
610  3.859000e+08   2019-05-13  Adventure    132.0     7.0      192107.0  
611  1.062000e+09   2019-06-11  Animation    100.0     6.8      917790.0  
612  7.594000e+08   2019-07-13   Thriller    136.0     6.2      373793.0  
613  1.632000e+09   2019-07-09      Drama    118.0     5.4      356247.0  
614  2.796000e+09   2019-04-22     Action    181.0     6.1      899833.0  


In [67]:
print(df.shape) # Shape of the Data Frame

(615, 10)


In [68]:
print(df.dtypes) # Column names with their data types

movieid           int64
title            object
mpaa_rating      object
budget          float64
gross           float64
release_date     object
genre            object
runtime         float64
rating          float64
rating_count    float64
dtype: object


In [69]:
print(df.columns) # Column names

Index(['movieid', 'title', 'mpaa_rating', 'budget', 'gross', 'release_date',
       'genre', 'runtime', 'rating', 'rating_count'],
      dtype='object')


In [70]:
print(df.isna().sum()) # Column names with their NaN count

movieid         0
title           0
mpaa_rating     0
budget          0
gross           0
release_date    0
genre           0
runtime         0
rating          0
rating_count    0
dtype: int64


In [71]:
df.drop_duplicates(subset=['title', 'release_date']) # Remove row duplicates by title and release date

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
0,1,Look Who's Talking,PG-13,7500000.0,2.960000e+08,1989-10-12,Romance,93.0,5.9,73638.0
1,2,Driving Miss Daisy,PG,7500000.0,1.457933e+08,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000.0,7.107992e+07,1989-07-28,Crime,100.0,7.2,91415.0
3,4,Born on the Fourth of July,R,14000000.0,1.610017e+08,1989-12-20,War,145.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000.0,8.443162e+07,1989-04-21,Drama,107.0,7.5,101702.0
...,...,...,...,...,...,...,...,...,...,...
610,611,Godzilla: King of the Monsters,PG-13,200000000.0,3.859000e+08,2019-05-13,Adventure,132.0,7.0,192107.0
611,612,Toy Story 4,G,200000000.0,1.062000e+09,2019-06-11,Animation,100.0,6.8,917790.0
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,7.594000e+08,2019-07-13,Thriller,136.0,6.2,373793.0
613,614,The Lion King,PG,250000000.0,1.632000e+09,2019-07-09,Drama,118.0,5.4,356247.0


In [75]:
# Convert data types

df['budget'] = df['budget'].astype(int)
df['gross'] = df['gross'].astype(int, errors='raise')
df['release_date'] = pd.to_datetime(df['release_date'])

In [73]:
print(df.dtypes)

movieid                  int64
title                   object
mpaa_rating             object
budget                   int32
gross                    int32
release_date    datetime64[ns]
genre                   object
runtime                float64
rating                 float64
rating_count           float64
dtype: object


In [76]:
df.tail(10) # Problem! Getting negative gross for some titles

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count
605,606,Pokemon Detective Pikachu,PG,150000000,431600000,2019-05-03,Mystery,104.0,5.5,381043.0
606,607,Spider-Man: Far from Home,PG-13,160000000,1131000000,2019-06-26,Action,129.0,6.2,21444.0
607,608,Dumbo,PG,170000000,353000000,2019-03-11,Adventure,112.0,8.1,112317.0
608,609,Captain Marvel,PG-13,175000000,1128000000,2019-02-27,Action,124.0,8.2,706466.0
609,610,Aladdin,PG,183000000,1049000000,2019-05-08,Fantasy,128.0,7.6,43122.0
610,611,Godzilla: King of the Monsters,PG-13,200000000,385900000,2019-05-13,Adventure,132.0,7.0,192107.0
611,612,Toy Story 4,G,200000000,1062000000,2019-06-11,Animation,100.0,6.8,917790.0
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000,759400000,2019-07-13,Thriller,136.0,6.2,373793.0
613,614,The Lion King,PG,250000000,1632000000,2019-07-09,Drama,118.0,5.4,356247.0
614,615,Avengers: Endgame,PG-13,356000000,-2147483648,2019-04-22,Action,181.0,6.1,899833.0


In [83]:
# Show the movies with more than 7 in Rating & greater than 50 million Gross
print(df[(df['rating'] > 7) & (df['gross'] > 50000000)])

     movieid                                       title mpaa_rating  \
1          2                          Driving Miss Daisy          PG   
2          3                              Turner & Hooch          PG   
3          4                  Born on the Fourth of July           R   
4          5                             Field of Dreams          PG   
6          7                     When Harry Met Sally...           R   
..       ...                                         ...         ...   
601      602               Once Upon a Time in Hollywood           R   
604      605  How to Train Your Dragon: The Hidden World          PG   
607      608                                       Dumbo          PG   
608      609                              Captain Marvel       PG-13   
609      610                                     Aladdin          PG   

        budget       gross release_date      genre  runtime  rating  \
1      7500000   145793296   1989-12-13     Comedy     99.0     

In [85]:
# Show the movies with more than 7 in Rating & greater than 50 million Gross & with Parental guidance as MPAA Rating
print(df[(df['rating'] > 7) & (df['gross'] > 50000000) & (df['mpaa_rating'] == 'PG')])

     movieid                                       title mpaa_rating  \
1          2                          Driving Miss Daisy          PG   
2          3                              Turner & Hooch          PG   
4          5                             Field of Dreams          PG   
7          8                          Dead Poets Society          PG   
13        14                                      Batman          PG   
..       ...                                         ...         ...   
581      582           Spider-Man: Into the Spider-Verse          PG   
600      601                   The Secret Life of Pets 2          PG   
604      605  How to Train Your Dragon: The Hidden World          PG   
607      608                                       Dumbo          PG   
609      610                                     Aladdin          PG   

        budget       gross release_date      genre  runtime  rating  \
1      7500000   145793296   1989-12-13     Comedy     99.0     

In [95]:
# Count of Animation movies with more than 7 in Rating (use the shape() function)
print(df[(df['genre'] == 'Animation') & (df['rating'] > 7)].shape[0])

52


In [98]:
# Show the list of top 5 movies based on Budget
print(df.sort_values('budget', ascending=False).head(5))

     movieid                                        title mpaa_rating  \
594      595                       Avengers: Infinity War       PG-13   
454      455  Pirates of the Caribbean: On Stranger Tides       PG-13   
614      615                            Avengers: Endgame       PG-13   
574      575                     Star Wars: The Last Jedi       PG-13   
375      376     Pirates of the Caribbean: At World's End       PG-13   

        budget       gross release_date            genre  runtime  rating  \
594  400000000  2048000000   2018-04-23           Action    149.0     7.1   
454  380000000  1045713802   2011-05-14           Action    136.0     6.6   
614  356000000 -2147483648   2019-04-22           Action    181.0     6.1   
574  317000000  1333000000   2019-12-09  Science Fiction    152.0     5.4   
375  300000000   961000000   2007-05-19        Adventure    169.0     7.1   

     rating_count  
594       26931.0  
454      455211.0  
614      899833.0  
574       40736.0 