In [13]:
import numpy as np
import pandas as pd
import pint 

df = pd.read_csv("movies_2.csv")
df.head(5)

# print(df.shape)

# print(df.info())

print(df.dtypes)

print('null values', df.isna().any())
print(df.isna().sum())
df.describe()


MovieID           int64
Title            object
MPAA Rating      object
Budget          float64
Gross           float64
Release Date     object
Genre            object
Runtime         float64
Rating          float64
Rating Count    float64
Summary          object
dtype: object
null values MovieID         False
Title            True
MPAA Rating      True
Budget           True
Gross            True
Release Date     True
Genre            True
Runtime          True
Rating           True
Rating Count     True
Summary          True
dtype: bool
MovieID           0
Title             1
MPAA Rating       1
Budget            1
Gross             1
Release Date      1
Genre             1
Runtime           1
Rating          108
Rating Count    108
Summary         120
dtype: int64


Unnamed: 0,MovieID,Budget,Gross,Runtime,Rating,Rating Count
count,616.0,615.0,615.0,615.0,508.0,508.0
mean,307.5,94917100.0,445322100.0,118.642276,6.917323,339252.1
std,177.968162,67481140.0,339407500.0,22.252376,0.888928,321338.8
min,0.0,60000.0,53000000.0,79.0,4.1,14918.0
25%,153.75,40000000.0,215893900.0,102.0,6.4,127592.2
50%,307.5,80000000.0,351040400.0,117.0,6.9,240347.5
75%,461.25,140000000.0,585176600.0,132.0,7.6,425700.0
max,615.0,400000000.0,2796000000.0,201.0,9.0,2127228.0


# Epic 3: Data Cleaning & Exploratory Data Analysis


## remove irrelevant data

In [14]:
df.drop("Summary", axis = 1)

Unnamed: 0,MovieID,Title,MPAA Rating,Budget,Gross,Release Date,Genre,Runtime,Rating,Rating Count
0,1,Look Who's Talking,PG-13,7500000.0,2.960000e+08,1989-10-12,Romance,93.0,5.9,73638.0
1,2,Driving Miss Daisy,PG,7500000.0,1.457933e+08,1989-12-13,Comedy,99.0,7.4,91075.0
2,3,Turner & Hooch,PG,13000000.0,7.107992e+07,1989-07-28,Crime,100.0,7.2,91415.0
3,4,Born on the Fourth of July,R,14000000.0,1.610017e+08,1989-12-20,War,145.0,7.2,91415.0
4,5,Field of Dreams,PG,15000000.0,8.443162e+07,1989-04-21,Drama,107.0,7.5,101702.0
...,...,...,...,...,...,...,...,...,...,...
611,612,Toy Story 4,G,200000000.0,1.062000e+09,2019-06-11,Animation,100.0,,
612,613,Fast & Furious Presents: Hobbs & Shaw,PG-13,200000000.0,7.594000e+08,2019-07-13,Thriller,136.0,,
613,614,The Lion King,PG,250000000.0,1.632000e+09,2019-07-09,Drama,118.0,,
614,615,Avengers: Endgame,PG-13,356000000.0,2.796000e+09,2019-04-22,Action,181.0,,


## Standarize
For strings, make sure all values are either in lower or upper case. For numerical values, make sure all values have a certain measurement unit. Height, for example, can be in meters and centimeters, or feet and inches.    

In [15]:
#to change one column at a time
# title_lower = df["Title"]=df["Title"].str.lower()
# print(title_lower)

# change multiple columns

df = df.map(lambda s: s.lower() if type(s) == str else s)



# df['Budget'] = df['Budget'].apply(lambda x: "${:.1f}k".format((x/1000)))
# df_lower_case = df.rename(str.lower, axis=1)
# df_renamed = df_lower_case.rename(columns = {"mpaa rating" : "mpaa_rating", "release date" : "release_date", "rating count" : "rating_count"})

#titles into lower case 

# df_renamed = df_renamed.rename(str.lower, axis=1)


# df_renamed = df.columns.replace(" ", "_")


# # df_formatted = df_renamed.style.format({"budget" : "{:,.0f}$",
#                         "gross" : "{:,.0f}$",
#                         "runtime" : "{:,.0f} min",
#                         "rating" : "{:,.1f}",
#                         # "rating_count" : "{:,.0f}"})


df.columns = df.columns.str.lower().str.replace(' ','_')







        

# mystring.replace(" ", "_")
# url = "_".join( title.split() )
# isinstance( df['Budget'],object)

# Remove row with null values

In [16]:
df3 = df.drop(615)

# Replace missing values in rating with the median  

In [17]:
# value to refill is a dictionary in this case, key indicates column.
df4 = df3.fillna({'rating': df3['rating'].median()}, inplace=False )


# Remove duplicates

In [18]:
df5 = df4.drop_duplicates()

# Type Conversion
Convert Budget into integer type (use the astype() method)

Convert Gross into integer type

Convert Release Date into date type (use the to_datetime method)

In [19]:
df6 = df5.astype({"budget":int,"gross":int})
df6['gross']




0       296000000
1       145793296
2        71079915
3       161001698
4        84431625
          ...    
610     385900000
611    1062000000
612     759400000
613    1632000000
614    2796000000
Name: gross, Length: 615, dtype: int64

In [20]:
df6['release_date']= pd.to_datetime(df6["release_date"])

df6.dtypes






movieid                  int64
title                   object
mpaa_rating             object
budget                   int64
gross                    int64
release_date    datetime64[ns]
genre                   object
runtime                float64
rating                 float64
rating_count           float64
summary                 object
dtype: object

# Exploratory Analysis

In [21]:
df7 = df6.style.format({"budget" : "{:,.0f}$",
                        "gross" : "{:,.0f}$",
                        "runtime" : "{:,.0f} min",
                        "rating" : "{:,.1f}",
                        "rating_count" : "{:,.0f}"})

## show the movies with more than 7 in Rating & greater than 50 million Gross

In [22]:
df6[(df6['rating'] > 7) & (df6["gross"] > 50000000)]

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
1,2,driving miss daisy,pg,7500000,145793296,1989-12-13,comedy,99.0,7.4,91075.0,an old jewish woman and her african-american c...
2,3,turner & hooch,pg,13000000,71079915,1989-07-28,crime,100.0,7.2,91415.0,"det. scott turner (tom hanks) is an uptight, b..."
3,4,born on the fourth of july,r,14000000,161001698,1989-12-20,war,145.0,7.2,91415.0,the biography of ron kovic. paralyzed in the v...
4,5,field of dreams,pg,15000000,84431625,1989-04-21,drama,107.0,7.5,101702.0,"an iowa corn farmer, hearing voices, interpret..."
6,7,when harry met sally...,r,16000000,92800000,1989-07-21,romance,96.0,7.6,180871.0,harry and sally have known each other for year...
...,...,...,...,...,...,...,...,...,...,...,...
505,506,big hero 6,pg,165000000,652105443,2014-10-24,animation,102.0,7.8,380953.0,the special bond that develops between plus-si...
506,507,interstellar,pg-13,165000000,675120017,2014-11-05,science fiction,169.0,8.6,1343549.0,a team of explorers travel through a wormhole ...
507,508,captain america: the winter soldier,pg-13,170000000,714766572,2014-03-20,action,136.0,7.7,685903.0,as steve rogers struggles to embrace his role ...
508,509,dawn of the planet of the apes,pg-13,170000000,710644566,2014-06-26,science fiction,130.0,7.6,395425.0,a growing nation of genetically evolved apes l...


## show the movies with more than 7 in Rating & greater than 50 million Gross & with Parental guidance as MPAA Rating

In [23]:
df6[(df6['rating'] > 7) & (df6["gross"] > 50000000) & (df6["mpaa_rating"] == 'pg')].head(5)

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
1,2,driving miss daisy,pg,7500000,145793296,1989-12-13,comedy,99.0,7.4,91075.0,an old jewish woman and her african-american c...
2,3,turner & hooch,pg,13000000,71079915,1989-07-28,crime,100.0,7.2,91415.0,"det. scott turner (tom hanks) is an uptight, b..."
4,5,field of dreams,pg,15000000,84431625,1989-04-21,drama,107.0,7.5,101702.0,"an iowa corn farmer, hearing voices, interpret..."
7,8,dead poets society,pg,16400000,235860116,1989-06-02,drama,129.0,8.1,382002.0,english teacher john keating inspires his stud...
13,14,batman,pg,35000000,411348924,1989-06-23,action,126.0,7.5,319517.0,the dark knight of gotham city begins his war ...


## show the count of Animation movies with more than 7 in Rating (use the shape() method)

In [24]:
df6[(df6['genre'] == 'animation') & (df6["rating"] > 7)].shape

(39, 11)

## show the top 5 movies based on Budget

In [25]:
df6.sort_values("budget", ascending=False).head(5)




Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
594,595,avengers: infinity war,pg-13,400000000,2048000000,2018-04-23,action,149.0,6.9,,
454,455,pirates of the caribbean: on stranger tides,pg-13,380000000,1045713802,2011-05-14,action,136.0,6.6,455211.0,jack sparrow and barbossa embark on a quest to...
614,615,avengers: endgame,pg-13,356000000,2796000000,2019-04-22,action,181.0,6.9,,
574,575,star wars: the last jedi,pg-13,317000000,1333000000,2019-12-09,science fiction,152.0,6.9,,
375,376,pirates of the caribbean: at world's end,pg-13,300000000,961000000,2007-05-19,adventure,169.0,7.1,565402.0,"captain barbossa, will turner and elizabeth sw..."


## show the top 5 Comedy movies by Rating

In [26]:
df6[df6['genre'] == "comedy"]
df6[df6['genre'] == "comedy"].sort_values("rating", ascending=False).head(5)

df6[df6['genre'] == "comedy"].nlargest(5, 'rating')
#try nlargest.

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
111,112,forrest gump,pg-13,55000000,677945399,1994-07-06,comedy,142.0,8.8,1657851.0,"the presidencies of kennedy and johnson, the e..."
185,186,the truman show,pg,60000000,264118201,1998-06-04,comedy,103.0,8.1,859224.0,an insurance salesman discovers his whole life...
80,81,groundhog day,pg,14600000,70906973,1993-02-11,comedy,101.0,8.0,549538.0,a weatherman finds himself inexplicably living...
254,255,"monsters, inc.",g,115000000,562816256,2001-11-01,comedy,92.0,8.0,758349.0,"in order to power the city, monsters have to s..."
40,41,fried green tomatoes,pg-13,11000000,119418501,1991-12-27,comedy,130.0,7.7,62493.0,a housewife who is unhappy with her life befri...


## top 5 movie names by Rating

In [27]:
df6.sort_values("rating", ascending=False).head(5)


Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
393,394,the dark knight,pg-13,185000000,1004558444,2008-07-16,action,152.0,9.0,2127228.0,when the menace known as the joker wreaks havo...
98,99,jurassic park iii,pg-13,93000000,368800000,2001-07-16,thriller,92.0,8.9,1690474.0,a decidedly odd couple with ulterior motives ...
99,100,pulp fiction,r,8000000,213928762,1994-09-10,thriller,154.0,8.9,1690474.0,"the lives of two mob hitmen, a boxer, a gangst..."
83,84,schindler's list,r,22000000,321365567,1993-11-29,history,195.0,8.9,1117322.0,"in german-occupied poland during world war ii,..."
287,288,the lord of the rings: the return of the king,pg-13,94000000,1118888979,2003-12-01,fantasy,201.0,8.9,1529953.0,gandalf and aragorn lead the world of men agai...


## top 3 high Gross Romance movies released after 1999 (typecast it to datetime)

In [None]:
from datetime import datetime
df6[(df6['genre'] == "romance") & (df6["release_date"] > datetime.strptime("1999-12-31", "%Y-%m-%d"))].sort_values("gross", ascending=False).head(3)

# romance_movies = movies[(movies['genre'] == 'Romance') & (movies['release_date'].dt.year > 1999)]

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
464,465,the twilight saga: breaking dawn - part 2,pg-13,120000000,829000000,2012-11-13,romance,115.0,5.5,218357.0,"after the birth of renesmee/nessie, the cullen..."
442,443,the twilight saga: breaking dawn - part 1,pg-13,110000000,712171856,2011-03-15,romance,117.0,4.9,211592.0,the quileutes close in on expecting parents ed...
401,402,the twilight saga: new moon,pg-13,50000000,709827462,2009-03-15,romance,130.0,4.7,252223.0,edward leaves bella after an attack that nearl...


## how many Genres are present in the dataframe? (use value_counts() method which applies to Series, not Dataframe)

In [29]:
genre_series = df6['genre']
print(len(genre_series.value_counts()))
print(genre_series.value_counts())

16
genre
action             110
comedy              99
animation           87
drama               66
thriller            41
science fiction     37
adventure           30
family              29
romance             28
fantasy             27
crime               17
horror              14
mystery             11
war                  9
western              6
history              4
Name: count, dtype: int64


## top 5 most expensive movies released after 1999 (measured by Budget)

In [30]:
df6[df6["release_date"] > datetime.strptime("1999-12-31", "%Y-%m-%d")].sort_values("budget", ascending=False).head(3)

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
594,595,avengers: infinity war,pg-13,400000000,2048000000,2018-04-23,action,149.0,6.9,,
454,455,pirates of the caribbean: on stranger tides,pg-13,380000000,1045713802,2011-05-14,action,136.0,6.6,455211.0,jack sparrow and barbossa embark on a quest to...
614,615,avengers: endgame,pg-13,356000000,2796000000,2019-04-22,action,181.0,6.9,,


## most & least frequent MPAA Rating in the dataset in terms of occurrences


In [31]:
# rating_series = df6['mpaa_rating'].explode()
rating_series = df6["mpaa_rating"]
print(rating_series.value_counts())

print("Most frequent MPAA rating :", rating_series.max())
print("Less frequent MPAA rating :",rating_series.min())

mpaa_rating
pg-13    285
pg       161
r        142
g         27
Name: count, dtype: int64
Most frequent MPAA rating : r
Less frequent MPAA rating : g


## most & least expensive Genre (take an average of all Budget measures grouped by Genre - use the groupBy() method)

In [32]:
budget_avg_genre = df6.groupby('genre')["budget"].mean().sort_values()
least_expensive_genre = budget_avg_genre.head(1)
most_expensive_genre = budget_avg_genre.tail(1)

print(budget_avg_genre)

print(least_expensive_genre)

print("most expensive genre: ", most_expensive_genre)


genre
horror             3.700429e+07
romance            4.540020e+07
comedy             4.957727e+07
history            5.550000e+07
drama              5.592576e+07
crime              5.658824e+07
family             6.268966e+07
western            7.983333e+07
thriller           8.658537e+07
war                9.075556e+07
mystery            1.035455e+08
animation          1.158793e+08
science fiction    1.267838e+08
adventure          1.338333e+08
action             1.441682e+08
fantasy            1.612111e+08
Name: budget, dtype: float64
genre
horror    3.700429e+07
Name: budget, dtype: float64
most expensive genre:  genre
fantasy    1.612111e+08
Name: budget, dtype: float64


## which Genre is favoured the most?


In [33]:
genre_rating = df6.groupby("genre")["rating"].mean().sort_values()

print( genre_rating.idxmax())

genre_rating.tail(1)

history


genre
history    7.625
Name: rating, dtype: float64

# Multi Index subsetting
pass a tuple of outer and inner column


In [None]:
genre_rating = df6.groupby("genre")["rating"].value_counts(normalize=True)

genre_rating["action",6.6]
genre_rating["western",8.0]


genre    rating
action   6.9       0.345455
         6.7       0.063636
         7.0       0.054545
         6.2       0.036364
         6.6       0.036364
                     ...   
western  6.9       0.166667
         7.6       0.166667
         8.0       0.166667
         8.2       0.166667
         8.4       0.166667
Name: proportion, Length: 281, dtype: float64

In [47]:
# number of observations
n = df6.groupby("genre")["rating"].count()
print(n)

n["comedy"]

genre
action             110
adventure           30
animation           87
comedy              99
crime               17
drama               66
family              29
fantasy             27
history              4
horror              14
mystery             11
romance             28
science fiction     37
thriller            41
war                  9
western              6
Name: rating, dtype: int64


np.int64(99)

In [None]:
comedy_count= df6[df6["genre"]== "comedy"]
len(comedy_count)

Unnamed: 0,movieid,title,mpaa_rating,budget,gross,release_date,genre,runtime,rating,rating_count,summary
1,2,driving miss daisy,pg,7500000,145793296,1989-12-13,comedy,99.0,7.4,91075.0,an old jewish woman and her african-american c...
8,9,parenthood,pg-13,20000000,126297830,1989-07-31,comedy,124.0,7.0,41866.0,the buckmans are a midwestern family all deali...
9,10,lethal weapon 2,r,25000000,227853986,1989-07-07,comedy,114.0,7.2,151737.0,riggs and murtaugh are on the trail of south a...
10,11,the war of the roses,r,26000000,160200000,1989-12-08,comedy,116.0,6.8,45248.0,a married couple try everything to get each ot...
14,15,ghostbusters ii,pg,37000000,215394738,1989-06-15,comedy,108.0,6.6,171196.0,the discovery of a massive river of ectoplasm ...
...,...,...,...,...,...,...,...,...,...,...,...
516,517,pitch perfect 2,pg-13,29000000,287506194,2015-05-07,comedy,115.0,6.9,,
537,538,deadpool,r,58000000,783112979,2016-02-09,comedy,108.0,6.9,,
582,583,deadpool 2,r,110000000,785000000,2018-05-10,comedy,119.0,6.9,,
597,598,the upside,pg-13,37500000,122800000,2019-01-11,comedy,126.0,6.9,,
