In [2]:
# Using Python 3 in Jupyter to run analysis on a predetermined dataset.
# Analysis to track parental rating of a Movie/TV Show over time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


Importing the datasets 

In [30]:
# Load the Netflix titles dataset from a locally stored .csv file,
# using the show_id column as the row index.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact location exists; consider a repository-relative path.
netflix_csv_path = r'C:\Users\mark.keogh\Documents\GitHub\-UCDPA_MarkKeogh\Datasets\netflix_titles.csv'
df = pd.read_csv(netflix_csv_path, index_col='show_id')

In [31]:
# Preview the first 5 rows of the loaded data (5 is the default for head())
# as a quick sanity check of the columns and the show_id index.
df.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [32]:
# Summarise the frame: index range, per-column non-null counts,
# column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, s1 to s8807
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   director      6173 non-null   object
 3   cast          7982 non-null   object
 4   country       7976 non-null   object
 5   date_added    8797 non-null   object
 6   release_year  8807 non-null   int64 
 7   rating        8803 non-null   object
 8   duration      8804 non-null   object
 9   listed_in     8807 non-null   object
 10  description   8807 non-null   object
dtypes: int64(1), object(10)
memory usage: 825.7+ KB


In [33]:
# Summary statistics (count, mean, std, min, quartiles, max).
# Only release_year is reported because it is the only numeric (int64) column.
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [34]:
# Check for null (NaN) values: df.isnull() returns a frame of the same shape
# in which True marks a missing value.
# director, cast and country each have many missing values (compare the
# non-null counts reported by df.info()).
# For the purpose of this analysis we do not need director and cast.
# However, for country we will replace missing values with 'No Data'.
df.isnull()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,False,False,False,True,False,False,False,False,False,False,False
s2,False,False,True,False,False,False,False,False,False,False,False
s3,False,False,False,False,True,False,False,False,False,False,False
s4,False,False,True,True,True,False,False,False,False,False,False
s5,False,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
s8803,False,False,False,False,False,False,False,False,False,False,False
s8804,False,False,True,True,True,False,False,False,False,False,False
s8805,False,False,False,False,False,False,False,False,False,False,False
s8806,False,False,False,False,False,False,False,False,False,False,False


In [35]:
# Plain-text print of the first rows; unlike a bare df.head(), a wide frame
# is wrapped into several column groups in this form.
print(df.head())

            type                  title         director  \
show_id                                                    
s1         Movie   Dick Johnson Is Dead  Kirsten Johnson   
s2       TV Show          Blood & Water              NaN   
s3       TV Show              Ganglands  Julien Leclercq   
s4       TV Show  Jailbirds New Orleans              NaN   
s5       TV Show           Kota Factory              NaN   

                                                      cast        country  \
show_id                                                                     
s1                                                     NaN  United States   
s2       Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
s3       Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
s4                                                     NaN            NaN   
s5       Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

                 date_added  release_ye

In [36]:
# Substitute the placeholder "No Data" wherever country is missing (NaN),
# so that every row carries a string value in this column.
df['country'] = df['country'].fillna(value="No Data")

In [37]:
# Confirm the replacement: rows with a missing country now show 'No Data'.
df.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",No Data,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
s4,TV Show,Jailbirds New Orleans,,,No Data,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [66]:
# Build a slimmer working frame without the columns this analysis does not need.
# drop() returns a new DataFrame, so df itself keeps all of its columns.
df2 = df.drop(columns=['director', 'cast', 'description', 'duration'])

In [None]:
# Now left with 7 columns, we can now group by 'type' to separate TV Shows and Movies

Analysing the Data

In [68]:
# Sorting the data by release_year (newest first) and then by rating
# (ascending) within each year. The sorted frame is only displayed here;
# df2 itself is left in its original order.
df2.sort_values(
    by=["release_year", "rating"],
    ascending=[False, True],
)

Unnamed: 0_level_0,type,title,country,date_added,release_year,rating,listed_in
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
s7,Movie,My Little Pony: A New Generation,No Data,"September 24, 2021",2021,PG,Children & Family Movies
s46,Movie,My Heroes Were Cowboys,No Data,"September 16, 2021",2021,PG,Documentaries
s232,Movie,The Water Man,United States,"August 25, 2021",2021,PG,"Children & Family Movies, Dramas"
s301,Movie,Vivo,"Canada, United States","August 6, 2021",2021,PG,"Children & Family Movies, Music & Musicals"
s394,Movie,A Second Chance: Rivals!,Australia,"July 23, 2021",2021,PG,"Children & Family Movies, Sports Movies"
...,...,...,...,...,...,...,...
s8740,Movie,Why We Fight: The Battle of Russia,United States,"March 31, 2017",1943,TV-PG,Documentaries
s8764,Movie,WWII: Report from the Aleutians,United States,"March 31, 2017",1943,TV-PG,Documentaries
s7791,Movie,Prelude to War,United States,"March 31, 2017",1942,TV-14,"Classic Movies, Documentaries"
s8206,Movie,The Battle of Midway,United States,"March 31, 2017",1942,TV-14,"Classic Movies, Documentaries"


In [69]:
# Number of releases per year, with the most frequent year listed first.
releases_per_year = df2["release_year"].value_counts()
print(releases_per_year)


2018    1147
2017    1032
2019    1030
2020     953
2016     902
        ... 
1959       1
1925       1
1961       1
1947       1
1966       1
Name: release_year, Length: 74, dtype: int64


In [77]:
# Selecting data with .query(): titles whose country was filled with
# 'No Data' and whose release_year is later than 2020.
df2.query("country == 'No Data' and release_year > 2020")

Unnamed: 0_level_0,type,title,country,date_added,release_year,rating,listed_in
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
s3,TV Show,Ganglands,No Data,"September 24, 2021",2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act..."
s4,TV Show,Jailbirds New Orleans,No Data,"September 24, 2021",2021,TV-MA,"Docuseries, Reality TV"
s6,TV Show,Midnight Mass,No Data,"September 24, 2021",2021,TV-MA,"TV Dramas, TV Horror, TV Mysteries"
s7,Movie,My Little Pony: A New Generation,No Data,"September 24, 2021",2021,PG,Children & Family Movies
s11,TV Show,"Vendetta: Truth, Lies and The Mafia",No Data,"September 24, 2021",2021,TV-MA,"Crime TV Shows, Docuseries, International TV S..."
...,...,...,...,...,...,...,...
s1251,Movie,Connected,No Data,"March 1, 2021",2021,TV-G,"Documentaries, International Movies, Music & M..."
s1263,Movie,Ginny & Georgia - The Afterparty,No Data,"February 26, 2021",2021,TV-MA,"Comedies, Dramas"
s1300,TV Show,Thus Spoke Kishibe Rohan,No Data,"February 18, 2021",2021,TV-MA,"Anime Series, International TV Shows"
s1419,Movie,Chris Rock Total Blackout: The Tamborine Exten...,No Data,"January 12, 2021",2021,TV-MA,Stand-Up Comedy


In [100]:
# Grouping by column values.
# Here we are grouping by type and showing the minimum (earliest) release year.
# The aggregation is given as the string "min" rather than the Python builtin
# `min`: the result and the 'min' column label are the same, but passing the
# builtin callable raises a FutureWarning in pandas >= 2.1.

print(df2.groupby("type")["release_year"].agg(["min"]))

          min
type         
Movie    1942
TV Show  1925


In [90]:
# Iterating over a DataFrame walks its column labels (not its rows),
# so this loop visits the 'type' and 'rating' columns one at a time.
for column in df2[['type', 'rating']].columns:

    # Look the column up by name with the [] operator to get it as a Series,
    # then show its label and its underlying array of values.
    columnSeriesObj = df2[column]
    print('Column Name : ', column)
    print('Column Contents : ', columnSeriesObj.values)

Column Name :  type
Column Contents :  ['Movie' 'TV Show' 'TV Show' ... 'Movie' 'Movie' 'Movie']
Column Name :  rating
Column Contents :  ['PG-13' 'TV-MA' 'TV-MA' ... 'R' 'PG' 'TV-14']


In [108]:
# Row-by-row demonstration: iterrows() yields (index label, row Series) pairs,
# and each pass writes the upper-cased country into a "COUNTRY" column
# (the column is created by the first assignment).
for index, row in df2.iterrows():
    country_upper = row['country'].upper()
    df2.loc[index, "COUNTRY"] = country_upper
df2.head()

Unnamed: 0_level_0,type,title,country,date_added,release_year,rating,listed_in,COUNTRY
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
s1,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,Documentaries,UNITED STATES
s2,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries",SOUTH AFRICA
s3,TV Show,Ganglands,No Data,"September 24, 2021",2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",NO DATA
s4,TV Show,Jailbirds New Orleans,No Data,"September 24, 2021",2021,TV-MA,"Docuseries, Reality TV",NO DATA
s5,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",INDIA


In [111]:
# Using .apply() instead of iterrows and a for loop to build the column in a
# single statement: str.lower is called on each country value.
# Lowercase (rather than uppercase) is used so the result is visibly different
# from the loop version, whose COUNTRY column this assignment overwrites.
df2["COUNTRY"] = df2["country"].apply(str.lower)

In [112]:
# Check the result: COUNTRY now holds the lower-cased country names.
df2.head()

Unnamed: 0_level_0,type,title,country,date_added,release_year,rating,listed_in,COUNTRY
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
s1,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,Documentaries,united states
s2,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries",south africa
s3,TV Show,Ganglands,No Data,"September 24, 2021",2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",no data
s4,TV Show,Jailbirds New Orleans,No Data,"September 24, 2021",2021,TV-MA,"Docuseries, Reality TV",no data
s5,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",india


In [120]:
# For purposes of project assessment we now merge the two dataframes
# (df and df2) on show_id. how='right' keeps the keys of df2; a column
# present in both frames keeps its plain name for the df copy and gets the
# '_new2' suffix for the df2 copy.
df_show_id = pd.merge(df, df2, on="show_id", how='right', suffixes=('', '_new2'))

In [121]:
# Inspect the merged frame: df's columns first, then df2's suffixed copies.
# NOTE(review): in the recorded output release_year (from df) shows the string
# "description" while release_year_new2 shows the years - it looks like df was
# modified by a cell that is no longer in the notebook; confirm with
# Restart Kernel -> Run All.
df_show_id.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type_new2,title_new2,country_new2,date_added_new2,release_year_new2,rating_new2,listed_in_new2,COUNTRY
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",description,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,Documentaries,united states
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",description,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries",south africa
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",No Data,"September 24, 2021",description,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,TV Show,Ganglands,No Data,"September 24, 2021",2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",no data
s4,TV Show,Jailbirds New Orleans,,,No Data,"September 24, 2021",description,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",TV Show,Jailbirds New Orleans,No Data,"September 24, 2021",2021,TV-MA,"Docuseries, Reality TV",no data
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",description,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",india


In [125]:
# To merge only the columns that differ between the two frames:
# Index.difference gives the df2 column labels that are absent from df, and
# the index-on-index outer merge attaches just those columns to df.
cols_to_use = df2.columns.difference(df.columns)
dfNew = pd.merge(
    df,
    df2[cols_to_use],
    left_index=True,
    right_index=True,
    how='outer',
    suffixes=('', '_new2'),
)

In [127]:
# Inspect the result: df's original columns plus the single added COUNTRY column.
dfNew.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,COUNTRY
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",description,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",united states
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",description,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",south africa
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",No Data,"September 24, 2021",description,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,no data
s4,TV Show,Jailbirds New Orleans,,,No Data,"September 24, 2021",description,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",no data
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",description,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,india
