# Hackathon Notebook

In [474]:
import pandas as pd
import numpy as np

In [475]:
raw = pd.read_csv('data/netflix_titles.csv')

In [476]:
raw.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


In [477]:
raw.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [478]:
raw.shape

(8807, 12)

In [479]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


Changes:
- Drop 'description'
- Split 'duration' into 'feature length' and 'Season length'

In [523]:
df = raw.drop('description', axis=1).copy()
df = df.rename({'show_id':'show_id','type':'type', 'title':'title', 'director':'director', 'cast':'cast', 'country':'country', 'date_added':'date_added', 'release_year':'release_year','rating':'rating','duration':'duration', 'listed_in':'categories'})

In [525]:
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries"
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."


In [483]:
# Converting the Date added into a DateTime format
df['date_added'] = pd.to_datetime(df['date_added'],format='mixed')
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month_name()
df['day_of_week_added'] = df['date_added'].dt.day_name()
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,year_added,month_added,day_of_week_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,2021.0,September,Saturday
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",2021.0,September,Friday
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",2021.0,September,Friday


In [484]:
# df[df['country'].isnull()]
df['country'] = np.where(df['country'].isnull(), 'Unknown', df['country'])


In [485]:
# df['duration'].info()
# Convert missing values into the Mode
df['duration'] = np.where(df['duration'].isnull(), df['duration'].mode()[0], df['duration'])

# Split 'duration' into 'seasons' and 'movie_length' columns
df['seasons'] = np.where(df['duration'].str.contains('Season'), df['duration'], np.nan)
df['movie_length (mins)'] = np.where(df['duration'].str.contains('Season'), np.nan, df['duration'])

# Remove the 'string' to leave the data as numbers

df['seasons'] = df['seasons'].str.replace(' Season', '')
df['seasons'] = df['seasons'].str.replace('s','')

df['movie_length (mins)']  = df['movie_length (mins)'].str.replace(' min', '')

# convert string number into numeric numbers (floats)
df['duration'] = df['duration'].str.replace(' Season', '')
df['duration'] = df['duration'].str.replace('s','')
df['duration']  = df['duration'].str.replace(' min', '')


df['seasons'] = pd.to_numeric(df['seasons'])
df['movie_length (mins)'] = pd.to_numeric(df['movie_length (mins)'])
df['duration'] = pd.to_numeric(df['duration'])
# df['duration'].info()
# We've extracted the duration column info into two other columns, so dropping the now redundant column
df = df.drop('duration', axis=1).copy()

In [497]:
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,year_added,month_added,day_of_week_added,seasons,movie_length (mins)
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,Documentaries,2021.0,September,Saturday,,90.0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries",2021.0,September,Friday,2.0,
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",2021.0,September,Friday,1.0,


Standardizing Information

In [370]:
df['rating'].unique()
df['rating'] = df['rating'].fillna('Not Rated')
df['rating'] = df['rating'].str.replace(' min', ' Min')
# df['rating'].unique()

In [371]:
# df['type'].unique()

For numeric columns, fill in missing values with the mean or median. 

In [395]:
df.head(3)

movies_df = df[df['type'] == 'Movie']
movies_df = movies_df.drop('seasons', axis=1)
movies_df = movies_df.drop('type', axis=1)

tv_show_df = df[df['type'] == 'TV Show']
tv_show_df = tv_show_df.drop('movie_length (mins)', axis=1)
tv_show_df = tv_show_df.drop('type', axis=1)

In [467]:
# movies_df.head(3)

In [469]:
# tv_show_df.head(3)

** VISUALS

In [515]:
# Most commonly added month
df['month_added'].mode()[0]
# Most common movie theme
df['']
# Most 

'July'