## Imports

In [1]:
! pip install pycountry

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
df = pd.read_csv("../input/netflix-shows/netflix_titles.csv")

Collecting pycountry
  Downloading pycountry-23.12.11-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-23.12.11-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-23.12.11
/kaggle/input/netflix-shows/netflix_titles.csv


# Netflix Exploratory Data Analysis

## First Investigations

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


Dataframe has 8807 rows and 12 columns, including show_id column.

In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### Dealing with null values

In [4]:
nan_count = df.isnull().sum()
print(nan_count, "\n")
print("Null percentage on column with most nulls = %.2f%%" %(nan_count.max()*100/len(df)))

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64 

Null percentage on column with most nulls = 29.91%


In [5]:
# Replace null values of cast, country, date_added, rating and duration to Unknown
df_fill = df.fillna({
    'cast': 'Unknown',
    'country': 'Unknown',
    'date_added': 'Unknown',
    'rating': 'Unknown',
    'duration': 'Unknown'
})
df_fill.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast               0
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

## Questions to answer

1. What is the most famous type of show?
2. What countries make the most shows?
3. What year has the most shows released?
4. What are the most famoust genres?
5. Who are the directors that made the higher number of shows?

### Question 1. What is the most famous type of show?

In [6]:
# There are only 2 types of shows: Movie and TV Show
df['type'].unique()

array(['Movie', 'TV Show'], dtype=object)

In [7]:
fig = px.histogram(df['type'],
                   title='Type distribution',
                   color_discrete_sequence=['#E50914'], 
                   width=600, 
                   height=600)
fig.show()

### Question 2. What countries make the most shows?

In [8]:
countries = df_fill['country']

In [9]:
# Since there are shows with multiple countries, answering this question requires separating them so we can be able to count
def separate_strings(arr):
    separated = []
    for item in arr:
        if ',' not in item:

            separated.append(item)
        if ', ' in item:
            splitted = item.split(', ')
            for i in splitted:
                if i != '':
                    separated.append(i)
                    
    return separated

In [10]:
separated_countries = separate_strings(countries)

# Creates pandas Dataframe from separated_countries array
df_countries = pd.DataFrame(data=separated_countries, columns=['country'])

# Grouping by shows and counting in n_shows column
count_countries = df_countries.groupby('country').size().reset_index(name='n_shows')

# Removing Unknown countries
known_countries = count_countries[count_countries.country != 'Unknown']
known_countries.sort_values(by=['n_shows'], ascending=False).head()

Unnamed: 0,country,n_shows
116,United States,3689
46,India,1046
115,United Kingdom,804
21,Canada,445
37,France,393


The 5 biggest show producers in quantity are: United States, India, United Kingdom, Canada and France.

In [11]:
fig = px.choropleth(count_countries, 
                    locations="country",
                    color="n_shows", 
                    locationmode='country names',
                    color_continuous_scale=px.colors.sequential.Redor, 
                    template="seaborn",
                    title="Show production map by country",
                    labels={"n_shows":"Number of shows"},
                    width=1280,
                    height=720)
fig.show()

### Question 3. What year has the most shows released?

In [12]:
# Groups by release_year and adds new column shows_per_year
years = df.groupby('release_year').size().reset_index(name='shows_per_year')

fig = px.line(years, 
              x="release_year", 
              y="shows_per_year",
              color_discrete_sequence=['#E50914'],
              labels={"shows_per_year":"Number of shows", "release_year": "Year"},
              width=1280,
              height=720)
fig.show()

In [13]:
print("The year of most shows released was %s, with %d shows released" %(years[years['shows_per_year'] == years['shows_per_year'].max()].iat[0,0], years['shows_per_year'].max()))

The year of most shows released was 2018, with 1147 shows released


### 4. What are the most famoust genres?

In [14]:
genres = df['listed_in']

separated_genres = separate_strings(genres)

In [15]:
df_genres = pd.DataFrame(separated_genres, columns=['genre'])

In [16]:
df_genres['genre'].unique()

array(['Documentaries', 'International TV Shows', 'TV Dramas',
       'TV Mysteries', 'Crime TV Shows', 'TV Action & Adventure',
       'Docuseries', 'Reality TV', 'Romantic TV Shows', 'TV Comedies',
       'TV Horror', 'Children & Family Movies', 'Dramas',
       'Independent Movies', 'International Movies', 'British TV Shows',
       'Comedies', 'Spanish-Language TV Shows', 'Thrillers',
       'Romantic Movies', 'Music & Musicals', 'Horror Movies',
       'Sci-Fi & Fantasy', 'TV Thrillers', "Kids' TV",
       'Action & Adventure', 'TV Sci-Fi & Fantasy', 'Classic Movies',
       'Anime Features', 'Sports Movies', 'Anime Series',
       'Korean TV Shows', 'Science & Nature TV', 'Teen TV Shows',
       'Cult Movies', 'TV Shows', 'Faith & Spirituality', 'LGBTQ Movies',
       'Stand-Up Comedy', 'Movies', 'Stand-Up Comedy & Talk Shows',
       'Classic & Cult TV'], dtype=object)

In [17]:
genres = ['Documentaries', 'TV Dramas',
       'TV Mysteries', 'Crime TV Shows', 'TV Action & Adventure',
       'Docuseries', 'Reality TV', 'Romantic TV Shows', 'TV Comedies',
       'TV Horror', 'Children & Family Movies', 'Dramas',
       'Comedies', 'Thrillers',
       'Romantic Movies', 'Music & Musicals', 'Horror Movies',
       'Sci-Fi & Fantasy', 'TV Thrillers', "Kids' TV",
       'Action & Adventure', 'TV Sci-Fi & Fantasy'
       'Anime Features', 'Anime Series',
       'Korean TV Shows', 'Science & Nature TV',
       'Cult Movies', 'TV Shows', 'Faith & Spirituality',
       'Stand-Up Comedy', 'Stand-Up Comedy & Talk Shows',
       'Classic & Cult TV']
valid_genres = [item for item in separated_genres if (item in genres)]

In [18]:
fig = px.histogram(valid_genres, title='Shows Genres', color_discrete_sequence=['#E50914'], width=1280, height=720)

fig.show()

In [19]:
print("The most famous genre was Drama")

The most famous genre was Drama


### 5. Who are the directors that made the higher number of shows?

In [20]:
directors = df.dropna()

directors = directors.groupby('director').size().reset_index(name='shows_made')
directors = directors.sort_values(by='shows_made', ascending=False)

In [21]:
fig = px.histogram(directors.head(10),
                   title='Histogram of number of shows per Director',
                   x='director', 
                   y='shows_made',
                   color_discrete_sequence=['#E50914'],
                   width=1280, 
                   height=720, 
                   labels={"shows_made":"Number of shows", "director": "Director"})

fig.show()

In [22]:
print('The colaboration of Raúl Campos and Jan Suter was the direction with most shows published (18 shows)')

The colaboration of Raúl Campos and Jan Suter was the direction with most shows published (18 shows)


### Further investigating

In [23]:
# Copying original Dataframe
durations = df.copy()

# Cleaning Dataframe to analyze only Movies, that have a duration in minutes
durations = durations.loc[durations['type'] == 'Movie']

# Cleaning unwanted strings of duration column (' min')
durations['duration'] = pd.to_numeric(durations['duration'].str.strip(' min'))

# Statistical description of duration (mean, std, min, max and quartiles)
duration_info = durations['duration'].describe()
duration_info

count    6128.000000
mean       99.577187
std        28.290593
min         3.000000
25%        87.000000
50%        98.000000
75%       114.000000
max       312.000000
Name: duration, dtype: float64

In [24]:
fig = px.box(durations, 
             title='Boxplot of Movies Duration', 
             y='duration', 
             color_discrete_sequence=['#E50914'],
             width=600, 
             height=600, 
             labels={"duration":"Duration in minutes"})
fig.show()

As we can see above, the mean duration of the movies is 99.58 minutes. Varying between a minimum of 3 minutes and a maximum of 312 minutes. These two movies can be seen below:

In [25]:
min_title = durations.loc[durations.loc[durations['duration'] == 3].index[0], 'title']
max_title = durations.loc[durations.loc[durations['duration'] == 312].index[0], 'title']

print("The 3 minutes long movie is", min_title )
print("The 312 minutes long movie is", max_title)

The 3 minutes long movie is Silent
The 312 minutes long movie is Black Mirror: Bandersnatch


In [26]:
durations.loc[durations['duration'] == 3].index[0]

3777