#Import and displaying shape and rows

In [None]:
import pandas as pd

df = pd.read_csv('sample_data/netflix_titles.csv')

print(df.shape)
print("========================================================")
print(df.head())
print("========================================================")
print(df.tail())
print("========================================================")
print(df.columns)

(8807, 12)
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2

#Filled null entries with placeholder "unknown"

In [None]:
df_ukn = df.fillna('Unknown')

#Changed data type

In [None]:
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
print(df.dtypes)

show_id                       object
type                          object
title                         object
director                      object
cast                          object
country                       object
date_added            datetime64[ns]
release_year                   int64
rating                        object
duration                      object
listed_in                     object
description                   object
duration_num                 float64
duration_unit                 object
main_genre                    object
year_added                   float64
release_to_add_lag           float64
dtype: object


#Count of movie and tv shows, ratings, and the year with most shows released

In [None]:
print(df['type'].value_counts())

print("========================================================")

print(df['rating'].value_counts().head(5))

print("========================================================")

print(df['release_year'].value_counts().idxmax())

type
Movie      6131
TV Show    2676
Name: count, dtype: int64
rating
TV-MA    3207
TV-14    2160
TV-PG     863
R         799
PG-13     490
Name: count, dtype: int64
2018


#Changed the type for duration to get length and number of seasons

In [None]:
df[['duration_num', 'duration_unit']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_num'] = pd.to_numeric(df['duration_num'], errors='coerce')

print(df[df['type'] == 'Movie']['duration_num'].mean())

print(df[df['type'] == 'TV Show']['duration_num'].mean())

99.57718668407311
1.764947683109118


#Average release year for each genre

In [None]:
df['main_genre'] = df['listed_in'].str.split(',').str[0].str.strip()

avg_years = df.groupby('main_genre')['release_year'].mean()

avg_years_as_date = pd.to_datetime(avg_years.astype(int), format='%Y')

genre_release_summary = pd.DataFrame({
    'average_release_year_float': avg_years,
    'average_release_date': avg_years_as_date
}).sort_values(by='average_release_year_float', ascending=False)

print(genre_release_summary)

                              average_release_year_float average_release_date
main_genre                                                                   
TV Horror                                    2018.090909           2018-01-01
Stand-Up Comedy & Talk Shows                 2018.029412           2018-01-01
LGBTQ Movies                                 2018.000000           2018-01-01
Crime TV Shows                               2017.879699           2017-01-01
Docuseries                                   2017.805430           2017-01-01
Romantic Movies                              2017.666667           2017-01-01
Reality TV                                   2017.616667           2017-01-01
TV Action & Adventure                        2017.575000           2017-01-01
Anime Features                               2017.380952           2017-01-01
Romantic TV Shows                            2017.343750           2017-01-01
Sports Movies                                2017.000000        

#Temporal Trends

In [None]:
df['year_added'] = df['date_added'].dt.year
df['release_to_add_lag'] = df['year_added'] - df['release_year']

print(df['release_to_add_lag'].describe())

count    8709.000000
mean        4.690894
std         8.792208
min        -3.000000
25%         0.000000
50%         1.000000
75%         5.000000
max        93.000000
Name: release_to_add_lag, dtype: float64


###for the count above the average is 4.69 years before netflix adds the show

#Rating vs Type

In [None]:
print(pd.crosstab(df['type'], df['rating']))

rating   66 min  74 min  84 min   G  NC-17  NR   PG  PG-13    R  TV-14  TV-G  \
type                                                                           
Movie         1       1       1  41      3  75  287    490  797   1427   126   
TV Show       0       0       0   0      0   5    0      0    2    733    94   

rating   TV-MA  TV-PG  TV-Y  TV-Y7  TV-Y7-FV  UR  
type                                              
Movie     2062    540   131    139         5   3  
TV Show   1145    323   176    195         1   0  


#With rating "R" after 2020

In [None]:
filtered = df[(df['rating'] == 'R') & (df['date_added'] >= '2021-01-01')]
print(len(filtered))

190


#Average release of countries

In [None]:
df['release_year'] = pd.to_datetime(df['release_year'], errors='coerce')
print(df.groupby('country')['release_year'].mean())

print("========================================================")

print(df.groupby(['year_added', 'type']).size().groupby(level=0).apply(lambda x: x / x.sum()))

country
, France, Algeria                                     1970-01-01 00:00:00.000002014
, South Korea                                         1970-01-01 00:00:00.000002021
Argentina                                             1970-01-01 00:00:00.000002016
Argentina, Brazil, France, Poland, Germany, Denmark   1970-01-01 00:00:00.000002017
Argentina, Chile                                      1970-01-01 00:00:00.000002011
                                                                   ...             
Venezuela                                             1970-01-01 00:00:00.000002017
Venezuela, Colombia                                   1970-01-01 00:00:00.000002007
Vietnam                                               1970-01-01 00:00:00.000002017
West Germany                                          1970-01-01 00:00:00.000001977
Zimbabwe                                              1970-01-01 00:00:00.000002017
Name: release_year, Length: 748, dtype: datetime64[ns]
year_added   