In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np

# Disable pandas' display truncation: None means "no limit" for both options.
# NOTE(review): with max_rows unlimited, any cell that ends in a bare
# DataFrame/Series renders every row of it.
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None)

In [None]:
# Load the titles CSV into a DataFrame; the path is relative to the
# notebook's working directory.
netflix_titles = pd.read_csv('datasets/netflix_titles.csv')

In [None]:
# Preview the first five rows.
netflix_titles.head()

In [None]:
# Preview the last five rows.
netflix_titles.tail()

In [None]:
# (number of rows, number of columns)
netflix_titles.shape

In [None]:
# Column names, dtypes and non-null counts.
netflix_titles.info()

In [None]:
# Summary statistics; by default describe() covers only the numeric columns.
netflix_titles.describe()

In [None]:
# Keep the column labels (a pandas Index) around; the next cell loops over them.
columns = netflix_titles.columns
columns

In [None]:
# Print the column names one per line, in their original order.
for column_name in columns:
    print(column_name)

In [None]:
# Print the column names in ascending alphabetical order, one per line.
for column_name in sorted(netflix_titles.columns):
    print(column_name)

### Checking for missing values

In [None]:
# Boolean mask of missing values. Only a preview is shown: display.max_rows is
# unlimited in this notebook, so the bare mask would render every row.
netflix_titles.isnull().head()

In [None]:
# Number of missing values in each column.
missing_mask = netflix_titles.isna()
num_missing_columns = missing_mask.sum()
num_missing_columns

### Checking for unique values in a column

In [None]:
# Number of distinct (non-null) values in every column.
netflix_titles.nunique()

In [None]:
# Number of distinct values in a single column, here 'type'.
num_unique_netflix_type = netflix_titles['type'].nunique()
num_unique_netflix_type

In [None]:
# The distinct values of 'type' themselves, in order of first appearance.
unique_netflix_types = netflix_titles['type'].unique()
unique_netflix_types

### Grouping columns

In [None]:
# Group the rows by 'type'; size() gives the number of rows in each group.
group_types = netflix_titles.groupby('type')
group_types.size()

In [None]:
# One group per distinct value of 'country'; size() counts the rows in each.
# Rows whose country is missing are left out (groupby drops NaN keys by default).
group_countries = netflix_titles.groupby('country')
group_countries.size()

In [None]:
# Top 10 'country' values by number of titles (movies and TV shows combined).
# NOTE(review): each distinct 'country' string is its own group, so entries
# listing several countries, if present, are counted separately - confirm.
group_countries.size().sort_values(ascending=False).head(10)

In [None]:
# Titles whose 'country' value is exactly 'Kenya', and how many there are.
is_kenyan = netflix_titles['country'] == 'Kenya'
kenyan_shows = netflix_titles[is_kenyan]
len(kenyan_shows)

In [None]:
# Group by rating and compute the mean release year of each group,
# rounded to one decimal place.
grouped_ratings = netflix_titles.groupby('rating')
grouped_ratings['release_year'].mean().round(1)

In [None]:
# Number of titles for every (type, release_year) pair, as a flat table
# with the count in a 'counts' column; show the first few rows.
titles_per_type_and_year = (
    netflix_titles
    .groupby(['type', 'release_year'])
    .size()
    .reset_index(name='counts')
)
titles_per_type_and_year.head()

In [None]:
# Bar chart of how many titles there are of each content type.
content_type_counts = netflix_titles['type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
content_type_counts.plot.bar(ax=ax, color=['#FF6347', '#4682B4'])
ax.set_title('Distribution of Content Type')
ax.set_xlabel('Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of the ten most frequent 'country' values.
top_countries = netflix_titles['country'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 7))
top_countries.plot.bar(ax=ax, color='#FF6347')
ax.set_title('Top 10 Countries by Content Count')
ax.set_xlabel('Country')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Count plot of ratings, most frequent rating first.
plt.figure(figsize=(14, 7))
sns.countplot(
    y='rating',
    data=netflix_titles,
    order=netflix_titles['rating'].value_counts().index,
    hue='rating',
    palette='Set3',
)
plt.title('Count Plot of Ratings')
plt.xlabel('Count')
plt.ylabel('Rating')
# Render explicitly, like the other plotting cells; without this the cell
# output starts with the repr of the last call (a matplotlib Text object).
plt.show()

In [None]:
# The ten directors with the most titles, most prolific first.
director_counts = netflix_titles['director'].value_counts()
top_10_directors = director_counts.head(10).index

# Keep only the titles by those directors and plot one bar per director,
# in the same order as the ranking above.
titles_by_top_directors = netflix_titles[netflix_titles['director'].isin(top_10_directors)]

plt.figure(figsize=(14, 7))
sns.countplot(
    data=titles_by_top_directors,
    y='director',
    order=top_10_directors,
    hue='director',
    palette='Set1',
)
plt.title('Count Plot of Top 10 Directors')
plt.xlabel('Count')
plt.ylabel('Director')
plt.show()

In [None]:
# Look at the raw 'duration' values before parsing them.
netflix_titles.duration.head()

#### Converting the duration from a string to an integer number of minutes by:
1. Removing the unit text (`min` or `seasons`) from the duration string so that only the number remains
2. For durations given in seasons, assuming that each season has 10 episodes of 40 minutes each and multiplying accordingly

In [None]:
def minutes_to_integer_conversion(duration, num_of_episodes=10, min_per_episode=40):
    """Convert a duration string into a whole number of minutes.

    Two formats are recognised (case-insensitive, surrounding whitespace
    ignored):

    * ``"<n> min"``      -> ``n``
    * ``"<n> Season"`` / ``"<n> Seasons"``
                         -> ``n * num_of_episodes * min_per_episode``

    Parameters
    ----------
    duration : str or missing value
        Raw duration text. Non-string input (``None``, ``NaN``) is treated
        as missing.
    num_of_episodes : int, default 10
        Assumed number of episodes per season.
    min_per_episode : int, default 40
        Assumed length of one episode in minutes.

    Returns
    -------
    int or None
        Minutes, or ``None`` when the value is missing or has no
        recognised unit.

    Raises
    ------
    ValueError
        If a recognised unit is present but the rest is not an integer.
    """
    # Missing values are floats (NaN) or None and have no .strip().
    if not isinstance(duration, str):
        return None

    cleaned = duration.strip().lower()

    if "min" in cleaned:
        return int(cleaned.replace("min", ""))

    # Match the singular as well: "1 Season" has no trailing "s".
    if "season" in cleaned:
        seasons = int(cleaned.replace("seasons", "").replace("season", ""))
        return seasons * num_of_episodes * min_per_episode

    return None

In [None]:
# Parse every duration in one pass. Series.map keeps the frame's own index,
# so this does not rely on the row labels being 0..n-1, and it avoids one
# scalar .loc write per row. Casting to float64 keeps the column numeric,
# with NaN wherever the parser returned None.
netflix_titles['parsed_duration'] = (
    netflix_titles['duration']
    .map(minutes_to_integer_conversion)
    .astype('float64')
)

In [None]:
# Spot-check the parsed values on the last rows.
netflix_titles['parsed_duration'].tail()

In [None]:
# Split the catalogue by content type.
is_movie = netflix_titles['type'] == 'Movie'
is_tv_show = netflix_titles['type'] == 'TV Show'
movies = netflix_titles[is_movie]
tv_shows = netflix_titles[is_tv_show]

# Mean parsed duration per release year, as two-column frames
# (release_year, parsed_duration) ready for plotting.
movies_duration_per_year = (
    movies
    .groupby('release_year')['parsed_duration']
    .mean()
    .reset_index()
)
tv_shows_duration_per_year = (
    tv_shows
    .groupby('release_year')['parsed_duration']
    .mean()
    .reset_index()
)

In [None]:
# Bar chart: average movie duration for each release year.
movie_bar_labels = {
    'parsed_duration': 'Average Duration (minutes)',
    'release_year': 'Release Year',
}
fig_bar = px.bar(
    data_frame=movies_duration_per_year,
    x='release_year',
    y='parsed_duration',
    title='Average Duration of Movies per Year',
    labels=movie_bar_labels,
    hover_data=['parsed_duration'],
)
fig_bar.show()

In [None]:
# Bar chart: average TV-show duration for each release year.
tv_bar_labels = {
    'parsed_duration': 'Average Duration(minutes)',
    'release_year': 'Release Year',
}
fig_bar = px.bar(
    data_frame=tv_shows_duration_per_year,
    x='release_year',
    y='parsed_duration',
    title='Average Duration of TV Shows per Year',
    labels=tv_bar_labels,
    hover_data=['parsed_duration'],
)
fig_bar.show()

In [None]:
# Line chart: average movie duration for each release year.
movie_line_labels = {
    'parsed_duration': 'Average Duration (minutes)',
    'release_year': 'Release Year',
}
fig_line = px.line(
    data_frame=movies_duration_per_year,
    x='release_year',
    y='parsed_duration',
    title='Average Duration of Movies per Year',
    labels=movie_line_labels,
)
fig_line.show()

In [None]:
# Line chart: average TV-show duration for each release year.
fig_line = px.line(
    tv_shows_duration_per_year,
    x='release_year',
    y='parsed_duration',
    labels={'parsed_duration': 'Average Duration (minutes)', 'release_year': 'Release Year'},
    # This chart plots the TV-show frame, so the title must say so
    # (it previously repeated the movies title).
    title='Average Duration of TV Shows per Year'
)
fig_line.show()

### Creating new columns based on the categories in the listed_in column

#### Logic:

- loop through the listed_in column values
- split them into their own individual columns
- if movie contains the category, set it to true else false


In [None]:
# Shorthand for the listed_in column; its values are split on commas into
# individual categories in the next cell.
listed = netflix_titles.listed_in

In [None]:
# Collect every distinct category that appears in the listed_in column.
# Each value is a comma-separated string; whitespace around a category is
# stripped before it is added.
categories_set = {
    category.strip()
    for entry in listed
    for category in entry.split(',')
}

# Sort so the list (and the indicator columns created from it) comes out in
# the same order on every run; the iteration order of a set of strings can
# change from one interpreter session to the next.
categories_list = sorted(categories_set)

In [None]:
# Peek at the first ten categories.
categories_list[:10]

In [None]:
# Add one indicator column per category, defaulting every row to False.
# (ndarray.fill() works in place and returns None, so the previous
# expression assigned None to every cell instead of False.)
netflix_titles[categories_list] = False

In [None]:
# Check that the new category columns were added.
netflix_titles.head()

In [None]:
# Split each listed_in string on commas into a list of category names
# (whitespace around each name is not removed at this point).
splitting_categories = netflix_titles.listed_in.str.split(',')

In [None]:
# Wrap the Series of category lists in a DataFrame; the single column keeps
# the Series name, 'listed_in', which the next cell reads.
splitting_categories_df = pd.DataFrame(splitting_categories)

In [None]:
# Flag, for every title, which categories it is listed in.
#
# The lists are flattened to one row per (title, category) pair that keeps
# the title's index label, turned into indicator columns, then collapsed
# back to one row per title. This replaces a loop that wrote one cell at a
# time with .loc: that loop was linear in the number of tags (not
# logarithmic) and slow, and its `else` branch could never run because every
# stripped category is in categories_set by construction.
exploded_categories = splitting_categories_df['listed_in'].explode().str.strip()
category_flags = pd.get_dummies(exploded_categories).groupby(level=0).any()

# Assign complete True/False columns (aligned on the index) so the result
# does not depend on how the columns were initialised earlier.
for category in category_flags.columns:
    netflix_titles[category] = category_flags[category]

In [None]:
# Check the populated category columns.
netflix_titles.head()

In [None]:
# Titles flagged as 'Sports Movies' that are not also flagged as
# 'Documentaries' (other categories are not checked).
is_sports_movie = netflix_titles['Sports Movies'].eq(True)
is_not_documentary = netflix_titles['Documentaries'].ne(True)
sport_movies = netflix_titles[is_sports_movie & is_not_documentary]
sport_movies.shape

In [None]:
# Preview the filtered sports titles.
sport_movies.head()

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared).
netflix_titles.duplicated().sum()

In [None]:
# Number of null values in each column.
# (value_counts() on the boolean mask counted distinct row patterns of
# True/False across all columns, not nulls.)
netflix_titles.isnull().sum()

In [None]:
# Remove the unnecessary description column.
# Rebinding instead of dropping in place, together with errors='ignore',
# makes the cell safe to re-run: a second run no longer raises KeyError
# because the column is already gone.
netflix_titles = netflix_titles.drop(columns=['description'], errors='ignore')

In [None]:
# Confirm the description column is gone.
netflix_titles.head()