5 basic steps for Data analysis:
             
1. Define the problem
2. Collect data
3. Prepare data
4. Analyse data
5. Interpret results

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import builtins
import rich
builtins.print = rich.print

In [2]:
df = pd.read_csv("../data/netflix_titles.csv")

## Task 0: Clean Dataset
1. There are some values in the column "country" that aren't formatted correctly, "contry_name1," not "country_name1, country_name2"

In [3]:
# 1 example the issue:
df[df.title == 'The Present']

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1192,s1193,Movie,The Present,Farah Nabulsi,"Saleh Bakri, Maryam Kanj, Maryam Kamiel Basha,...","United Kingdom,","March 18, 2021",2020,TV-MA,24 min,"Dramas, International Movies",Yusuf and his daughter set out to buy his wife...


Solution: Reformat all values

In [4]:
# Iterate through all values of the 'country' column to re-format wrong values
for id, country_names in enumerate(df.country):
    # Ignore where the value is nan
    if type(country_names) == float:
        continue
    # Check if the last character of the string is a comma and remove it
    elif country_names[-1] == ",":
        # Update the dataframe directly by removing the trailing comma
        df.loc[df['country'] == country_names, 'country'] = country_names[:-1]

In [5]:
# After addressing the issue
df[df.title == 'The Present']

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1192,s1193,Movie,The Present,Farah Nabulsi,"Saleh Bakri, Maryam Kanj, Maryam Kamiel Basha,...",United Kingdom,"March 18, 2021",2020,TV-MA,24 min,"Dramas, International Movies",Yusuf and his daughter set out to buy his wife...


In [6]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Task 1: Transform the initial table

- Initially, each row is for one movie, available in potentially MANY countries.
- For easier analsys concerning country, we want each row for a movie in a single country
- In other words, one row of the initial table corresponds to multiple rows in the new table

In [7]:
df_split_country = df.assign(country=df["country"].str.split(", ")).explode("country")

In [8]:
df_split_country.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [9]:
df_split_country[df_split_country.title == 'Zak Storm']

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
8797,s8798,TV Show,Zak Storm,,"Michael Johnston, Jessica Gee-George, Christin...",United States,"September 13, 2018",2016,TV-Y7,3 Seasons,Kids' TV,Teen surfer Zak Storm is mysteriously transpor...
8797,s8798,TV Show,Zak Storm,,"Michael Johnston, Jessica Gee-George, Christin...",France,"September 13, 2018",2016,TV-Y7,3 Seasons,Kids' TV,Teen surfer Zak Storm is mysteriously transpor...
8797,s8798,TV Show,Zak Storm,,"Michael Johnston, Jessica Gee-George, Christin...",South Korea,"September 13, 2018",2016,TV-Y7,3 Seasons,Kids' TV,Teen surfer Zak Storm is mysteriously transpor...
8797,s8798,TV Show,Zak Storm,,"Michael Johnston, Jessica Gee-George, Christin...",Indonesia,"September 13, 2018",2016,TV-Y7,3 Seasons,Kids' TV,Teen surfer Zak Storm is mysteriously transpor...


In [10]:
df_split_country.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
8802    False
8803    False
8804    False
8805    False
8806    False
Length: 10845, dtype: bool

In [11]:
df_split_country = df_split_country.dropna()
df_split_country[df_split_country['country'].str.contains(",")]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description


## Task 2: List all TV-Shows

Given the dataset, I will filter out by column type

In [None]:
df_tv_show = df.query('type == "TV Show"')
df_tv_show

## Task 3: How many movies and TV-shows are available in each country?

Target result: a bar plot:
- x-axis: country name
- y-axis: no. movies and TV-shows

I want a table with 2 columns
1. country
2. Count of movies and TV-shows available in that country

Approach: create a dictionary with
key: country name
value: count of movies and tv-shows

In [None]:
# Remove nan values
df = df.dropna()

# A dictionary
# key: country name
# value: count of movies and tv-shows
dict_country_cnt = {}

# Iterate through the array df.country.unique()
for id_country, country_names in enumerate(df.country.unique()):
    # If country_names == nan, skip
    if isinstance(country_names, float):
        continue

    # If it is available in many countries, split the string
    for country_name in country_names.split(', '):
        # If the country_name is already in dict_country_cnt.keys(), skip
        if country_name in list(dict_country_cnt.keys()):
            continue
        # else, save it in the list
        else:
            dict_country_cnt[country_name] = 0

# For each country, count movies that are available in that country
for country_name in dict_country_cnt.keys():
    filtered_df = df[df['country'].str.contains(country_name)]
    dict_country_cnt[country_name] = len(filtered_df.index)

In [None]:
del dict_country_cnt['']

dict_country_cnt = sorted(dict_country_cnt.items(), key=lambda x:x[1], reverse=True)
dict_country_cnt = dict(dict_country_cnt)

# Prepare the data
df_new = pd.DataFrame(dict_country_cnt.items(), columns=['country', 'cnt'])

sns.set_theme(rc={'figure.figsize':(7,22)})
sns.barplot(df_new, y="country", x='cnt')

plt.savefig('../results/barplot_country_no_movies.png')

plt.show()
plt.close()

## Task 4: In the top 5 countries with the highest number of available movies and TV-shows, show the percentage (or number) of movies versus TV-shows

Target result: a bar plot
- x-asis: Top 5 country names
- y-asis: no. or percentage of movies and TV-Shows

## Task 5: In USA, is there any trend on no. movies & TV-shows release over the years

## Task 6: In USA, is there any trend on the duration of movies & TV-shows, considering the released years?

# 

# 