In [32]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

## Mapa que irá guardar todas as tabelas

In [33]:
dataframes = dict()

## Abrindo o dado

In [34]:
raw_df = pd.read_csv('csv_data/netflix_titles.csv')
# renomeando as colunas show_id de "s1", "s2", ..., "s_n"
# para show_id = 1, 2, ..., n
raw_df["show_id"] = range(1, len(raw_df)+1)

raw_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Content

In [35]:
# criando a tabela content com suas informações
content_df = raw_df[["show_id", "title", "date_added", "rating", "description"]]
content_df.rename(columns={"show_id": "id", "description": "content_description"}, inplace=True)

content_df["date_added"] = content_df["date_added"].str.strip()
content_df["date_added"] = pd.to_datetime(content_df["date_added"], format="%B %d, %Y")
content_df["date_added"] = content_df["date_added"].dt.strftime("%Y-%m-%d")

dataframes["content"] = content_df

content_df.head()

Unnamed: 0,id,title,date_added,rating,content_description
0,1,Dick Johnson Is Dead,2021-09-25,PG-13,"As her father nears the end of his life, filmm..."
1,2,Blood & Water,2021-09-24,TV-MA,"After crossing paths at a party, a Cape Town t..."
2,3,Ganglands,2021-09-24,TV-MA,To protect his family from a powerful drug lor...
3,4,Jailbirds New Orleans,2021-09-24,TV-MA,"Feuds, flirtations and toilet talk go down amo..."
4,5,Kota Factory,2021-09-24,TV-MA,In a city of coaching centers known to train I...


## Rating

In [51]:
raw_ratings = raw_df[raw_df["rating"].notna()]["rating"].unique()

rating_df = pd.DataFrame({"id": range(1, len(ratings)+1), "rating_name": ratings})
dataframes["rating"] = rating_df

rating_df.head()

Unnamed: 0,id,rating_name
0,1,PG-13
1,2,TV-MA
2,3,PG
3,4,TV-14
4,5,TV-PG


In [37]:
raw_ratings = raw_df[raw_df["rating"].notna()]

content_rating = {"content_id": [], "rating_id": []}

for idx, rating in zip(raw_ratings["show_id"].values, raw_ratings["rating"].values):
    content_rating["content_id"].append(idx)
    content_rating["rating_id"].append(rating_df[rating_df["rating_name"] == rating]["id"].values[0])

content_rating_df = pd.DataFrame(content_rating)
content_rating_df = content_rating_df[["content_id", "rating_id"]].drop_duplicates()
dataframes["content_rating"] = content_rating_df

content_rating_df.head()

Unnamed: 0,content_id,rating_id
0,1,1
1,2,2
2,3,2
3,4,2
4,5,2


## Director

In [38]:
raw_directors = raw_df[raw_df["director"].notna()]["director"].values

directors = []
for director_names in raw_directors:
    for name in director_names.split(", "):
        directors.append(name)

directors = list(set(directors))
director_df = pd.DataFrame({"id": range(1, len(directors)+1), "director_name": directors})
dataframes["director"] = director_df

director_df.head()

Unnamed: 0,id,director_name
0,1,Milan Luthria
1,2,Guy Ritchie
2,3,Johnson Esthappan
3,4,Jon YonKondy
4,5,Hannah Fidell


In [39]:
raw_director = raw_df[raw_df["director"].notna()]

content_director = {"content_id": [], "director_id": []}

for idx, directors in zip(raw_director["show_id"].values, raw_director["director"].values):
    directors_names = directors.split(", ")
    for name in directors_names:
        content_director["content_id"].append(idx)
        content_director["director_id"].append(director_df[director_df["director_name"] == name]["id"].values[0])
    
content_director_df = pd.DataFrame(content_director)
content_director_df = content_director_df[["content_id", "director_id"]].drop_duplicates()
dataframes["content_director"] = content_director_df

content_director_df.head()

Unnamed: 0,content_id,director_id
0,1,256
1,3,4964
2,6,3622
3,7,1318
4,7,2135


## Type 

In [40]:
types = raw_df[raw_df["type"].notna()]["type"].unique()
type_df = pd.DataFrame({"id": list(range(1, len(types)+1)), "type_name": types})
dataframes["type"] = type_df

type_df.head()

Unnamed: 0,id,type_name
0,1,Movie
1,2,TV Show


In [41]:
raw_type = raw_df[raw_df["type"].notna()]

content_type = {"content_id": [], "type_id": []}

for idx, type_ in zip(raw_type["show_id"].values, raw_type["type"].values):
    content_type["content_id"].append(idx)
    content_type["type_id"].append(type_df[type_df["type_name"] == type_]["id"].values[0])

content_type_df = pd.DataFrame(content_type)
content_type_df = content_type_df[["content_id", "type_id"]].drop_duplicates()
dataframes["content_type"] = content_type_df

content_type_df.head()

Unnamed: 0,content_id,type_id
0,1,1
1,2,2
2,3,2
3,4,2
4,5,2


## Genre

In [42]:
raw_genre = raw_df[raw_df["listed_in"].notna()]["listed_in"].values

genre = []
for genre_names in raw_genre:
    for name in genre_names.split(", "):
        genre.append(name)
        
genre = list(set(genre))
genre_df = pd.DataFrame({"id": range(1, len(genre)+1), "genre_name": genre})
dataframes["genre"] = genre_df

genre_df.head()

Unnamed: 0,id,genre_name
0,1,British TV Shows
1,2,Music & Musicals
2,3,TV Dramas
3,4,Romantic TV Shows
4,5,TV Sci-Fi & Fantasy


In [43]:
raw_genre = raw_df[raw_df["listed_in"].notna()]

content_genre = {"content_id": [], "genre_id": []}
for idx, genre in zip(raw_genre["show_id"].values, raw_genre["listed_in"].values):
    genre_names = genre.split(", ")
    for name in genre_names:
        content_genre["content_id"].append(idx)
        content_genre["genre_id"].append(genre_df[genre_df["genre_name"] == name]["id"].values[0]) 
        
content_genre_df = pd.DataFrame(content_genre)
content_genre_df = content_genre_df[["content_id", "genre_id"]].drop_duplicates()
dataframes["content_genre"] = content_genre_df

content_genre_df.head() 

Unnamed: 0,content_id,genre_id
0,1,13
1,2,22
2,2,3
3,2,25
4,3,39


## Actor

In [44]:
# o formato do .csv para elenco é uma grande string com o nome de todos
# então temos que separar nome por nome
raw_cast = raw_df[raw_df["cast"].notna()]["cast"].values

cast = []
for cast_names in raw_cast:
    for name in cast_names.split(", "):
        cast.append(name)

# pegando os nomes sem repetição
cast = list(set(cast))
cast_df = pd.DataFrame({"id": range(1, len(cast)+1), "actor_name": cast})
dataframes["actor"] = cast_df

cast_df.head()

Unnamed: 0,id,actor_name
0,1,Atikah Suhaime
1,2,Jonas Bloquet
2,3,Doron Ben-David
3,4,Christian Tappan
4,5,Nkwah Kingsley


In [45]:
raw_cast = raw_df[raw_df["cast"].notna()]

content_cast = {"content_id": [], "actor_id": []}

for idx, cast in zip(raw_cast["show_id"].values, raw_cast["cast"].values):
    cast_names = cast.split(", ")
    for name in cast_names:
        content_cast["content_id"].append(idx)
        content_cast["actor_id"].append(cast_df[cast_df["actor_name"] == name]["id"].values[0])

content_cast_df = pd.DataFrame(content_cast)
content_cast_df = content_cast_df[["content_id", "actor_id"]].drop_duplicates()
dataframes["content_cast"] = content_cast_df

content_cast_df.head()

Unnamed: 0,content_id,actor_id
0,2,2612
1,2,22396
2,2,6340
3,2,14799
4,2,12274


## Movie and Tv Show durations

In [46]:
raw_durations = raw_df[raw_df["duration"].notna()]

movie_duration = {"content_id": [], "time_in_minutes": []}
tv_show_duration = {"content_id": [], "seasons": []}

for idx, duration in zip(raw_durations["show_id"].values, raw_durations["duration"].values):
    if "min" in duration:
        movie_duration["content_id"].append(idx)
        movie_duration["time_in_minutes"].append(duration.split(" ")[0])
    else:
        tv_show_duration["content_id"].append(idx)
        tv_show_duration["seasons"].append(duration.split(" ")[0])
        
movie_duration_df = pd.DataFrame(movie_duration)
movie_duration_df = movie_duration_df[["content_id", "time_in_minutes"]].drop_duplicates()
tv_show_duration_df = pd.DataFrame(tv_show_duration)
tv_show_duration_df = tv_show_duration_df[["content_id", "seasons"]].drop_duplicates()

dataframes["movie_duration"] = movie_duration_df
dataframes["tv_show_duration"] = tv_show_duration_df

In [47]:
movie_duration_df.head()

Unnamed: 0,content_id,time_in_minutes
0,1,90
1,7,91
2,8,125
3,10,104
4,13,127


In [48]:
tv_show_duration_df.head()

Unnamed: 0,content_id,seasons
0,2,2
1,3,1
2,4,1
3,5,2
4,6,1


## Final data

In [52]:
dataframes.keys()

dict_keys(['content', 'content_rating', 'director', 'content_director', 'type', 'content_type', 'genre', 'content_genre', 'actor', 'content_cast', 'movie_duration', 'tv_show_duration', 'rating'])

In [57]:
path = os.getcwd()
for key in dataframes.keys():
    dataframes[key].to_csv(f"{path}/csv_data/{key}.csv", index=False)