<a href="https://colab.research.google.com/github/mlnayusuf24/revou_mini-course/blob/main/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; the dataset paths used by the
# cells below all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## **Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## **Prepare the Dataset**

**Read the CSV file**

In [None]:
# Location of the raw catalogue export inside the mounted Drive folder.
path = '/content/drive/MyDrive/Dataset/amazon_prime_titles.csv'

# Load the whole file into memory, then show the first five rows as a quick
# sanity check that the columns were parsed as expected.
df = pd.read_csv(filepath_or_buffer=path)
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


**Add new features to the dataset**

Rating Guide:
https://www.primevideo.com/help/ref=atv_hp_nd_cnt?nodeId=GFGQU3WYEG6FSJFJ

In [None]:
# Derive calendar features from the date each title was added.
# NOTE: this cell overwrites `duration` in place, so it is not idempotent --
# re-running it without first re-running the read_csv cell would blank out
# `season_count`, because the "Season" marker has already been stripped.
df["date_added"] = pd.to_datetime(df['date_added'])
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month

# `duration` mixes two units in one text column (e.g. "113 min"; rows whose
# text contains "Season" are treated as season counts). Split the leading
# number into the matching column and leave "" in the other one.
# Vectorised string ops are used instead of a row-wise apply so that a missing
# duration stays missing instead of raising a TypeError.
is_season = df['duration'].str.contains("Season", regex=False, na=False)
leading_number = df['duration'].str.split(" ").str[0]
df['season_count'] = leading_number.where(is_season, "")
df['duration'] = leading_number.where(~is_season, "")

# Ratings that are flagged as adult content; every other rating, including a
# missing one, is labelled "Non-adult".
adult = ['18+','R','NR','NC-17','UNRATED','TV-MA','NOT_RATE','AGES_18_','TV-NR']
df['adult'] = df['rating'].isin(adult).map({True: "Adult", False: "Non-adult"})
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count,adult
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,2021-03-30,2014,,113,"Comedy, Drama",A small fishing village must procure a local d...,2021.0,3.0,,Non-adult
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,2021-03-30,2018,13+,110,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,2021.0,3.0,,Non-adult
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,2021-03-30,2017,,74,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...,2021.0,3.0,,Non-adult
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,2021-03-30,2014,,69,Documentary,"Pink breaks the mold once again, bringing her ...",2021.0,3.0,,Non-adult
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,2021-03-30,1989,,45,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...,2021.0,3.0,,Non-adult


In [None]:
# Persist the feature-enriched table. index=False keeps the auto-generated row
# index out of the file, so reading it back does not produce a stray
# "Unnamed: 0" column.
df.to_csv('/content/drive/MyDrive/Dataset/amazon_prime_clean.csv', index=False)

**What are the Top Genres?**

In [None]:
# Partition the catalogue by content type; the per-type frames feed every
# "top N" tally in the cells that follow.
tv_show, movie = (df[df['type'].eq(kind)] for kind in ('TV Show', 'Movie'))

In [None]:
# Counter is also used by the country / director / actor cells below.
from collections import Counter

# TV Shows: each `listed_in` value is a ", "-separated list of genres, so
# joining every row and splitting again yields one token per (title, genre)
# pair. most_common() returns the tally ordered from most to least frequent.
categories1 = ", ".join(tv_show['listed_in'].dropna()).split(", ")
genre1 = pd.DataFrame(Counter(categories1).most_common(), columns=['Genre', 'TV Show'])

# Movies: same tally for the movie subset.
categories2 = ", ".join(movie['listed_in'].dropna()).split(", ")
genre2 = pd.DataFrame(Counter(categories2).most_common(), columns=['Genre', 'Movie'])

# Merge Dataframe: the outer join keeps genres that occur in only one of the
# two subsets; the missing side is NaN after the merge.
genre = genre1.merge(genre2, on='Genre', how='outer')
# Fill BOTH count columns. Leaving 'TV Show' unfilled would make 'Total' NaN
# for every genre that only appears among movies.
genre['TV Show'] = genre['TV Show'].fillna(0)
genre['Movie'] = genre['Movie'].fillna(0)
genre['Total'] = genre['TV Show'] + genre['Movie']

**What are the Top Countries?**

In [None]:
# TV Shows: a title may list several countries separated by ", ", so flatten
# all non-missing values into individual tokens and tally them, most frequent
# first.
tv_tokens = ", ".join(tv_show['country'].dropna()).split(", ")
country1 = pd.DataFrame(Counter(tv_tokens).most_common(), columns=['Country', 'TV Show'])

# Movies: identical tally for the movie subset.
movie_tokens = ", ".join(movie['country'].dropna()).split(", ")
country2 = pd.DataFrame(Counter(movie_tokens).most_common(), columns=['Country', 'Movie'])

# Combine both tallies. The outer join keeps countries present in only one
# subset; their missing count becomes 0 before the totals are summed.
country = (
    pd.merge(country1, country2, on='Country', how='outer')
    .fillna({'TV Show': 0, 'Movie': 0})
    .assign(Total=lambda merged: merged['TV Show'] + merged['Movie'])
)

**Who are the Top Directors?**

In [None]:
# TV Shows: a title may credit several directors separated by ", ", so flatten
# all non-missing values into individual names and tally them, most frequent
# first.
tv_names = ", ".join(tv_show['director'].dropna()).split(", ")
director1 = pd.DataFrame(Counter(tv_names).most_common(), columns=['Director', 'TV Show'])

# Movies: identical tally for the movie subset.
movie_names = ", ".join(movie['director'].dropna()).split(", ")
director2 = pd.DataFrame(Counter(movie_names).most_common(), columns=['Director', 'Movie'])

# Combine both tallies, zero-fill the side a director is absent from, add the
# total, and finally discard placeholder rows.
director = (
    pd.merge(director1, director2, on='Director', how='outer')
    .fillna({'TV Show': 0, 'Movie': 0})
    .assign(Total=lambda merged: merged['TV Show'] + merged['Movie'])
    # '' is what an empty join splits into (a subset with no director at all);
    # '1' is presumably a junk value in the raw data -- TODO confirm.
    .loc[lambda merged: ~merged['Director'].isin(['', '1'])]
)

**Who are the Top Actors?**

In [None]:
# TV Shows: `cast` is a ", "-separated list of names, so flatten all
# non-missing values into individual names and tally them, most frequent first.
tv_names = ", ".join(tv_show['cast'].dropna()).split(", ")
actor1 = pd.DataFrame(Counter(tv_names).most_common(), columns=['Actor', 'TV Show'])

# Movies: identical tally for the movie subset.
movie_names = ", ".join(movie['cast'].dropna()).split(", ")
actor2 = pd.DataFrame(Counter(movie_names).most_common(), columns=['Actor', 'Movie'])

# Combine both tallies, zero-fill the side an actor is absent from, add the
# total, and drop the literal '1' entry (presumably a junk value in the raw
# data -- TODO confirm).
actor = (
    pd.merge(actor1, actor2, on='Actor', how='outer')
    .fillna({'TV Show': 0, 'Movie': 0})
    .assign(Total=lambda merged: merged['TV Show'] + merged['Movie'])
    .loc[lambda merged: merged['Actor'] != '1']
)

In [None]:
# Export the four summary tables. index=False is passed because the row index
# carries no information here (it is just the merge position, with gaps where
# placeholder rows were dropped) and would otherwise be written as an unnamed
# first column.
genre.to_csv('/content/drive/MyDrive/Dataset/top_genre.csv', index=False)
country.to_csv('/content/drive/MyDrive/Dataset/top_country.csv', index=False)
director.to_csv('/content/drive/MyDrive/Dataset/top_director.csv', index=False)
actor.to_csv('/content/drive/MyDrive/Dataset/top_actor.csv', index=False)