In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import ast
import numpy as np

In [4]:
df = pd.read_csv('NetFlix.csv')
df.shape

(7787, 12)

In [6]:
#tfidf
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])

In [8]:
#fill in the blank cells
df = df.fillna('Unknown')

split_countries = df['country'].str.split(',')

#make country list
all_countries = split_countries.explode()

#get unique countries
unique_countries = all_countries.unique()

print("Unique countries:", unique_countries)

Unique countries: ['Brazil' 'India' 'Indonesia' 'United States' 'United Kingdom' ' France'
 'Germany' ' United States' 'Canada' 'Japan' 'Thailand' 'Unknown'
 'Denmark' ' Sweden' ' Israel' 'Ireland' ' United Kingdom' 'Spain'
 ' Iceland' 'Colombia' 'United Arab Emirates' ' India' 'New Zealand'
 'Netherlands' 'Egypt' 'Australia' 'Sweden' 'Russia' 'Norway' ' Germany'
 'Finland' 'Philippines' ' Spain' ' Russia' ' Poland' 'Lebanon'
 ' Australia' 'Singapore' 'Poland' 'Bulgaria' 'South Korea' ' Canada'
 ' Ireland' ' Italy' ' South Africa' 'France' ' Belgium' 'Hong Kong'
 'Kenya' 'Taiwan' 'Turkey' 'China' 'Italy' ' China' ' Singapore'
 ' Denmark' ' Japan' ' Malta' 'Kuwait' 'Belgium' 'Argentina' 'Nigeria'
 ' Pakistan' 'Mexico' ' New Zealand' ' Mexico' 'Iceland' ' Czech Republic'
 ' Bahamas' 'South Africa' 'Romania' 'Pakistan' ' Sri Lanka'
 ' Cayman Islands' ' Bangladesh' 'Malaysia' '' ' Hong Kong' ' Switzerland'
 ' Argentina' ' Luxembourg' 'Czech Republic' 'Vietnam' 'Zimbabwe'
 ' Brazil' 'Hungar

In [10]:
cleaned_countries = [country.strip() for country in unique_countries if country.strip() != ""]

#define regions
regions = {
    'Africa': ['South Africa', 'Kenya', 'Nigeria', 'Egypt', 'Zimbabwe', 'Angola', 'Namibia', 'Botswana', 'Uganda',
               'Sudan', 'Somalia', 'Ghana', 'Senegal', 'Mauritius', 'Algeria', 'Morocco', 'Tunisia', 'Libya'],
    'Asia': ['India', 'China', 'Japan', 'South Korea', 'Indonesia', 'Thailand', 'Vietnam', 'Singapore', 'Malaysia',
             'Pakistan', 'Bangladesh', 'Sri Lanka', 'Taiwan', 'Philippines', 'Israel', 'Turkey', 'Iran', 'Iraq',
             'Jordan', 'Lebanon', 'Syria', 'Kazakhstan', 'Mongolia', 'Afghanistan'],
    'Europe': ['United Kingdom', 'Germany', 'France', 'Italy', 'Spain', 'Poland', 'Sweden', 'Denmark', 'Norway',
               'Finland', 'Netherlands', 'Belgium', 'Switzerland', 'Portugal', 'Ireland', 'Greece', 'Czech Republic',
               'Hungary', 'Austria', 'Russia', 'Ukraine', 'Slovakia', 'Lithuania', 'Latvia', 'Serbia', 'Slovenia',
               'Bulgaria', 'Romania', 'Iceland', 'Malta', 'Liechtenstein', 'Luxembourg'],
    'North America': ['United States', 'Canada', 'Mexico', 'Cuba', 'Bahamas', 'Puerto Rico', 'Dominican Republic'],
    'South America': ['Brazil', 'Argentina', 'Chile', 'Colombia', 'Uruguay', 'Venezuela', 'Paraguay', 'Peru'],
    'Oceania': ['Australia', 'New Zealand', 'Samoa'],
    'Middle East': ['Saudi Arabia', 'United Arab Emirates', 'Kuwait', 'Qatar', 'Bahrain', 'Oman'],
    'Unknown': ['Unknown']
}

def map_to_region(country):
    for region, countries in regions.items():
        if country in countries:
            return region
    return 'Unknown'

def map_to_regions(countries):
    # Split country names
    country_list = [country.strip() for country in countries.split(',') if country.strip()]
    # Map regions for each country
    region_list = {map_to_region(country) for country in country_list}
    return ', '.join(region_list)

# Apply function to 'country' column
df['region'] = df['country'].apply(lambda x: map_to_regions(x) if isinstance(x, str) else 'Other')

df.head(20)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres,description,region
0,s1,TV Show,3%,Unknown,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...,South America
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers",An architect and his wife move into a castle t...,Asia
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,"Dramas, International Movies, Sports Movies",Three Indonesian women break records by becomi...,Asia
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,Comedies,New NFL star Thad buys his old teammates' belo...,North America
4,s1001,TV Show,Blue Planet II,Unknown,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV",This sequel to the award-winning nature series...,Europe
5,s1002,Movie,Blue Ruin,Jeremy Saulnier,"Macon Blair, Devin Ratray, Amy Hargreaves, Kev...","United States, France",25-Feb-19,2013,R,90,"Independent Movies, Thrillers",Bad news from the past unhinges vagabond Dwigh...,"Europe, North America"
6,s1003,Movie,Blue Streak,Les Mayfield,"Martin Lawrence, Luke Wilson, Peter Greene, Da...","Germany, United States",1-Jan-21,1999,PG-13,94,"Action & Adventure, Comedies",A jewel thief returns to his hiding place afte...,"Europe, North America"
7,s1004,Movie,Blue Valentine,Derek Cianfrance,"Ryan Gosling, Michelle Williams, Faith Wladyka...",United States,5-Jul-18,2010,R,112,"Dramas, Independent Movies, Romantic Movies",As Cindy and Dean muddle through their languis...,North America
8,s1005,Movie,BluffMaster!,Rohan Sippy,"Abhishek Bachchan, Priyanka Chopra, Riteish De...",India,8-Jan-21,2005,TV-14,129,"Comedies, International Movies, Romantic Movies",When his girlfriend learns the truth about his...,Asia
9,s1006,Movie,Blurred Lines: Inside the Art World,Barry Avrich,Unknown,Canada,31-Dec-17,2017,TV-MA,85,Documentaries,Artists and industry insiders shed light on th...,North America


In [22]:
# make some columns list
df['genres'] = df['genres'].apply(lambda x: [genre.strip() for genre in x.split(',')])
df['region'] = df['region'].apply(lambda x: [region.strip() for region in x.split(',')])
df['director'] = df['director'].apply(lambda x: [director.strip() for director in x.split(',')])

#clean TV and Movies from genres
def clean_genres(genres):
    filtered_genres = []
    for genre in genres:
        if 'TV' not in genre and 'Movies' not in genre and 'TV Shows' not in genre:
            if '&' in genre:
                filtered_genres.extend([g.strip() for g in genre.split(' & ')])
            else:
                filtered_genres.append(genre.strip())
    return filtered_genres

#unique genres list
all_genres = set()
df['genres_cleaned'] = df['genres'].apply(clean_genres)
for genres in df['genres_cleaned']:
    all_genres.update(genres)

#unique regions list
all_regions = ['Africa', 'Asia', 'Europe', 'North America', 'South America', 'Oceania', 'Middle East', 'Unknown']

#unique directors list
all_directors = sorted(set(director for directors in df['director'] for director in directors))

#one hot encoding
genre_columns = pd.DataFrame({genre: df['genres_cleaned'].apply(lambda x: 1 if genre in x else 0) for genre in all_genres})
region_columns = pd.DataFrame({region: df['region'].apply(lambda x: 1 if region in x else 0) for region in all_regions})
director_columns = pd.DataFrame({director: df['director'].apply(lambda x: 1 if director in x else 0) for director in all_directors})

#one hot encoding
other_one_hot = pd.get_dummies(df['type'])
other_one_hot = other_one_hot.astype(int)

#merge data
df_final = pd.concat([genre_columns, region_columns, director_columns, other_one_hot], axis=1)


In [23]:
df_final.head()

Unnamed: 0,Dramas,Spirituality,Stand-Up Comedy,Sci-Fi,Thrillers,Action,Adventure,Talk Shows,Documentaries,Anime Features,...,Çagan Irmak,Éric Judor,Éric Toledano,Éric Warin,Ísold Uggadóttir,Óskar Thór Axelsson,Ömer Faruk Sorak,Şenol Sönmez,Movie,TV Show
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [24]:
column_names = df_final.columns.tolist()

#delete Unknown column
df_final = df_final.drop(columns=["Unknown"])
print(column_names)

['Dramas', 'Spirituality', 'Stand-Up Comedy', 'Sci-Fi', 'Thrillers', 'Action', 'Adventure', 'Talk Shows', 'Documentaries', 'Anime Features', 'Faith', 'Docuseries', 'Music', 'Musicals', 'Comedies', 'Anime Series', 'Fantasy', 'Africa', 'Asia', 'Europe', 'North America', 'South America', 'Oceania', 'Middle East', 'Unknown', 'A. L. Vijay', 'A. Raajdheep', 'A. Salaam', 'A.R. Murugadoss', 'Aadish Keluskar', 'Aamir Bashir', 'Aamir Khan', 'Aanand Rai', 'Aaron Burns', 'Aaron Hancox', 'Aaron Hann', 'Aaron Lieber', 'Aaron Moorhead', 'Aaron Nee', 'Aaron Sorkin', 'Aaron Woodley', 'Aaron Woolf', 'Aatmaram Dharne', 'Abba T. Makama', 'Abbas Alibhai Burmawalla', 'Abbas Mustan', 'Abbas Tyrewala', 'Abby Epstein', 'Abdellatif Kechiche', 'Abdul Aziz Hashad', 'Abdullah Al Noor', 'Abel Ferrara', 'Abhay Chopra', 'Abhijeet Deshpande', 'Abhijit Kokate', 'Abhijit Panse', 'Abhinav Shiv Tiwari', 'Abhinay Deo', 'Abhishek Chaubey', 'Abhishek Kapoor', 'Abhishek Saxena', 'Abhishek Sharma', 'Abhishek Varman', 'Abu Bakr

In [25]:
#one hot encoding for duration
def categorize_duration(value):
    if value <= 50:
        return '0-50'
    elif value <= 100:
        return '51-100'
    elif value <= 150:
        return '101-150'
    else:
        return '151+'

df['duration_category'] = df['duration'].apply(categorize_duration)
df_duration_one_hot = pd.get_dummies(df['duration_category'], prefix='duration')

df_duration_one_hot = df_duration_one_hot.astype(int)

df_final = pd.concat([df_final, df_duration_one_hot], axis=1)


In [26]:
df_final.head()

Unnamed: 0,Dramas,Spirituality,Stand-Up Comedy,Sci-Fi,Thrillers,Action,Adventure,Talk Shows,Documentaries,Anime Features,...,Ísold Uggadóttir,Óskar Thór Axelsson,Ömer Faruk Sorak,Şenol Sönmez,Movie,TV Show,duration_0-50,duration_101-150,duration_151+,duration_51-100
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


In [46]:
# output_path = 'df_final.csv'
# df_final.to_csv(output_path, index=False)

# print(f"DataFrame saved successfully to {output_path}")

DataFrame saved successfully to df_final.csv


In [27]:
#make lists of unique genres
print(all_genres)

{'Dramas', 'Spirituality', 'Stand-Up Comedy', 'Sci-Fi', 'Thrillers', 'Action', 'Adventure', 'Talk Shows', 'Documentaries', 'Anime Features', 'Faith', 'Docuseries', 'Music', 'Musicals', 'Comedies', 'Anime Series', 'Fantasy'}


In [51]:
df['text_combined'] = df['title'] + " " + df['description']

#set tfidf
vectorizer = TfidfVectorizer(ngram_range=(1, 3))  # UnigramからTrigramを指定
tfidf_matrix = vectorizer.fit_transform(df['text_combined'])

#def
def recommend_show(user_input, df, tfidf_matrix, vectorizer):
    #vectorize user input
    user_input_vector = vectorizer.transform([user_input])
    
    similarities = cosine_similarity(user_input_vector, tfidf_matrix)
    
    #sort
    recommendations = similarities.argsort()[0][::-1]
    
    top_recommendations = recommendations[:10]
    return df.iloc[top_recommendations][['title', 'description','genres','director']]

In [53]:
#user input
user_input = input("Please enter keywords")
recommended_shows = recommend_show(user_input, df, tfidf_matrix, vectorizer)

recommended_shows.head()

Please enter keywords Ömer Faruk Sorak


Unnamed: 0,title,description,genres,director
1742,Hükümet Kadin 2,"In 1949, with freedoms threatened, housewife H...","[Comedies, International Movies]",[Sermiyan Midyat]
3074,Love Me As I Am,After wealthy college boy Ömer and lower class...,"[International TV Shows, Romantic TV Shows, TV...",[Unknown]
7786,Blue Jay,Two former high school sweethearts unexpectedl...,"[Dramas, Independent Movies, Romantic Movies]",[Alex Lehmann]
2586,Katt Williams: The Pimp Chronicles: Pt. 1,"In this 2006 show, comic Katt Williams uses hi...",[Stand-Up Comedy],[Gary Binkow]
2588,Kavin Jay: Everybody Calm Down!,"On a mission to defy stereotypes, Malaysian st...",[Stand-Up Comedy],[Michael McKay]
