In [275]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [276]:
df = pd.read_csv('NetFlix.csv')

df['description'] = df['description'].fillna('')

In [277]:
df.shape

(7787, 12)

In [278]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers",An architect and his wife move into a castle t...
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,"Dramas, International Movies, Sports Movies",Three Indonesian women break records by becomi...
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,Comedies,New NFL star Thad buys his old teammates' belo...
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV",This sequel to the award-winning nature series...


In [279]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])
tfidf_matrix.shape

(7787, 17905)

In [280]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()


In [281]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [282]:
def get_recommendations(title, cosine_sim=cosine_sim, num_recommend = 10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_similar = sim_scores[1:num_recommend+1]
    movie_indices = [i[0] for i in top_similar]
    return df['title'].iloc[movie_indices]

In [283]:
#cosine sim with title
get_recommendations('3%', num_recommend = 20)

5359                            Stoked
6141                 The Keeping Hours
5584                            Teresa
6565                      The Stranded
2929                       Lifechanger
3726      My Schoolmate, the Barbarian
6957                Tu Hai Mera Sunday
2350                              Jail
4361                            Powder
2643                      Kill Command
2599    Ken Jeong: You Complete Me, Ho
7535                               Zoo
3594                        Mossad 101
1928                      High Society
2059        How to Train Your Dragon 2
7283         What Happens to My Family
6426             The Princess Weiyoung
4754                 Room on the Broom
6078                             Azali
3597                             Mosul
Name: title, dtype: object

In [284]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import ast

df['genres'] = df['genres'].apply(lambda x: [genre.strip() for genre in x.split(',')])

all_genres = set([genre for sublist in df['genres'] for genre in sublist])

genre_columns = pd.DataFrame({genre: df['genres'].apply(lambda x: 1 if genre in x else 0) for genre in all_genres})

df = pd.concat([df, genre_columns], axis=1)


In [285]:
df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,TV Dramas,Anime Features,Children & Family Movies,Comedies,Thrillers,International Movies,Faith & Spirituality,Reality TV,Classic Movies,LGBTQ Movies
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,...,1,0,0,0,0,0,0,0,0,0
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,...,0,0,0,0,1,1,0,0,0,0
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,...,0,0,0,0,0,1,0,0,0,0
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,...,0,0,0,1,0,0,0,0,0,0
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,...,0,0,0,0,0,0,0,0,0,0


In [286]:
unique_genres = set([genre for sublist in df['genres'] for genre in sublist])

unique_genre_list = sorted(list(unique_genres))

print(unique_genre_list)

['Action & Adventure', 'Anime Features', 'Anime Series', 'British TV Shows', 'Children & Family Movies', 'Classic & Cult TV', 'Classic Movies', 'Comedies', 'Crime TV Shows', 'Cult Movies', 'Documentaries', 'Docuseries', 'Dramas', 'Faith & Spirituality', 'Horror Movies', 'Independent Movies', 'International Movies', 'International TV Shows', "Kids' TV", 'Korean TV Shows', 'LGBTQ Movies', 'Movies', 'Music & Musicals', 'Reality TV', 'Romantic Movies', 'Romantic TV Shows', 'Sci-Fi & Fantasy', 'Science & Nature TV', 'Spanish-Language TV Shows', 'Sports Movies', 'Stand-Up Comedy', 'Stand-Up Comedy & Talk Shows', 'TV Action & Adventure', 'TV Comedies', 'TV Dramas', 'TV Horror', 'TV Mysteries', 'TV Sci-Fi & Fantasy', 'TV Shows', 'TV Thrillers', 'Teen TV Shows', 'Thrillers']


In [287]:
df_genres['TV Show'] = df['type'].apply(lambda x: 1 if x == 'TV Shows' else 0)
df_genres['Movie'] = df['type'].apply(lambda x: 1 if x == 'Movies' else 0)

In [288]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,TV Dramas,Anime Features,Children & Family Movies,Comedies,Thrillers,International Movies,Faith & Spirituality,Reality TV,Classic Movies,LGBTQ Movies
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,...,1,0,0,0,0,0,0,0,0,0
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,...,0,0,0,0,1,1,0,0,0,0
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,5-Jan-19,2016,TV-PG,124,...,0,0,0,0,0,1,0,0,0,0
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,1-Mar-16,2016,R,90,...,0,0,0,1,0,0,0,0,0,0
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,3-Dec-18,2017,TV-G,1,...,0,0,0,0,0,0,0,0,0,0


In [289]:
def convert_to_list(x):
    if isinstance(x, str):
        return ast.literal_eval(x)
    elif isinstance(x, list):
        return x
    else:
        return []

df['genres'] = df['genres'].apply(convert_to_list)

unique_genres = set([genre for sublist in df['genres'] for genre in sublist])

unique_genre_list = sorted(list(unique_genres))

print(unique_genre_list)

['Action & Adventure', 'Anime Features', 'Anime Series', 'British TV Shows', 'Children & Family Movies', 'Classic & Cult TV', 'Classic Movies', 'Comedies', 'Crime TV Shows', 'Cult Movies', 'Documentaries', 'Docuseries', 'Dramas', 'Faith & Spirituality', 'Horror Movies', 'Independent Movies', 'International Movies', 'International TV Shows', "Kids' TV", 'Korean TV Shows', 'LGBTQ Movies', 'Movies', 'Music & Musicals', 'Reality TV', 'Romantic Movies', 'Romantic TV Shows', 'Sci-Fi & Fantasy', 'Science & Nature TV', 'Spanish-Language TV Shows', 'Sports Movies', 'Stand-Up Comedy', 'Stand-Up Comedy & Talk Shows', 'TV Action & Adventure', 'TV Comedies', 'TV Dramas', 'TV Horror', 'TV Mysteries', 'TV Sci-Fi & Fantasy', 'TV Shows', 'TV Thrillers', 'Teen TV Shows', 'Thrillers']


In [290]:
def recommend(selected_genres):
    user_profile = {genre: 1 if genre in selected_genres else 0 for genre in all_genres}
    user_profile_vector = pd.DataFrame([user_profile])
    
    book_genre_vectors = df[list(all_genres)]
    
    similarity_scores = cosine_similarity(user_profile_vector, book_genre_vectors)
    
    df['matching_genres'] = df['genres'].apply(lambda x: len(set(x) & set(selected_genres)))
    
    df['similarity'] = similarity_scores[0] + (df['matching_genres']/10)
    
    df_sorted = df.sort_values(by='similarity', ascending=False)
    
    df_filtered = df_sorted[df_sorted['matching_genres'] > 0]
    
    N = 3
    top_recommendations = df_filtered[['title', 'type', 'genres', 'similarity']].head(N)
    
    print(top_recommendations)

In [296]:
#cosine sim with genres
user_genres = ['Action & Adventure', 'TV Show']
recommend(user_genres)

           title   type                genres  similarity
5616     The 2nd  Movie  [Action & Adventure]         1.1
7683    Big Kill  Movie  [Action & Adventure]         1.1
5945  Avengement  Movie  [Action & Adventure]         1.1


In [298]:
df['combined_features'] = df['description'] + " " + df['genres'].apply(lambda x: " ".join(x))

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

import numpy as np
genre_matrix = genre_columns.to_numpy()
combined_matrix = np.hstack([tfidf_matrix.toarray(), genre_matrix])

cosine_sim = cosine_similarity(combined_matrix)

def get_recommendations(title, cosine_sim, df, num_recommend=10):
    if title not in df['title'].values:
        raise ValueError(f"Title '{title}' not found in the DataFrame.")
    
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_similar = sim_scores[1:num_recommend + 1]
    movie_indices = [i[0] for i in top_similar]
    return df['title'].iloc[movie_indices]

In [300]:
#cosine sim genre+descriptions and title as user input
recommendations = get_recommendations('1920', cosine_sim, df, num_recommend=3)
print(recommendations)

7669             Bhoot
2000            #Alive
693     Darna Mana Hai
Name: title, dtype: object


In [None]:
#trying to combine all text data

In [325]:
df['combined_text'] = (
    df['title'].fillna('') + ' ' +
    df['director'].fillna('') + ' ' +
    df['cast'].fillna('') + ' ' +
    df['country'].fillna('') + ' ' +
    df['rating'].fillna('') + ' ' +
    df['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
)

tfidf_matrix2 = tfidf.fit_transform(df['combined_text'])
cosine_sim = cosine_similarity(tfidf_matrix2, tfidf_matrix2)

def recommend2(title, cosine_sim=cosine_sim, df=df):
    title = title.strip().lower()
    df['title'] = df['title'].str.strip().str.lower()
    
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    
    if title not in indices:
        print(f"Title '{title}' not found in the dataset.")
        return pd.DataFrame() 
    
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6] 
    
    show_indices = [i[0] for i in sim_scores]
    print("Selected indices for recommendations:", show_indices)
    print("Similarity scores for selected indices:", [sim_scores[i] for i in range(len(sim_scores))])
    
    return df.iloc[show_indices]


In [326]:
recommendations = recommend2("Avengement")
print(recommendations)

Selected indices for recommendations: [3013, 4903, 5870, 6924, 238]
Similarity scores for selected indices: [(3013, 0.19311859134620044), (4903, 0.17746880216969307), (5870, 0.16742880899154608), (6924, 0.16467334030309488), (238, 0.13285169979690312)]
     show_id   type                title                         director  \
3013   s3710  Movie         london heist                     Mark McQueen   
4903   s5411  Movie           savage dog                 Jesse V. Johnson   
5870   s6282  Movie   the debt collector                 Jesse V. Johnson   
6924   s7230  Movie        triple threat  Jesse V. Johnson, Jesse Johnson   
238    s1212  Movie  cardboard gangsters                    Mark O'Connor   

                                                   cast  \
3013  Craig Fairbrass, James Cosmo, Mem Ferda, Nick ...   
4903  Scott Adkins, Juju Chan, Marko Zaror, Vladimir...   
5870  Scott Adkins, Louis Mandylor, Vladimir Kulich,...   
6924  Tony Jaa, Iko Uwais, Tiger Chen, Scott Adk