In [1]:
import pandas as pd
## Load the MovieLens tables from the current working directory.
## Columns used later in this notebook: movies -> movieId, title, genres; ratings -> userId, movieId, rating.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

## Dataset: MovieLens 25M - https://files.grouplens.org/datasets/movielens/ml-25m.zip (linked from the tutorial video https://www.youtube.com/watch?v=eyEabQRBMQA)

In [2]:
import re ##regular expression library

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Args:
        title: Raw movie title, e.g. "Toy Story (1995)".

    Returns:
        The title with all other characters removed, e.g. "Toy Story 1995".
        Letter case and existing spaces are left untouched.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

## Add a cleaned copy of every title as a new column on `movies`;
## this column is the text the TF-IDF vectorizer is fitted on further down.
movies["clean_title"] = movies["title"].apply(clean_title)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Fit a TF-IDF vectorizer on the cleaned titles and keep the resulting matrix,
## so that search() can compare a query vector against every title.
vectorizer = TfidfVectorizer(ngram_range=(1,2)) ## Includes both unigrams and bigrams, scroll down for more notes
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [12]:
def search(title, top_n=5):
    """Return the movies whose titles best match a free-text query.

    The query is cleaned with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text movie title to look up.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        Rows of ``movies`` for the ``top_n`` most similar titles, ordered from
        most to least similar, so the first row is the best match. The frame
        is empty when there is nothing to return.
    """
    ## clean the query the same way the indexed titles were cleaned
    title = clean_title(title)

    ## transform the query to a vector in the vocabulary learned from the dataset
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    ## never ask for more rows than exist: argpartition raises when kth is out of bounds
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[:0]

    ## argpartition only isolates the top_n scores, it does not order them,
    ## so rank that small candidate set explicitly, best match first
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [13]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating`` (the comparison is ``>``, not ``>=``).

    Steps:
        1. Collect the users who liked ``movie_id`` ("similar users").
        2. For every movie, compute the share of similar users who liked it
           and keep the movies whose share exceeds ``min_share``.
        3. For those movies, compute the share of likes among all users who
           liked at least one of them.
        4. Score each movie by the ratio similar-share / overall-share.

    Args:
        movie_id: ``movieId`` of the movie to base the recommendations on.
        min_rating: Ratings strictly above this value count as likes.
        min_share: Minimum fraction of similar users that must like a movie.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the columns ``diff`` (the ratio score), ``movieId``,
        ``title`` and ``genres``, sorted by ``diff`` in descending order and
        limited to ``top_n`` rows. The frame is empty when nobody liked
        ``movie_id`` or no movie passes ``min_share``.
    """
    output_columns = ["diff", "movieId", "title", "genres"]

    ### every "like" in the ratings table; filtered once and reused below
    liked = ratings[ratings["rating"] > min_rating]

    ### find users that also like this specific movie
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        ### nobody liked this movie: nothing to base a recommendation on
        return pd.DataFrame(columns=output_columns)

    ### share of the similar users that like each movie, kept only above min_share
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]
    if similar_user_recs.empty:
        return pd.DataFrame(columns=output_columns)

    ### likes given by ALL users to the candidate movies
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]

    ### to find out if the rec % is the same as all user rec or unique to similar user rec
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    ### rec list: "diff" is the ratio of the two shares, higher = more specific to similar users
    rec_percentage = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentage.columns = ["similar", "all"]
    rec_percentage["diff"] = rec_percentage["similar"] / rec_percentage["all"]
    rec_percentage = rec_percentage.sort_values("diff", ascending=False)
    rec_list = rec_percentage.merge(movies, left_index=True, right_on="movieId")

    ### select the output columns by name rather than by position
    return rec_list[output_columns].head(top_n)


In [14]:
### Title of the movie to generate recommendations from
movie = "Toy Story 1995"

### search() returns the closest title matches; take the movieId of the first returned row
movie_id = search(movie)["movieId"].iloc[0]

find_similar_movies(movie_id)

Unnamed: 0,diff,movieId,title,genres
0,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
14813,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4780,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
580,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
6258,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy
587,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
8246,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
359,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX


In [7]:
## TODO: the Node.js environment issue is not resolved yet; once it is, the interactive widget below can be enabled.

# import ipywidgets as widgets
# from IPython.display import display

# movie_input = widgets.Text(
#     value='Toy Story',
#     description='Movie Title:',
#     disabled=False
# )
# movie_list = widgets.Output()

# def on_type(data):
#     with movie_list:
#         movie_list.clear_output()
#         title = data["new"]
#         if len(title) > 5:
#             display(search(title))

# movie_input.observe(on_type, names='value')


# display(movie_input, movie_list)

In [9]:
# https://www.youtube.com/watch?v=eyEabQRBMQA&lc=Ugydd1bHyAir29MHSBZ4AaABAg

In [10]:
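## ngram_range parameter in TfidfVectorizer (the examples keep the original casing for readability; with the default lowercase=True the actual tokens are lower-cased):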

# ngram_range=(1, 1):
# Only unigrams (single words) are used. 
# Example: "The quick brown fox" -> ["The", "quick", "brown", "fox"].

# ngram_range=(1, 2):
# Includes both unigrams and bigrams.
# Example: "The quick brown fox" -> ["The", "quick", "brown", "fox", "The quick", "quick brown", "brown fox"].

# ngram_range=(1, 3):
# Covers unigrams, bigrams, and trigrams.
# Example: "The quick brown fox" -> ["The", "quick", "brown", "fox", "The quick", "quick brown", "brown fox", "The quick brown", "quick brown fox"].

# ngram_range=(2, 2):
# Only bigrams.
# Example: "The quick brown fox" -> ["The quick", "quick brown", "brown fox"].

# ngram_range=(2, 3):
# Includes bigrams and trigrams, but not unigrams.
# Example: "The quick brown fox" -> ["The quick", "quick brown", "brown fox", "The quick brown", "quick brown fox"].

# ngram_range=(3, 3):
# Only trigrams.
# Example: "The quick brown fox" -> ["The quick brown", "quick brown fox"].

## Notes:
# - Unigrams (1,1) are suitable for most text classification tasks.
# - Bigrams (2,2) or unigrams and bigrams (1,2) help capture meaningful phrases.
# - Higher n-grams (e.g., trigrams) might be useful in specific contexts but increase feature space dimensionality.
# - Choose ngram_range based on model performance and analysis requirements. Start with (1,1) or (1,2) and experiment as needed.


In [None]:
## Next steps:
# matching genres
# using metadata to improve the algorithm