In [23]:
import pandas as pd
import numpy as np
import re # Regular expression module
from sklearn.feature_extraction.text import TfidfVectorizer # Vectorizer for text data
from sklearn.metrics.pairwise import cosine_similarity # Function to calculate cosine similarity between vectors
import ipywidgets as widgets # Widgets for interactive UI in Jupyter Notebooks
from IPython.display import display # Function to display widgets in Jupyter Notebooks

# Build Search engine

In [None]:
# Location of the movies CSV. A raw string is used because the Windows path
# contains backslashes (\M, \D, \m) that are not valid escape sequences; the
# previous f-string had no placeholders and only triggered a SyntaxWarning.
move_path = r"D:\Machine_Learning_course\Dataset\movies.csv"
movies = pd.read_csv(move_path)

In [None]:
# Preview the loaded movies table (the bare expression is rendered as the cell output)
movies

In [None]:
# Normalise a movie title so it can be matched as plain text
def clean_title(title):
    """Return ``title`` with every character other than ASCII letters, digits and spaces removed.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The stripped title, e.g. ``"Toy Story 1995"``.
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [None]:
# Store the cleaned version of every title in a "new_title" column, then show the table
movies["new_title"] = movies["title"].map(clean_title)
movies

In [None]:
# Create a TF-IDF vectorizer to convert movie titles into vectors
# ngram_range=(1, 2): use both single words and two-word sequences as features
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Fit on the cleaned titles and keep the resulting matrix (one row per movie) for searching
tfidf = vectorizer.fit_transform(movies["new_title"])

In [None]:
# Define a function to search for movies by title
def search(title):
    """Return the (up to) five movies whose titles best match ``title``.

    The query is normalised with ``clean_title``, transformed with the fitted
    TF-IDF ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, so
        ``results.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never request more rows than exist; argpartition raises if kth is out of range.
    top_k = min(5, similarity.shape[0])
    if top_k == 0:
        return movies.iloc[[]]

    # argpartition only guarantees WHICH entries are the k largest, not their
    # order, so the candidates are sorted explicitly (highest similarity first).
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]
    results = movies.iloc[indices]
    return results


In [None]:
# Create a text input widget for movie title search
movie_input = widgets.Text(
    value="Toy Story 1995",
    description="Movie Title:",
    placeholder="Enter movie title",
    disabled=False,  # was misspelled "dísabled", which is not the widget's `disabled` trait
)
movie_list = widgets.Output() # Output widget to display search results

# Callback fired whenever the text box value changes
def on_type(data):
    """Refresh the search results shown in ``movie_list`` for the newly typed text.

    ``data`` is the change dictionary passed by ``observe``; its ``"new"``
    entry holds the current text. Nothing is shown until the text is longer
    than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Call on_type every time the widget's "value" trait changes
movie_input.observe(on_type, names="value") # Observe changes in the text input

# Render the text box followed by the area that receives the search results
display(movie_input, movie_list) # Display the input widget and output widget

# Recommendation model

In [24]:
# Load the movies dataset
# Raw string: the backslashes in the Windows path (\M, \D, \m) are not valid
# escape sequences, and the previous f-string had no placeholders anyway.
movie_path = r"D:\Machine_Learning_course\Dataset\movies.csv"
movies = pd.read_csv(movie_path)
movies

  movie_path = f"D:\Machine_Learning_course\Dataset\movies.csv"


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [25]:
# Load ratings data
# Raw string keeps the backslashes of the Windows path literal
rating_path = r"D:\Machine_Learning_course\Dataset\ratings.csv"
ratings = pd.read_csv(rating_path)
# Preview the table (bare expression is rendered as the cell output)
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [27]:
# Distinct users found in the ratings table and distinct movies found in the movies table
n_user = ratings["userId"].unique().size
n_movie = movies["movieId"].unique().size

print("Number of unique users:", n_user)
print("Number of unique movies:", n_movie)

Number of unique users: 162541
Number of unique movies: 62423


In [None]:
# Find the users who rated a specific movie strictly above 4.0
# (note: the filter is `> 4.0`, not "5 or higher")
movie_id = 1
similar_user = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4.0)]\
               ["userId"].unique()
similar_user

In [None]:
len(similar_user) # Count of users who rated the movie strictly above 4.0

In [None]:
# Movie ids of every rating above 4 given by one of the similar users
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_user) & (ratings["rating"] > 4),
    "movieId",
]

In [None]:
# How many times each movie id occurs among the similar users' high ratings
similar_user_recs.value_counts()

In [None]:
# Normalize the recommendations by dividing by the total number of similar users.
# The counts are recomputed from `ratings` instead of from `similar_user_recs`
# itself, so re-running this cell always yields the same result (previously the
# cell overwrote its own input and a second run counted the fractions instead).
liked_by_similar = ratings.loc[
    ratings["userId"].isin(similar_user) & (ratings["rating"] > 4),
    "movieId",
]
similar_user_recs = liked_by_similar.value_counts() / len(similar_user)
# Keep only the movies whose share among the similar users is above 10%
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

In [None]:
# Every rating above 4, from any user, for the movies kept in similar_user_recs
all_users = ratings[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]
all_users


In [None]:
# Number of rows in all_users (ratings above 4) per shortlisted movie
all_users["movieId"].value_counts()

In [None]:
# Per-movie count in all_users divided by the number of distinct users in all_users
all_users_recs = all_users["movieId"].value_counts().div(
    all_users["userId"].unique().size
)
all_users_recs

In [None]:
# Put the two per-movie shares side by side, aligned on movie id:
# "similar" = share among the similar users, "all" = share among all users in all_users
rec_percentage = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentage.columns = ["similar","all"]
rec_percentage

In [None]:
# "score" tells you which movies are especially well suited to the group of users
# similar to you, so that those movies can be prioritised as recommendations.
# "similar": which movies the users similar to you like.
# "all": among the movies liked by the similar group, how many users overall also like them.
# "score" answers the question:
# "Which movies fit users with tastes like yours especially well, compared with all users?"
rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]

In [None]:
# Rank the movies from highest to lowest score
rec_percentage = rec_percentage.sort_values("score", ascending=False)
rec_percentage

In [None]:
# Attach the movie columns to the ten best-scoring rows (index of rec_percentage is matched to movies["movieId"])
rec_percentage.head(10).merge(movies, left_index=True, right_on="movieId")

In [None]:
def get_recommendations(movie_id, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` form the "similar" group.
    For every movie liked by more than ``min_share`` of that group, the
    score is the share of similar users who liked it divided by the share
    of all liking users who liked it, so a high score marks a movie that
    is disproportionately popular with the similar group.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Ratings strictly above this value count as a like.
    min_share : float, default 0.1
        Minimum share of similar users that must like a movie for it to
        be considered.
    top_n : int, default 10
        Number of best-scoring movies to look up in ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, best score first.
        Empty when nobody liked ``movie_id``.
    """
    # Computed once and reused by the three filters below.
    liked = ratings["rating"] > min_rating

    similar_user = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    if len(similar_user) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of the similar users who liked each movie.
    similar_user_recs = ratings.loc[liked & ratings["userId"].isin(similar_user), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_user)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (who liked any shortlisted movie) who liked each movie.
    all_users = ratings[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentage = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentage.columns = ["similar", "all"]
    rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]

    rec_percentage = rec_percentage.sort_values("score", ascending=False)
    rec_percentage = rec_percentage.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

    return rec_percentage

In [None]:
# Create a TF-IDF vectorizer to convert movie titles into vectors
# `movies` is re-read from CSV for the recommendation model, which drops the
# derived "new_title" column; rebuild it when missing so this cell also works
# on a fresh kernel instead of raising KeyError.
if "new_title" not in movies.columns:
    movies["new_title"] = movies["title"].apply(clean_title)
# ngram_range=(1, 2): use both single words and two-word sequences as features
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["new_title"])

In [None]:
# Define a function to search for movies by title
def search(title):
    """Return the (up to) five movies whose titles best match ``title``.

    The query is normalised with ``clean_title``, transformed with the fitted
    TF-IDF ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, so
        ``results.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never request more rows than exist; argpartition raises if kth is out of range.
    top_k = min(5, similarity.shape[0])
    if top_k == 0:
        return movies.iloc[[]]

    # argpartition only guarantees WHICH entries are the k largest, not their
    # order, so the candidates are sorted explicitly (highest similarity first).
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]
    results = movies.iloc[indices]
    return results

In [None]:
# Text box in which the user types the movie title to get recommendations for
movie_input_name = widgets.Text(
    description="Movie Title:",
    placeholder="Enter movie title",
    value="Toy Story",
    disabled=False,
)

# Output area used for the recommendation table
recommendation_lists = widgets.Output()

def on_type(data):
    """Show recommendations for the movie matching the newly typed text.

    ``data`` is the change dictionary passed by ``observe``; its ``"new"``
    entry holds the current text. Once the text is longer than five
    characters, the first row returned by ``search`` is taken as the chosen
    movie and its recommendations are rendered into ``recommendation_lists``.
    """
    with recommendation_lists:
        recommendation_lists.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        first_hit = search(typed).iloc[0]
        display(get_recommendations(first_hit["movieId"]))

# Call on_type every time the widget's "value" trait changes
movie_input_name.observe(on_type, names="value")

# Render the text box followed by the area that receives the recommendations
display(movie_input_name, recommendation_lists) # Display the input widget and output widget