## Understanding TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Step 1: a small corpus of movie titles to experiment on
corpus = [
    "Harry Potter and the Sorcerer's Stone",
    "The Dark Knight",
    "Harry Potter and the Goblet of Fire",
    "Harry Potter",
]

# Step 2: raw term counts -- one row per title, one column per vocabulary term
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(corpus)
count_terms = count_vec.get_feature_names_out()
tf_df = pd.DataFrame(data=count_matrix.toarray(), columns=count_terms)
print("\n🔢 Term Frequency (TF) Matrix:", tf_df, sep="\n")

# Step 3: TF-IDF weights for the same corpus.
# norm=None is passed so the vectorizer does not apply its row normalisation.
tfidf_vec = TfidfVectorizer(norm=None)
tfidf_matrix = tfidf_vec.fit_transform(corpus)
tfidf_terms = tfidf_vec.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)
print("\n📊 TF-IDF Matrix:", tfidf_df.round(3), sep="\n")

# Step 4: the learned IDF weight of every term, smallest first
idf_scores = dict(zip(tfidf_terms, tfidf_vec.idf_))
idf_df = pd.Series(idf_scores).sort_values()
print("\n📈 IDF Values:", idf_df, sep="\n")

# Step 5: mapping from each term to its column index in the matrices above
print("\n📚 Vocabulary (term → column index):", tfidf_vec.vocabulary_, sep="\n")


In [None]:
# Project a new query onto the vocabulary learned by tfidf_vec and show the
# resulting weights as a one-row table (rounded for readability).
query_vec = tfidf_vec.transform(["Harry Potter"])
query_df = pd.DataFrame(
    data=query_vec.toarray(),
    columns=tfidf_vec.get_feature_names_out(),
)
query_df.round(3)

# Movie Recommender

## TF-IDF & Cosine Similarity Search Engine

In [None]:
# @title
pip install pandas requests ipywidgets --quiet

In [None]:
# @title
import os
import requests, zipfile, io
import pandas as pd

import re

# Constants
zip_url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
zip_path = "ml-25m.zip"
extract_dir = "movielens_data"

# Step 1: Download only if the zip file is missing.
# The body is streamed to disk in chunks so the archive is never held in memory
# as a whole, and it is written to a temporary name first so an interrupted
# download cannot leave a truncated file at zip_path.
if not os.path.exists(zip_path):
    print("Downloading dataset...")
    partial_path = zip_path + ".part"
    with requests.get(zip_url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as the zip.
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_path, zip_path)
    print("Download complete.")

# Step 2: Extract whenever the extracted folder is missing. This check is
# independent of the download check, so a zip that is already present but was
# never unpacked still gets extracted.
if not os.path.isdir(os.path.join(extract_dir, "ml-25m")):
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("Extraction complete.")

def read_csv_pandas(fileName):
    """Load ``<extract_dir>/ml-25m/<fileName>.csv`` into a pandas DataFrame.

    Args:
        fileName: Base name of the CSV file, without the ``.csv`` extension.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(extract_dir, "ml-25m", f"{fileName}.csv")
    return pd.read_csv(csv_path)

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Args:
        title: The raw title string.

    Returns:
        The title with all other characters removed (nothing is substituted
        in their place).
    """
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)

In [None]:
# Load the movie metadata and add a punctuation-free copy of each title,
# which is the text the search index is built from.
movies = read_csv_pandas("movies").assign(
    clean_title=lambda frame: frame["title"].apply(clean_title)
)

# Load the user ratings.
ratings = read_csv_pandas("ratings")

In [None]:
# Preview the first rows of the movies table, including the derived clean_title column.
movies.head()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Index the cleaned titles using both single words and two-word phrases.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# One sparse row per movie; this matrix is what search() compares queries against.
tfidf = vectorizer.fit_transform(movies["clean_title"])

# Summarise the index instead of printing the whole vocabulary dict, which has
# one entry per distinct unigram/bigram and would flood the cell output.
print(f"TF-IDF matrix shape: {tfidf.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return up to five movies whose titles best match ``title``, best first.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text movie title typed by the user.

    Returns:
        Rows of ``movies`` ordered by decreasing similarity. Movies with zero
        similarity are left out, so the result is empty when nothing in the
        index shares a term with the query.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask argpartition for more elements than exist.
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # largest scores -- it does not order them -- so sort that small slice
    # explicitly to put the best match in the first row.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    # Rows with no overlap at all are not matches; dropping them lets callers
    # detect "nothing found" through an empty result.
    ranked = ranked[similarity[ranked] > 0]

    return movies.iloc[ranked]

## Recommendation

In [None]:
def find_similar_movies(movie_id, rating_threshold, pick_x_percentage, number_of_recommendations):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    Only ratings strictly above ``rating_threshold`` are considered "likes".

    Args:
        movie_id: ``movieId`` of the movie to base recommendations on.
        rating_threshold: A rating must be greater than this value to count.
        pick_x_percentage: Minimum fraction of the similar users that must have
            liked a movie for it to be considered (compared with ``>``).
        number_of_recommendations: Maximum number of rows to return.

    Returns:
        A DataFrame with columns ``score``, ``title`` and ``genres``, taken from
        the highest-scoring candidates.
    """
    # Keep only the ratings that count as a "like"; everything below uses this subset.
    liked = ratings[ratings["rating"] > rating_threshold]

    # Users who liked the target movie.
    fans = liked.loc[liked["movieId"] == movie_id, "userId"]

    # Every movie those users liked.
    fan_likes = liked.loc[liked["userId"].isin(fans), "movieId"]

    # Fraction of the fans that liked each movie, filtered by the minimum share.
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > pick_x_percentage]

    # Likes of the candidate movies across all users, as a fraction of the
    # distinct users who liked at least one candidate.
    candidate_likes = liked[liked["movieId"].isin(fan_share.index)]
    distinct_likers = len(candidate_likes["userId"].unique())
    overall_share = candidate_likes["movieId"].value_counts() / distinct_likers

    # Score = popularity among fans relative to popularity in general.
    comparison = pd.concat([fan_share, overall_share], axis=1)
    comparison.columns = ["similar", "all"]
    comparison["score"] = comparison["similar"] / comparison["all"]
    ranked = comparison.sort_values("score", ascending=False)

    # Attach title/genre information to the top candidates.
    best = ranked.head(number_of_recommendations)
    with_metadata = best.merge(movies, left_index=True, right_on="movieId")

    return with_metadata[["score", "title", "genres"]]


In [None]:
from IPython.display import display, clear_output
import ipywidgets as widgets

# Tunable settings passed to find_similar_movies() by the callback
number_of_recommendations = 10
rating_threshold = 4
pick_x_percentage = 0.1

# Free-text box where the user types a movie title
movie_name_input = widgets.Text(
    description='Movie Title:',
    placeholder='Toy Story',
)

# Output areas: one for the title matches, one for the recommendations
movie_list = widgets.Output()
recommendation_list = widgets.Output()

# Section headers rendered above each result table
movie_list_header = widgets.HTML("<h3>🔍 Matching Movies</h3>")
recommendation_header = widgets.HTML("<h3>🎬 Recommended Movies</h3>")

# Placeholder shown while recommendations are being computed
loader = widgets.HTML("<span style='color:gray'>⏳ Loading recommendations...</span>")

def on_type(data):
    """Search for the entered title and render matches and recommendations.

    Args:
        data: Object whose ``value`` attribute holds the entered text (the
            text widget when used as its callback).

    Titles of five characters or fewer (after stripping whitespace) are
    ignored. Any exception raised while searching or recommending is caught
    and its message is printed in the recommendation area.
    """
    title = data.value.strip()
    if len(title) <= 5:
        return

    # Show the loader right away, before the slow work starts.
    with recommendation_list:
        recommendation_list.clear_output()
        display(loader)

    try:
        matches = search(title)

        with movie_list:
            movie_list.clear_output()
            display(movie_list_header)
            display(matches)

        if matches.empty:
            with recommendation_list:
                recommendation_list.clear_output()
                print("No movies found with that title.")
            return

        # Recommendations are based on the first row of the search results.
        best_match_id = matches.iloc[0]["movieId"]
        recommendations = find_similar_movies(
            best_match_id,
            rating_threshold,
            pick_x_percentage,
            number_of_recommendations,
        )

        with recommendation_list:
            recommendation_list.clear_output()
            display(recommendation_header)
            display(recommendations)

    except Exception as e:
        # Report the failure inside the widget instead of leaving the loader up.
        with recommendation_list:
            recommendation_list.clear_output()
            print(f"An error occurred: {e}")

# Link input to callback.
# Text.on_submit is deprecated in ipywidgets 8; the documented replacement is to
# turn off continuous updates and observe the value. With continuous_update
# disabled the value only changes when the user commits the text (Enter or
# leaving the field), so the search is not run on every keystroke.
# The observer receives a change record; its "owner" is the text widget, which
# is what on_type expects.
# NOTE: unlike on_submit, this does not re-run when Enter is pressed again
# without changing the text.
movie_name_input.continuous_update = False
movie_name_input.observe(lambda change: on_type(change["owner"]), names="value")

# Display UI
display(movie_name_input, movie_list, recommendation_list)
