## **Import Libraries** 

In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader 
from sentence_transformers import SentenceTransformer, InputExample, losses
from sklearn.metrics.pairwise import cosine_similarity

## **Loading Movie and Tag Data**

In [4]:
# Read the movie metadata table and the user-supplied tag table from disk.
movies_df, tags_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/movies.csv", "../data/tags.csv")
)

# Preview the first rows of each table as a quick sanity check.
print(movies_df.head(), tags_df.head(), sep="\n")


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId          tag   timestamp
0      22    26479  Kevin Kline  1583038886
1      22    79592     misogyny  1581476297
2      22   247150   acrophobia  1622483469
3      34     2174        music  1249808064
4      34     2174        weird  1249808102


## **Data Preprocessing**

1. Remove NaN values from the `tag` column.
2. Convert the `tag` column to strings for easy concatenation.
3. Group tags by `movieId` and concatenate unique tags.
4. Merge movie metadata with tags to create a unified dataset.
5. Create a new `description` column combining genres and tags.

In [5]:
# Drop rows without a tag (missing values are read as float NaN) and coerce the
# remaining tags to str so they can be joined. Building the cleaned frame in a
# single chain avoids assigning into the result of dropna(), which can trigger
# pandas' SettingWithCopyWarning.
tags_df = (
    tags_df
    .dropna(subset=["tag"])
    .assign(tag=lambda frame: frame["tag"].astype(str))
)

# One row per movieId holding its distinct tags joined by spaces.
# Series.unique() keeps first-appearance order, so the joined text is identical
# on every run; joining a set() instead yields an order that depends on string
# hashing and therefore changes between kernel restarts.
tags_grouped = (
    tags_df
    .groupby("movieId")["tag"]
    .apply(lambda tags: " ".join(tags.unique()))
    .reset_index()
)


In [6]:
# Ensure movieId types match before merging
movies_df["movieId"] = movies_df["movieId"].astype(int)
tags_grouped["movieId"] = tags_grouped["movieId"].astype(int)

# Merge movies and tags.
# Dropping any 'tag'/'description' columns left by a previous run keeps this
# cell re-runnable: without it a second merge produces 'tag_x'/'tag_y' and the
# lookups of movies_df["tag"] below raise KeyError.
# validate="many_to_one" makes pandas raise if tags_grouped ever holds more than
# one row per movieId, which would silently duplicate movies.
movies_df = (
    movies_df
    .drop(columns=["tag", "description"], errors="ignore")
    .merge(tags_grouped, on="movieId", how="left", validate="many_to_one")
)

# Movies without any tags come out of the left join as NaN; use "" instead
movies_df["tag"] = movies_df["tag"].fillna("")

# Create movie descriptions by combining genres and tags
movies_df["description"] = movies_df["genres"] + " " + movies_df["tag"]


## **Generating Sentence Embeddings**

We use the `sentence-transformers` library to encode movie descriptions into vector embeddings.

**Model Used:** `'all-MiniLM-L6-v2'`  
**Normalization:** `normalize_embeddings=True`


In [None]:
# Fail fast when the preprocessing cell that builds 'description' has not run.
if "description" not in movies_df.columns:
    raise ValueError("The 'description' column is missing from movies_df.")

# One plain-Python string per movie, in movies_df row order.
movie_descriptions = movies_df["description"].astype(str).tolist()

# Load the pretrained sentence encoder and embed every description;
# normalize_embeddings=True asks encode() for unit-length vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(movie_descriptions, normalize_embeddings=True)

print("Embeddings generated successfully!")


Embeddings generated successfully!


## **Fine-Tuning Sentence Transformer Model**

We fine-tune the model using contrastive loss.
- **Training Data:** Hand-written sentence pairs about movies, each labeled as similar (`1`) or dissimilar (`0`).
- **Loss Function:** `ContrastiveLoss`
- **Epochs:** `1` (for quick testing)

In [None]:
# Toy supervision: (sentence_a, sentence_b, label) with 1 = similar, 0 = dissimilar.
train_data = [
    ("Toy Story is a great animation", "Finding Nemo is also amazing", 1),  # Similar
    ("Horror movies are scary", "Romantic comedies are funny", 0),  # Dissimilar
]

# Convert data into Sentence Transformers format
train_examples = [InputExample(texts=[a, b], label=float(score)) for a, b, score in train_data]

# Seed torch so batch shuffling and any stochastic layers behave the same on
# every Restart & Run All.
FINE_TUNE_SEED = 42
torch.manual_seed(FINE_TUNE_SEED)

# Create DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

loss_function = losses.ContrastiveLoss(model)

# Train the model
model.fit(train_objectives=[(train_dataloader, loss_function)], epochs=1)

# The embeddings computed earlier came from the model *before* fine-tuning.
# Re-encode the descriptions so the similarity cells below actually use the
# fine-tuned weights instead of stale vectors.
embeddings = model.encode(movie_descriptions, normalize_embeddings=True)

print("Fine-tuning complete!")


## **Computing Movie Similarity** 

We compute the **cosine similarity** between all movie embeddings.

- **Method Used:** `cosine_similarity(embeddings)`
- **Purpose:** To measure how similar two movies are based on their text descriptions.


In [None]:
# A dense N x N similarity matrix over every movie grows quadratically with the
# number of movies and was reported as taking too long to run here. Rows are
# therefore computed lazily, one query movie at a time.
class _RowwiseCosineSimilarity:
    """Stand-in for the dense cosine-similarity matrix that builds rows on demand.

    Indexing with an integer ``i`` returns the 1-D array of cosine similarities
    between vector ``i`` and every stored vector, i.e. the values row ``i`` of
    ``cosine_similarity(vectors)`` would hold, without ever materialising the
    other rows.
    """

    def __init__(self, vectors):
        self._vectors = vectors

    def __len__(self):
        return len(self._vectors)

    def __getitem__(self, row):
        # Resolve negative indices and reject out-of-range ones up front;
        # the slice below would otherwise silently yield an empty query.
        row = range(len(self._vectors))[row]
        # Slicing (rather than scalar indexing) keeps the query 2-D, which
        # cosine_similarity requires.
        return cosine_similarity(self._vectors[row:row + 1], self._vectors)[0]


similarity_matrix = _RowwiseCosineSimilarity(embeddings)

print("Cosine similarity lookup ready (rows are computed on demand).")


In [None]:
# Function to get top-N similar movies based on cosine similarity
def get_similar_movies(movie_index, top_n=5):
    """Return the titles of the ``top_n`` movies most similar to one movie.

    Args:
        movie_index: Positional row index of the query movie; it is used both
            to index ``similarity_matrix`` and to skip the query's own row.
            Negative indices count from the end.
        top_n: Maximum number of titles to return.

    Returns:
        A list of titles ordered from most to least similar. Movies with equal
        scores keep their row order. The query movie is never included, and
        the list is shorter than ``top_n`` when fewer other movies exist.
    """
    import heapq

    # Similarity of the query movie to every movie (including itself).
    scores = similarity_matrix[movie_index]
    total = len(scores)
    if total == 0:
        return []

    # Exclude the query by position rather than assuming it sorts first:
    # another movie with an identical description ties at the top score, and
    # dropping "the first entry" would then discard that movie and recommend
    # the query to itself.
    own_position = movie_index % total
    candidates = (idx for idx in range(total) if idx != own_position)

    # nlargest avoids sorting every score when only a handful are needed.
    best = heapq.nlargest(top_n, candidates, key=lambda idx: scores[idx])

    # Return movie titles of the most similar movies
    return movies_df["title"].iloc[best].tolist()

# Demo: list recommendations for the movie stored at row 0.
# Point movie_index at another row to query a different movie.
movie_index = 0
query_title = movies_df.iloc[movie_index]["title"]
print(f"Top 5 similar movies to '{query_title}':")
print(get_similar_movies(movie_index, top_n=5))