In [10]:
import pandas as pd
import torch
from torch.utils.data import DataLoader 
from sentence_transformers import SentenceTransformer, InputExample, losses

In [11]:
# Read the two input tables from the sibling data directory.
movies_df = pd.read_csv("../data/movies.csv")
tags_df = pd.read_csv("../data/tags.csv")

# Preview the first rows of each table (movies first, then tags) as a sanity check.
print(movies_df.head(), tags_df.head(), sep="\n")


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId          tag   timestamp
0      22    26479  Kevin Kline  1583038886
1      22    79592     misogyny  1581476297
2      22   247150   acrophobia  1622483469
3      34     2174        music  1249808064
4      34     2174        weird  1249808102


In [12]:
# Clean the tag column without mutating a possibly-shared frame in place:
#   1. drop rows whose tag is missing (NaN values are floats and cannot be joined),
#   2. cast the remaining tags to str so they can be concatenated.
tags_df = (
    tags_df
    .dropna(subset=["tag"])
    .assign(tag=lambda frame: frame["tag"].astype(str))
)

# Build one space-separated tag string per movieId.
# dict.fromkeys() removes duplicates while keeping first-seen order; iterating a
# set() of strings would yield a different order on every kernel start (string
# hash randomization), making the resulting text non-reproducible.
tags_grouped = (
    tags_df
    .groupby("movieId")["tag"]
    .apply(lambda tags: " ".join(dict.fromkeys(tags)))
    .reset_index()
)


In [13]:
# Ensure movieId types match before merging
movies_df["movieId"] = movies_df["movieId"].astype(int)
tags_grouped["movieId"] = tags_grouped["movieId"].astype(int)

# Attach the per-movie tag string (left join keeps movies that have no tags).
# Columns produced by a previous run of this cell are dropped first: without
# that, re-running the cell merges a second "tag" column, pandas renames the
# pair to tag_x / tag_y, and the movies_df["tag"] lookup below raises KeyError.
movies_df = (
    movies_df
    .drop(columns=["tag", "description"], errors="ignore")
    .merge(tags_grouped, on="movieId", how="left")
)

# Movies without any tag get an empty string instead of NaN
movies_df["tag"] = movies_df["tag"].fillna("")

# Create movie descriptions by combining genres and tags
movies_df["description"] = movies_df["genres"] + " " + movies_df["tag"]


In [14]:
# Fail fast if the description column has not been built yet.
if "description" not in movies_df.columns:
    raise ValueError("The 'description' column is missing from movies_df.")

# One plain-string description per movie, in DataFrame row order.
movie_descriptions = movies_df["description"].astype(str).tolist()

# Load the sentence encoder and embed every description, requesting
# normalized embeddings from encode().
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(
    movie_descriptions,
    normalize_embeddings=True,
)

print("Embeddings generated successfully!")


Embeddings generated successfully!


In [15]:
# Toy labelled sentence pairs in the form (first sentence, second sentence, label).
# Label 1 marks the pair as similar, label 0 as dissimilar.
similar_pair = ("Toy Story is a great animation", "Finding Nemo is also amazing", 1)
dissimilar_pair = ("Horror movies are scary", "Romantic comedies are funny", 0)

train_data = [similar_pair, dissimilar_pair]


In [16]:
# Wrap every (sentence, sentence, label) triple in an InputExample;
# the label is cast to float before being stored.
train_examples = [
    InputExample(texts=[first_text, second_text], label=float(pair_label))
    for first_text, second_text, pair_label in train_data
]

# Serve the examples in shuffled batches of up to 8.
train_dataloader = DataLoader(
    train_examples,
    batch_size=8,
    shuffle=True,
)


In [17]:
# Contrastive loss bound to the model that will be fine-tuned.
loss_function = losses.ContrastiveLoss(model=model)


In [18]:
# Fine-tune the model for a single epoch on the one (dataloader, loss) objective.
# NOTE(review): `embeddings` was computed before this step, so it presumably
# reflects the pre-fine-tuning weights — re-encode afterwards if the tuned
# vectors are what downstream code needs (confirm).
training_objectives = [(train_dataloader, loss_function)]
model.fit(
    train_objectives=training_objectives,
    epochs=1,
)

print("Fine-tuning complete!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Fine-tuning complete!
