# Fine-Tune a Sentence-Transformer

In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install -U sentence-transformers

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting sentence-transformers
  Downloading sentence_transformers-2.4.0-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.4.0


In [None]:
import numpy as np
import pandas as pd
import os, json, gc, re, random
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import torch
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns


In [None]:
# --- Build a balanced, labelled subset of movie plots for fine-tuning ---------
# Outputs used by later cells: `movies_df` (columns Plot, Genre, genre_encoded),
# `label_encoder`, `num_labels`, and `shortlisted_genres`.

SEED = 42                   # fixes the shuffle so the per-genre sample is reproducible
MIN_PLOTS_PER_GENRE = 200   # a genre is kept only if it has MORE than this many plots
MAX_PLOTS_PER_GENRE = 400   # cap per genre, so large genres do not dominate training
GENRE_ALIASES = {"sci-fi": "science fiction", "romantic comedy": "romance"}

movies_df = (
    pd.read_csv('./Movies.csv')[["Plot", "Genre"]]
    # Rows without a plot or a genre cannot be turned into labelled examples.
    .dropna(subset=["Plot", "Genre"])
    # Normalise case/whitespace before mapping aliases; otherwise spellings such
    # as "Drama" and "drama" end up as two different classes. Using `assign`
    # (instead of `Series.replace(..., inplace=True)` on a column selection)
    # avoids chained in-place mutation, which is a no-op under copy-on-write.
    .assign(Genre=lambda df: df["Genre"].str.strip().str.lower().replace(GENRE_ALIASES))
    .query('Genre != "unknown"')
)

# Boolean-mask the counts directly: this does not depend on the column names
# produced by `value_counts().reset_index()`, which changed in pandas 2.0.
genre_counts = movies_df["Genre"].value_counts()
shortlisted_genres = genre_counts[genre_counts > MIN_PLOTS_PER_GENRE].index.tolist()

movies_df = (
    movies_df[movies_df["Genre"].isin(shortlisted_genres)]
    .sample(frac=1, random_state=SEED)   # shuffle so head() takes a random subset
    .groupby("Genre")
    .head(MAX_PLOTS_PER_GENRE)           # keeps the shuffled row order within each genre
    .reset_index(drop=True)
)

label_encoder = LabelEncoder()
movies_df["genre_encoded"] = label_encoder.fit_transform(movies_df["Genre"])
num_labels = len(label_encoder.classes_)

# Every shortlisted genre has rows in the frame, so the encoder must see them all.
assert num_labels == len(shortlisted_genres), "label count does not match shortlisted genres"
print(num_labels)

21


In [None]:
# Display the genre names known to the fitted encoder; the position of a name
# in this array is the integer stored for it in `genre_encoded`.
label_encoder.classes_

array(['Drama', 'action', 'comedy', 'drama', 'horror', 'romance',
       'science fiction', 'thriller', 'western'], dtype=object)

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Bucket the plot texts by encoded genre id, preserving the row order of
# `movies_df` inside each bucket and first-appearance order across buckets.
plots_by_label = {}
for plot_text, genre_id in zip(movies_df['Plot'], movies_df['genre_encoded']):
    plots_by_label.setdefault(genre_id, []).append(plot_text)

# Pair up neighbouring plots of the same genre: (0, 1), (2, 3), ...
# When a bucket holds an odd number of plots, the last one is left unpaired.
# Each pair is labelled with the genre id the two plots share.
train_examples = [
    InputExample(texts=[first_plot, second_plot], label=genre_id)
    for genre_id, genre_plots in plots_by_label.items()
    for first_plot, second_plot in zip(genre_plots[0::2], genre_plots[1::2])
]

# Base encoder to fine-tune.
model = SentenceTransformer('thenlper/gte-base')

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

# NOTE(review): SoftmaxLoss is applied to sentence pairs with a class label;
# here the label is the genre of both plots rather than a relation between
# them — confirm this is the intended training objective.
embedding_dim = model.get_sentence_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=embedding_dim,
    num_labels=num_labels,
)

# Fine-tune for three epochs, then write the model to a local directory.
training_objectives = [(train_dataloader, train_loss)]
model.fit(train_objectives=training_objectives, epochs=3, warmup_steps=100)
model.save('movie-genre-fine-tuned')

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/457 [00:00<?, ?it/s]

Iteration:   0%|          | 0/457 [00:00<?, ?it/s]

Iteration:   0%|          | 0/457 [00:00<?, ?it/s]