In [None]:
!pip install faiss-gpu-cu12
!pip install -U langchain-community
!pip install langchain_openai
!pip install langchain_google_genai

In [None]:
import os
import pandas as pd
import psycopg2
import numpy as np
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, normalize
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import faiss
import joblib
from transformers import pipeline
from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import time

# Download NLTK data for tokenization.
# Fetches the 'punkt' and 'punkt_tab' packages; presumably these back the
# `word_tokenize` import above -- TODO confirm, it is not called in this part
# of the notebook.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# ---------------------------------------------------------------------------
# Per-feature weights.
# NOTE(review): none of these are referenced in the visible part of the
# notebook; presumably they scale each feature group when similarities are
# combined -- confirm against the code that consumes them.
# ---------------------------------------------------------------------------
GENRE_WEIGHT = 1.0
KEYWORD_WEIGHT = 2.0
VOTE_WEIGHT = 0.3
ADULT_WEIGHT = 0.2
SUMMARY_WEIGHT = 0.5
ATMOSPHERE_WEIGHT = 0.5
NARRATIVE_WEIGHT = 0.5
EMOTION_WEIGHT = 0.5
THEME_WEIGHT = 0.5
CHARACTER_WEIGHT = 0.5
PACING_WEIGHT = 0.5

# ---------------------------------------------------------------------------
# Artifact locations (pickles and model directory on the mounted Drive).
# ---------------------------------------------------------------------------
INITIAL_DATA_PATH = '/content/drive/My Drive/colab.pkl'
BEST_MODEL_PATH = '/content/drive/My Drive/movie_similarity/roberta_multilabel_improved'
MODEL_INFO_PATH = '/content/drive/My Drive/movie_similarity/roberta_multilabel_improved/model_info.pkl'
NON_CLASSIFIED_EMBEDDINGS_PATH = '/content/drive/My Drive/movie_similarity/non_classified_embeddings.pkl'
CLASSIFIED_EMBEDDINGS_PATH = '/content/drive/My Drive/movie_similarity/classified_embeddings.pkl'
TRAINING_DATA_PATH = '/content/drive/My Drive/movie_similarity/training_data.pkl'
TRAINING_DATA_CLEANED_PATH = '/content/drive/My Drive/movie_similarity/training_data_cleaned.pkl'

# ---------------------------------------------------------------------------
# Model identifiers: sentence encoder and the base checkpoint for fine-tuning.
# ---------------------------------------------------------------------------
SBERT_MODEL = 'all-MiniLM-L6-v2'
ZSC_MODEL = 'FacebookAI/roberta-base'

# ---------------------------------------------------------------------------
# Closed label vocabularies used for LLM labeling and for label cleaning.
# ---------------------------------------------------------------------------
atmosphere_labels = [
    'dark',
    'tense',
    'neutral',
    'melancholic',
    'cheerful',
    'mysterious',
    'foreboding',
    'dreamy',
    'chaotic',
    'nostalgic',
    'surreal',
]

narrative_labels = [
    'linear',
    'non-linear',
    'episodic',
    'circular',
]

theme_candidates = [
    'redemption',
    'betrayal',
    'love',
    'identity',
    'family',
    'loss',
    'justice',
    'freedom',
    'corruption',
    'hope',
    'loyalty',
    'power',
    'isolation',
    'transformation',
]

In [None]:
def create_non_classified_embeddings(df):
    """Embed genres, keywords and overview for every row and pickle the result.

    The input frame is NOT modified: all derived values are built as local
    series, so the calling cell can be re-run without reloading the data
    (the previous in-place version turned the comma-separated columns into
    lists, which made a second call fail on ``list.split``).

    Args:
        df: DataFrame with columns ``id``, ``genres``, ``keywords``,
            ``overview`` and ``vote_average``. ``genres``/``keywords`` are
            comma-separated strings; missing values (None/NaN) and values
            that are already lists are tolerated.

    Returns:
        None. A DataFrame with columns ``id``, ``overview_emb``,
        ``genres_emb``, ``keywords_emb`` and ``vote_average_scaled`` is
        printed (head only) and written to NON_CLASSIFIED_EMBEDDINGS_PATH.
    """
    def _split_csv(value):
        # Comma-separated string -> list of stripped items ('' -> []).
        if isinstance(value, str):
            return [item.strip() for item in value.split(',')] if value else []
        # Already split (e.g. the frame was pre-processed): pass through.
        if isinstance(value, (list, tuple)):
            return [str(item).strip() for item in value]
        # None, NaN or any other non-text value counts as "no entries".
        return []

    genres = df['genres'].apply(_split_csv)
    keywords = df['keywords'].apply(_split_csv)
    overviews = df['overview'].fillna('')

    # Non-numeric votes become NaN and are then replaced by the column mean.
    votes = pd.to_numeric(df['vote_average'], errors='coerce')
    votes = votes.fillna(votes.mean())

    scaler = MinMaxScaler()
    vote_average_scaled = scaler.fit_transform(votes.to_frame()).flatten()

    model = SentenceTransformer(SBERT_MODEL)

    genre_texts = [' '.join(items) for items in genres]
    keyword_texts = [' '.join(items) for items in keywords]

    # Encode everything in one call, then slice the result back apart.
    all_texts = genre_texts + keyword_texts + overviews.tolist()
    all_embeddings = model.encode(all_texts, show_progress_bar=True).astype('float32')

    n_genres = len(genre_texts)
    n_keywords = len(keyword_texts)
    genres_emb = all_embeddings[:n_genres]
    keywords_emb = all_embeddings[n_genres:n_genres + n_keywords]
    summary_embeddings = all_embeddings[n_genres + n_keywords:]

    final_df = pd.DataFrame({
        'id': df['id'],
        'overview_emb': list(summary_embeddings),
        'genres_emb': list(genres_emb),
        'keywords_emb': list(keywords_emb),
        'vote_average_scaled': vote_average_scaled
    })

    print(final_df.head())

    final_df.to_pickle(NON_CLASSIFIED_EMBEDDINGS_PATH)

In [None]:
# Load the source table from INITIAL_DATA_PATH.
# NOTE: pd.read_pickle can execute arbitrary code -- only load trusted files.
initial_df = pd.read_pickle(INITIAL_DATA_PATH)

# Quick preview of the first rows.
print(initial_df.head())

# Build the embeddings; the function pickles its result to
# NON_CLASSIFIED_EMBEDDINGS_PATH and returns nothing.
create_non_classified_embeddings(initial_df)

   id                title                                           overview  \
0   2                Ariel  A Finnish man goes to the city to find a job a...   
1   3  Shadows in Paradise  Nikander, a rubbish collector and would-be ent...   
2   5           Four Rooms  It's Ted the Bellhop's first night on the job....   
3   6       Judgment Night  Four young friends, while taking a shortcut en...   
4  11            Star Wars  Princess Leia is captured and held hostage by ...   

   vote_average                            genres  \
0         7.100        Comedy,Crime,Drama,Romance   
1         7.291              Comedy,Drama,Romance   
2         5.864                            Comedy   
3         6.455             Action,Crime,Thriller   
4         8.206  Action,Adventure,Science Fiction   

                                            keywords  \
0  factory worker,falling in love,helsinki, finla...   
1               garbage,helsinki, finland,salesclerk   
2  anthology,bet,hoodlum,h

Batches:   0%|          | 0/938 [00:00<?, ?it/s]

   id                                       overview_emb  \
0   2  [0.012737188, 0.013031937, -0.07787705, -0.039...   
1   3  [-0.120130956, -0.0150178075, -0.01258817, -0....   
2   5  [-0.073654644, -0.008337193, 0.022501769, 0.01...   
3   6  [0.093476705, 0.08604528, -0.049391024, -0.011...   
4  11  [-0.061168995, -0.039084088, 0.055229034, -0.0...   

                                          genres_emb  \
0  [-0.07958339, -0.06317735, -0.04624551, 0.0377...   
1  [-0.056528702, -0.07941368, -0.029316708, 0.07...   
2  [-0.042611323, -0.046201073, -0.03457101, 0.01...   
3  [-0.042185202, -0.00575423, -0.08346808, 0.032...   
4  [-0.024891889, -0.02195114, -0.0148923695, 0.0...   

                                        keywords_emb  vote_average_scaled  
0  [-0.067110725, 0.029397095, 0.0063949055, 0.04...             0.571505  
1  [-0.038277473, 0.065131254, -0.037371606, -0.0...             0.622275  
2  [0.04833135, -0.05277274, -0.05252021, 0.02396...             0.242956 

In [None]:
import sys
import os
import re
import time
import numpy as np
import datetime
from typing import List, Dict, Any, Tuple
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SequentialChain, SimpleSequentialChain
from langchain.memory import ConversationBufferWindowMemory, FileChatMessageHistory, ConversationSummaryBufferMemory
from langchain.agents import AgentType, initialize_agent, Tool
from langchain.agents import AgentExecutor
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS
from dateutil import parser
from dateutil.relativedelta import relativedelta
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Take the API key from the GOOGLE_API_KEY environment variable so that no
# credential has to be written into the notebook. The original placeholder is
# kept as a fallback so the cell still constructs the client when the variable
# is unset (calls will then fail until a real key is provided).
gemini = ChatGoogleGenerativeAI(
    google_api_key=os.environ.get("GOOGLE_API_KEY", "your-gemini-api-key"),
    model="gemini-2.0-flash", # would be better to use gemini 2.5 flash, this model sometimes generates labels that are not specified
    temperature=0.0,
)

def create_labeled_data(model, df):
    """Label every review in ``df`` with atmosphere/narrative/theme tags via an LLM.

    Reviews are sent to ``model`` in small batches through an ``LLMChain``.
    Each answer is parsed into three label lists that are stored in the
    ``atmosphere``, ``narrative_structure`` and ``themes`` columns of a copy
    of ``df``. The labeled frame is pickled to TRAINING_DATA_PATH *before*
    any summary is printed, so a failure while printing cannot discard the
    labeling results.

    Args:
        model: LangChain-compatible chat/LLM object passed to ``LLMChain``.
        df: DataFrame with at least a ``review`` column.

    Returns:
        A copy of ``df`` (index reset) with the three label columns added.
        Rows of batches that could not be labeled get empty lists. For an
        empty input an empty frame with a fixed column set is returned and
        nothing is saved.
    """
    system_prompt = PromptTemplate(
        input_variables=["query"],
        template=
        f"""
        You are a highly analytical and precise movie review labeler. Your job is to determine the movie's features based *only* on the provided user review. You will classify the movie features into specific categories. You are given very specific instructions for this and must not deviate from the categories or their definitions.

        The possible labels for atmosphere: {atmosphere_labels}. **YOU MUST CHOOSE ONLY FROM THIS LIST.**
        The possible labels for narrative_structure: {narrative_labels}. **YOU MUST CHOOSE ONLY FROM THIS LIST.**
        The possible labels for themes: {theme_candidates}. **YOU MUST CHOOSE ONLY FROM THIS LIST.**

         **Instructions for Themes:**
        - Only include a theme if it is strongly and unambiguously expressed or implied by the review.
        - Be particularly careful with 'hope' and 'redemption'. These should only be applied if there is a clear, sustained sense of optimism or a definitive moral turnaround, respectively. If a film introduces hope only to crush it, or shows no clear path to absolution, do NOT label it with 'hope' or 'redemption'.

        **CRITICAL RULE: ALL LABELS YOU PROVIDE MUST BE EXACTLY MATCHED FROM THE RESPECTIVE ALLOWED LISTS. DO NOT INVENT NEW LABELS. IF NO APPLICABLE LABEL FROM THE LISTS IS FOUND FOR A CATEGORY, PROVIDE AN EMPTY LIST [] FOR THAT CATEGORY.**

        The output format of your answer should be: ATMOSPHERE: (list of labels you determined) ; NARRATIVE_STRUCTURE: (list of labels you determined) ; THEMES: (list of labels you determined)

        Query: {{query}}
        Answer:"""
    )

    batch_size = 4
    max_retries = 3
    retry_delay = 5  # seconds; doubled after every failed attempt
    label_columns = ['atmosphere', 'narrative_structure', 'themes']

    chain = LLMChain(llm=model, prompt=system_prompt)

    # reset_index guarantees a 0..n-1 index so that `.at[i + j, ...]` below
    # addresses rows by position.
    processing_df = df.copy().reset_index(drop=True)

    if processing_df.empty:
        print("Warning: Input DataFrame is empty. Returning an empty DataFrame.")
        # NOTE(review): this schema uses 'movie_id' while the summary below
        # previews 'id' -- kept unchanged for compatibility; confirm intent.
        return pd.DataFrame(columns=['movie_id', 'review', 'atmosphere', 'narrative_structure', 'themes'])

    print(f"Processing {len(processing_df)} reviews in batches of {batch_size}...")
    print(processing_df.head())

    processing_df['review'] = processing_df['review'].fillna('').astype(str)

    for column in label_columns:
        processing_df[column] = None

    def _extract_label_list(field: str, text: str) -> List[str]:
        # Pull "FIELD: [a, 'b', ...]" out of the answer and return the items
        # without surrounding whitespace/quotes; missing field -> [].
        match = re.search(field + r':\s*\[(.*?)\]', text)
        if not match:
            return []
        items = (label.strip().strip("'\"") for label in match.group(1).split(','))
        return [item for item in items if item]

    def parse_labels_from_llm_output(text: str) -> Tuple[List[str], List[str], List[str]]:
        # Returns (atmosphere, narrative_structure, themes); any parsing
        # problem yields three empty lists instead of aborting the run.
        try:
            atmosphere = _extract_label_list('ATMOSPHERE', text)
            narrative_structure = _extract_label_list('NARRATIVE_STRUCTURE', text)
            themes = _extract_label_list('THEMES', text)
        except Exception as e:
            print(f"Error parsing labels from LLM output: {e}")
            print(f"Raw LLM output causing error: {str(text)[:200]}...")
            return [], [], []

        return atmosphere, narrative_structure, themes

    def _store_labels(row_index, atmosphere, narrative_structure, themes):
        # Write one row's three label lists into the working frame.
        processing_df.at[row_index, 'atmosphere'] = atmosphere
        processing_df.at[row_index, 'narrative_structure'] = narrative_structure
        processing_df.at[row_index, 'themes'] = themes

    total_reviews = len(processing_df)
    total_batches = (total_reviews + batch_size - 1) // batch_size
    failed_rows = 0

    for i in range(0, total_reviews, batch_size):
        batch_start_time = time.time()
        batch_number = i // batch_size + 1
        batch_end_index = min(i + batch_size, total_reviews)
        current_batch_df = processing_df.iloc[i:batch_end_index]

        inputs_for_current_batch = [{"query": review_text} for review_text in current_batch_df['review'].tolist()]

        print(f"\nProcessing batch {batch_number}/{total_batches} (Reviews {i}-{batch_end_index-1})")

        batch_labeled_results = []
        for attempt in range(1, max_retries + 1):
            try:
                batch_labeled_results = chain.batch(inputs_for_current_batch)
                break
            except Exception as e:
                print(f"Error during LLM batch call (Batch {batch_number}): {e}")
                if attempt == max_retries:
                    # No further attempt follows, so do not wait again.
                    break
                current_delay = retry_delay * (2 ** (attempt - 1))
                print(f"Retrying in {current_delay} seconds... (Attempt {attempt}/{max_retries})")
                time.sleep(current_delay)

        # A result list of the wrong length cannot be aligned with the rows
        # (a longer one would make `.at` append phantom rows), so it is
        # treated like a failed batch.
        if len(batch_labeled_results) != len(current_batch_df):
            print(f"Failed to get results for batch {batch_number} after {max_retries} retries. Skipping this batch.")
            for j in range(len(current_batch_df)):
                _store_labels(i + j, [], [], [])
            failed_rows += len(current_batch_df)
            continue

        for j, result in enumerate(batch_labeled_results):
            if isinstance(result, dict):
                result_text = result.get('text', str(result))
            else:
                result_text = str(result)

            _store_labels(i + j, *parse_labels_from_llm_output(result_text))

        batch_elapsed_time = time.time() - batch_start_time
        print(f"Batch {batch_number} completed in {batch_elapsed_time:.2f} seconds.")
        # Keep at least one second between batch starts (simple rate limit).
        if batch_elapsed_time < 1.0:
            time.sleep(1.0 - batch_elapsed_time)

    # Persist first: everything below is reporting only and must not be able
    # to lose the labeling work.
    processing_df.to_pickle(TRAINING_DATA_PATH)

    print(f"\nLabeling complete. Successfully processed {len(processing_df)} reviews")
    if failed_rows:
        print(f"{failed_rows} reviews belong to failed batches and have empty labels")

    preview_columns = [
        column
        for column in ['id', 'review', 'atmosphere', 'narrative_structure', 'themes']
        if column in processing_df.columns
    ]
    print(processing_df[preview_columns].head())

    try:
        print(processing_df[label_columns].apply(lambda col: col.explode().value_counts()).T.fillna(0).astype(int))
    except Exception as e:
        print(f"Error displaying label counts: {e}")
        print("Label assignment completed successfully")

    return processing_df

In [None]:
# Reload the source table from INITIAL_DATA_PATH.
# NOTE: pd.read_pickle can execute arbitrary code -- only load trusted files.
initial_df = pd.read_pickle(INITIAL_DATA_PATH)

# Quick preview of the first rows.
print(initial_df.head())

# Label every review with the Gemini model. create_labeled_data reads the
# 'review' column and pickles the labeled frame to TRAINING_DATA_PATH.
# Assumes the loaded frame actually has a 'review' column -- TODO confirm.
create_labeled_data(gemini, initial_df)

In [None]:
# Labeled reviews as written to TRAINING_DATA_PATH by create_labeled_data.
training_df = pd.read_pickle(TRAINING_DATA_PATH)

# Allowed vocabulary per label column, stored as sets for membership tests.
expected_labels = dict(
    atmosphere=set(atmosphere_labels),
    narrative_structure=set(narrative_labels),
    themes=set(theme_candidates),
)

def clean_labels(value, valid_labels):
    """Keep only the labels of ``value`` that belong to ``valid_labels``.

    Args:
        value: A comma-separated string, a list/tuple/ndarray of labels,
            a single scalar label, or a missing value (None/NaN).
        valid_labels: Container of allowed label strings.

    Returns:
        The surviving labels joined with ``', '`` (input order, duplicates
        kept), or ``None`` when the value is missing or nothing survives.
    """
    if value is None:
        return None

    if isinstance(value, str):
        candidates = value.split(',')
    elif isinstance(value, (list, tuple, np.ndarray)):
        # Sequences are handled before any NaN test: pd.isna() on a sequence
        # returns an array, and using that in `if` is ambiguous (warning or
        # error depending on the NumPy version, notably for empty lists).
        candidates = [str(item) for item in value]
    else:
        # Scalar: treat NaN/NA/NaT as "no labels".
        try:
            if pd.isna(value):
                return None
        except (TypeError, ValueError):
            pass
        candidates = [str(value)]

    stripped = (candidate.strip() for candidate in candidates)
    kept = [label for label in stripped if label and label in valid_labels]

    return ', '.join(kept) if kept else None

# Work on a copy so training_df keeps the raw labels and this cell can be re-run.
clean_df = training_df.copy()

label_columns = ['atmosphere', 'narrative_structure', 'themes']
existing_label_cols = [col for col in label_columns if col in clean_df.columns]

# Replace each label cell by a ', '-joined string of valid labels (or None).
for col in existing_label_cols:
    print(f"Cleaning {col}...")
    before_count = clean_df[col].notna().sum()

    clean_df[col] = clean_df[col].apply(lambda x: clean_labels(x, expected_labels[col]))

    after_count = clean_df[col].notna().sum()
    print(f"Before: {before_count} non-null entries")
    print(f"After: {after_count} non-null entries")
    print(f"Removed: {before_count - after_count} entries")

# Drop rows that have no valid label left in any of the label columns.
if existing_label_cols:
    before_rows = len(clean_df)
    clean_df = clean_df.dropna(subset=existing_label_cols, how='all')
    after_rows = len(clean_df)
    print(f"\nRemoved {before_rows - after_rows} rows with no valid labels")

print(f"\nFinal DataFrame shape: {clean_df.shape}")
# Compare against the frame this cell actually cleaned (training_df), not
# initial_df, which is defined in a different cell and is a different table.
print(f"Original DataFrame shape: {training_df.shape}")

clean_df.to_pickle(TRAINING_DATA_CLEANED_PATH)
print("\nCleaned DataFrame saved.")

# Report the distinct labels that remain per column.
print("\nFinal unique labels:")
for col in existing_label_cols:
    all_labels = set()
    for value in clean_df[col].dropna():
        if isinstance(value, str):
            labels = [label.strip() for label in value.split(',') if label.strip()]
            all_labels.update(labels)
    print(f"{col}: {sorted(list(all_labels))}")

Cleaning atmosphere...
  Before: 7997 non-null entries
  After: 7725 non-null entries
  Removed: 272 entries
Cleaning narrative_structure...


  if pd.isna(value):


  Before: 7997 non-null entries
  After: 7638 non-null entries
  Removed: 359 entries
Cleaning themes...
  Before: 7997 non-null entries
  After: 7539 non-null entries
  Removed: 458 entries

Removed 2099 rows with no valid labels

Final DataFrame shape: (7898, 10)
Original DataFrame shape: (9997, 10)

Cleaned DataFrame saved as 'training_cleaned.pkl'

Final unique labels:
atmosphere: ['chaotic', 'cheerful', 'dark', 'dreamy', 'foreboding', 'melancholic', 'mysterious', 'neutral', 'nostalgic', 'surreal', 'tense']
narrative_structure: ['circular', 'episodic', 'linear', 'non-linear']
themes: ['betrayal', 'corruption', 'family', 'freedom', 'hope', 'identity', 'isolation', 'justice', 'loss', 'love', 'loyalty', 'power', 'redemption', 'transformation']


In [None]:
# Audit: count entries in clean_df that still contain a label outside the
# allowed vocabulary (expected to be zero after cleaning).
total_mislabels = 0

for col in ['atmosphere', 'narrative_structure', 'themes']:
    if col in clean_df.columns:
        print(f"\n{col.upper()}")
        mislabel_count = 0
        mislabel_examples = []

        for idx, value in clean_df[col].dropna().items():
            # Accept the same cell formats as the cleaning step.
            if isinstance(value, str):
                labels = [label.strip() for label in value.split(',') if label.strip()]
            elif isinstance(value, (list, np.ndarray)):
                labels = [str(item).strip() for item in value if str(item).strip()]
            else:
                labels = [str(value).strip()]

            bad_labels = [label for label in labels if label not in expected_labels[col]]
            if bad_labels:
                mislabel_count += 1
                mislabel_examples.append((idx, bad_labels, labels))

        print(f"Bad entries: {mislabel_count}")
        print(f"Bad labels found: {set([label for _, bad_labels, _ in mislabel_examples for label in bad_labels])}")

        # Show some examples
        if mislabel_examples:
            print("Examples of bad entries:")
            for idx, bad_labels, all_labels in mislabel_examples[:3]:
                print(f"Row {idx}: {all_labels} (bad: {bad_labels})")

        # Accumulate this column's count (previously the total was added to
        # itself and therefore always stayed 0).
        total_mislabels += mislabel_count

print(f"\nTotal bad entries across all columns: {total_mislabels}")

In [None]:
# Spot check: independent copy of every cleaned row whose 'id' equals 13.
has_id_13 = clean_df['id'] == 13
test_df = clean_df.loc[has_id_13].copy()

print(test_df.head())

In [None]:
# Widen pandas' display limits so the sampled reviews print in full.
for option_name, option_value in (
    ('display.max_rows', None),        # Show all rows (for the 100 samples)
    ('display.max_columns', None),     # Show all columns
    ('display.max_colwidth', None),    # Show full content of each cell (important for reviews)
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Fixed seed so the same 100 rows are drawn on every run.
random_samples_df = clean_df.sample(n=100, random_state=42)

preview_columns = ['id', 'review', 'atmosphere', 'narrative_structure', 'themes']
print(random_samples_df[preview_columns])

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from sklearn.metrics import classification_report, hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import joblib
import warnings
import os
from collections import Counter

# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

# Use GPU if available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


class WeightedBCELoss(nn.Module):
    """Binary cross-entropy on logits with a per-label positive weight.

    ``pos_weights`` is stored as given and forwarded as ``pos_weight`` to
    ``F.binary_cross_entropy_with_logits`` on every call.
    """

    def __init__(self, pos_weights):
        super().__init__()
        self.pos_weights = pos_weights

    def forward(self, inputs, targets):
        """Return the weighted BCE-with-logits loss of ``inputs`` vs ``targets``."""
        loss = F.binary_cross_entropy_with_logits(
            inputs,
            targets,
            pos_weight=self.pos_weights,
        )
        return loss

class ReviewDataset(Dataset):
    """Torch dataset pairing review texts with multi-label target vectors.

    Each item is tokenized on access, truncated/padded to ``max_length``.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=256):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for row ``idx``."""
        text = str(self.reviews[idx])
        target = self.labels[idx]

        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer output is flattened to 1-D, as in a (1, L) -> (L,) squeeze.
        item = {}
        for key in ('input_ids', 'attention_mask'):
            item[key] = tokenized[key].reshape(-1)
        item['labels'] = torch.FloatTensor(target)
        return item

class CustomTrainer(Trainer):
    """``Trainer`` variant that can replace the model's loss with ``loss_fn``.

    ``loss_fn`` is called as ``loss_fn(logits, labels)``; when it is ``None``
    the loss reported by the model output is used.
    """

    def __init__(self, loss_fn=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the loss (optionally with outputs)."""
        targets = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if self.loss_fn is None:
            loss = outputs.loss
        else:
            loss = self.loss_fn(logits, targets)

        if return_outputs:
            return (loss, outputs)
        return loss

def load_and_merge_data(path1, path2):
    """Load one or two pickled DataFrames and return them merged and de-duplicated.

    Args:
        path1: Pickle path of the first (required) dataset.
        path2: Pickle path of the second dataset; if the file does not
            exist, only the first dataset is used.

    Returns:
        The combined DataFrame. When the two frames have different column
        sets, only the shared columns are kept, in the first frame's column
        order (deterministic, unlike a set-derived order). Rows with a
        duplicate ``review`` value are dropped, keeping the first.

    Raises:
        Any exception raised while loading or merging is printed and
        re-raised unchanged.

    Note:
        ``pd.read_pickle`` can execute arbitrary code; only load trusted files.
    """
    try:
        df1 = pd.read_pickle(path1)
        print(f"Dataset 1 shape: {df1.shape}")

        try:
            df2 = pd.read_pickle(path2)
        except FileNotFoundError:
            print("Second dataset not found, using only the first dataset")
            combined_df = df1
        else:
            print(f"Dataset 2 shape: {df2.shape}")
            if set(df1.columns) != set(df2.columns):
                df2_columns = set(df2.columns)
                # Preserve df1's column order for the shared columns.
                common_cols = [col for col in df1.columns if col in df2_columns]
                df1 = df1[common_cols]
                df2 = df2[common_cols]
            combined_df = pd.concat([df1, df2], ignore_index=True)

        if 'review' in combined_df.columns:
            initial_len = len(combined_df)
            combined_df = combined_df.drop_duplicates(subset=['review'], keep='first')
            final_len = len(combined_df)
            if initial_len != final_len:
                print(f"Removed {initial_len - final_len} duplicate reviews")

        print(f"Combined dataset shape: {combined_df.shape}")
        return combined_df

    except Exception as e:
        print(f"Error loading data: {e}")
        raise

def prepare_labels(df, min_label_count=20):
    """Turn the three label columns of ``df`` into a binary label matrix.

    Labels are lower-cased and stripped; labels seen fewer than
    ``min_label_count`` times overall are dropped before binarizing.

    Returns:
        ``(label_matrix, mlb, label_counter)``: the matrix produced by a
        freshly fitted ``MultiLabelBinarizer``, the binarizer itself, and a
        ``Counter`` of all labels before frequency filtering.
    """
    label_columns = ['atmosphere', 'narrative_structure', 'themes']

    def _cell_to_labels(cell):
        # Missing cell (None or float NaN) contributes nothing.
        if cell is None or (isinstance(cell, float) and np.isnan(cell)):
            return []
        if isinstance(cell, str):
            return [part.strip().lower() for part in cell.split(',') if part.strip()]
        if isinstance(cell, (list, np.ndarray)):
            return [str(entry).strip().lower() for entry in cell if str(entry).strip()]
        text = str(cell).strip().lower()
        return [] if text == 'nan' else [text]

    per_row_labels = []
    label_counter = Counter()

    for _, row in df.iterrows():
        collected = []
        for column in label_columns:
            if column not in df.columns:
                continue
            collected.extend(_cell_to_labels(row[column]))
        label_counter.update(collected)
        per_row_labels.append(collected)

    frequent_labels = set()
    for label, occurrences in label_counter.items():
        if occurrences >= min_label_count:
            frequent_labels.add(label)

    filtered_labels = []
    for collected in per_row_labels:
        filtered_labels.append([label for label in collected if label in frequent_labels])

    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(filtered_labels)

    return label_matrix, mlb, label_counter

def compute_class_weights(y_train):
    """Return per-label positive weights (negatives / positives, clipped to [1, 10]).

    The result is a float tensor moved to the module-level ``device``.
    """
    positives = np.sum(y_train, axis=0)
    negatives = len(y_train) - positives
    # The small epsilon avoids division by zero for labels with no positives.
    ratio = negatives / (positives + 1e-8)
    clipped_ratio = np.clip(ratio, 1.0, 10.0)
    weights = torch.FloatTensor(clipped_ratio)
    return weights.to(device)

def compute_metrics_for_trainer(eval_pred):
    """Compute macro F1 from ``(logits, labels)`` using a fixed 0.5 probability cut-off."""
    logits, labels = eval_pred
    probabilities = torch.sigmoid(torch.tensor(logits))
    predicted = (probabilities > 0.5).numpy().astype(int)
    expected = labels.astype(int)
    return {
        'f1_macro': f1_score(expected, predicted, average='macro', zero_division=0)
    }

def get_validation_predictions(model, val_dataset):
    """Run ``model`` over ``val_dataset`` and return ``(probabilities, true_labels)``.

    Batches of 32 are read in dataset order; logits are passed through a
    sigmoid. Both results are returned as NumPy arrays.
    """
    print("\nGetting predictions from the best model for the validation set...")
    model.eval()
    model.to(device)

    loader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=32,
    )
    logit_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in loader:
            batch_labels = batch['labels'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            logit_chunks.append(outputs.logits.cpu())
            label_chunks.append(batch_labels.cpu())

    stacked_logits = torch.cat(logit_chunks, dim=0)
    stacked_labels = torch.cat(label_chunks, dim=0)

    return torch.sigmoid(stacked_logits).numpy(), stacked_labels.numpy()

def tune_thresholds(probabilities, true_labels, label_names):
    """Pick, per label, the probability threshold with the highest binary F1.

    Thresholds from 0.10 up to (but excluding) 0.90 in steps of 0.01 are
    tried; the first threshold reaching the best F1 wins, and 0.5 is used
    when no threshold achieves an F1 above zero.

    Returns:
        Dict mapping each label name to its chosen threshold.
    """
    print("\nFinding optimal thresholds for each label...")
    candidate_thresholds = np.arange(0.1, 0.9, 0.01)
    best_thresholds = {}

    for i in range(probabilities.shape[1]):
        label_name = label_names[i]
        column_probs = probabilities[:, i]
        column_truth = true_labels[:, i]

        scores = [
            f1_score(
                column_truth,
                (column_probs > candidate).astype(int),
                average='binary',
                zero_division=0,
            )
            for candidate in candidate_thresholds
        ]

        # argmax returns the first maximum, matching a strict ">" scan.
        top = int(np.argmax(scores))
        if scores[top] > 0:
            best_thresh = candidate_thresholds[top]
        else:
            best_thresh = 0.5

        best_thresholds[label_name] = best_thresh
        if i % 5 == 0:  # Print progress
             print(f"Processed label {i+1}/{len(label_names)}: '{label_name}' -> best thresh={best_thresh:.2f}")
    return best_thresholds

def evaluate_with_optimal_thresholds(probabilities, true_labels, optimal_thresholds_dict, label_names):
    """Print a classification report using per-label thresholds and return macro F1.

    Returns:
        ``{'f1_macro_optimal': <macro F1 of the thresholded predictions>}``.
    """
    print("\nFinal Evaluation Report (with Optimal Thresholds)")

    # One threshold per column, ordered like ``label_names``.
    per_label_cutoffs = np.array([optimal_thresholds_dict[name] for name in label_names])
    predicted = (probabilities > per_label_cutoffs).astype(int)

    print(classification_report(true_labels, predicted, target_names=label_names, zero_division=0))

    macro_f1 = f1_score(true_labels, predicted, average='macro', zero_division=0)
    return {'f1_macro_optimal': macro_f1}

def run_pipeline():
    """Fine-tune the multi-label classifier, tune thresholds and save all artifacts.

    Steps:
        1. Load the cleaned training data, binarize labels, split 80/20.
        2. Build tokenizer, model, datasets and a trainer with a
           positively-weighted BCE loss.
        3. Train (early stopping on macro F1) and evaluate at threshold 0.5.
        4. Reload the best checkpoint, tune per-label thresholds on the
           validation set, save model/tokenizer/``model_info.pkl`` to
           BEST_MODEL_PATH and print a before/after F1 summary.

    Note:
        Thresholds are tuned and evaluated on the same validation split, so
        the "optimal" F1 printed at the end is an optimistic estimate.

    Raises:
        Any exception raised during training is printed and re-raised.
    """
    # Part 1: Data Preparation
    print("Part 1: Loading and Preparing Data")
    processing_df = pd.read_pickle(TRAINING_DATA_CLEANED_PATH) # If done in 2 parts with 2 training data files, use load and merge
    label_matrix, mlb, label_counter = prepare_labels(processing_df, min_label_count=20)

    X = processing_df['review'].values
    y = label_matrix
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

    pos_weights = compute_class_weights(y_train)

    # Part 2: Model and Trainer Setup
    print("\nPart 2: Setting up Model and Trainer")
    model_name = ZSC_MODEL
    # Use the shared constant (same value as the former literal) so training
    # writes to exactly the directory the other cells read the model from.
    output_dir = BEST_MODEL_PATH

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(mlb.classes_),
        problem_type="multi_label_classification"
    )

    train_dataset = ReviewDataset(X_train, y_train, tokenizer)
    val_dataset = ReviewDataset(X_val, y_val, tokenizer)

    loss_fn = WeightedBCELoss(pos_weights)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=6,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=4,
        warmup_ratio=0.2,
        weight_decay=0.01,
        logging_steps=150,
        eval_strategy="steps",
        eval_steps=150,
        save_strategy="steps",
        save_steps=150,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        save_total_limit=2,
        report_to="none",
        learning_rate=1e-5,
        fp16=torch.cuda.is_available(),
    )

    trainer = CustomTrainer(
        loss_fn=loss_fn,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_for_trainer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Part 3: Training
    print("\nPart 3: Starting Model Training")
    try:
        trainer.train()
        print("Training completed successfully!")
        original_final_metrics = trainer.evaluate() # Get the final metrics with the 0.5 threshold
    except Exception as e:
        print(f"An error occurred during training: {e}")
        raise

    # Part 4: Post-Training Analysis
    print("\nPart 4: Post-Training Analysis")

    best_checkpoint_path = trainer.state.best_model_checkpoint
    if not best_checkpoint_path:
        print("Could not determine the best model checkpoint. Using the last model state.")
        best_model = trainer.model
    else:
        print(f"Loading best model from checkpoint: {best_checkpoint_path}")
        best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint_path)

    probabilities, true_labels = get_validation_predictions(best_model, val_dataset)

    # Find the optimal threshold for each label
    optimal_thresholds = tune_thresholds(probabilities, true_labels, mlb.classes_)

    # Save everything needed for future predictions
    print("\nSaving model, tokenizer, and all analysis artifacts...")
    trainer.save_model(output_dir) # Saves best model and tokenizer
    tokenizer.save_pretrained(output_dir)

    model_info = {
        'label_binarizer': mlb,
        'label_counter': label_counter,
        'optimal_thresholds': optimal_thresholds,
        'model_name': model_name
    }
    joblib.dump(model_info, os.path.join(output_dir, 'model_info.pkl'))
    print(f"All artifacts saved to: {output_dir}")

    new_final_metrics = evaluate_with_optimal_thresholds(probabilities, true_labels, optimal_thresholds, mlb.classes_)

    print("\n\n================= FINAL PERFORMANCE SUMMARY =================")
    print(f"Original F1 Macro: {original_final_metrics['eval_f1_macro']:.4f}")
    print(f"Optimal F1 Macro (tuned thresholds):  {new_final_metrics['f1_macro_optimal']:.4f}")
    print("===========================================================")

def predict_with_optimal_thresholds(review_text, model_path):
    """Predict the labels of one review using the saved per-label thresholds.

    Args:
        review_text: Text of the review to classify.
        model_path: Directory holding the saved model, tokenizer and
            ``model_info.pkl``.

    Returns:
        The labels whose probability exceeds their tuned threshold (first
        entry of the binarizer's inverse transform), or ``[]`` when the
        inverse transform yields nothing.

    Raises:
        FileNotFoundError: if ``model_path`` does not exist.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model path not found: {model_path}")

    # Load all artifacts
    info_file = os.path.join(model_path, 'model_info.pkl')
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model_info = joblib.load(info_file)

    mlb = model_info['label_binarizer']
    threshold_by_label = model_info['optimal_thresholds']

    # Ensure thresholds are in the correct order
    ordered_thresholds = [threshold_by_label[label] for label in mlb.classes_]
    thresholds_tensor = torch.tensor(ordered_thresholds)

    classifier.eval()
    classifier.to(device)
    thresholds_tensor = thresholds_tensor.to(device)

    # Tokenize and predict
    encoded = tokenizer(
        review_text,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt',
    )

    with torch.no_grad():
        outputs = classifier(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        probabilities = torch.sigmoid(outputs.logits)
        above_threshold = probabilities > thresholds_tensor
        predictions = above_threshold.cpu().numpy().astype(int)

    predicted_labels = mlb.inverse_transform(predictions)

    if predicted_labels:
        return predicted_labels[0]
    return []

def _parse_label_cell(value):
    """Normalize one raw label cell into a list of lower-cased, stripped labels."""
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return []
    if isinstance(value, str):
        # Comma-separated string, e.g. "dark, tense".
        return [part.strip().lower() for part in value.split(',') if part.strip()]
    if isinstance(value, (list, np.ndarray)):
        return [str(item).strip().lower() for item in value if str(item).strip()]
    text = str(value).strip().lower()
    return [text] if text != 'nan' else []


def tune_thresholds_and_compare():
    """Re-tune per-label thresholds for the saved model on a recreated validation split.

    Loads the model/tokenizer from BEST_MODEL_PATH and the label binarizer from
    MODEL_INFO_PATH, rebuilds the label matrix from TRAINING_DATA_CLEANED_PATH,
    splits it with ``test_size=0.2, random_state=42``, then runs
    ``get_validation_predictions`` -> ``tune_thresholds`` ->
    ``evaluate_with_optimal_thresholds`` and prints the tuned macro F1.

    Returns:
        tuple: ``(optimal_thresholds, final_metrics)`` exactly as produced by
        ``tune_thresholds`` and ``evaluate_with_optimal_thresholds``. Returning
        them lets the caller keep the thresholds in the session (e.g. to store
        them in model_info.pkl) instead of losing them when the function exits.

    Raises:
        FileNotFoundError: If BEST_MODEL_PATH does not exist.
    """
    # Step 1: Load all necessary artifacts
    print("Step 1: Loading model and data artifacts")
    if not os.path.exists(BEST_MODEL_PATH):
        raise FileNotFoundError(f"CRITICAL: The specified best model path does not exist: {BEST_MODEL_PATH}")

    model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_PATH)

    model_info = joblib.load(MODEL_INFO_PATH)
    mlb = model_info['label_binarizer']

    # Step 2: Recreate the validation set
    print("\nStep 2: Recreating the validation dataset")

    df_full = pd.read_pickle(TRAINING_DATA_CLEANED_PATH)

    # Which label columns exist is the same for every row, so check it once.
    label_columns = ['atmosphere', 'narrative_structure', 'themes']
    present_columns = [col for col in label_columns if col in df_full.columns]

    all_labels = [
        [label for col in present_columns for label in _parse_label_cell(row[col])]
        for _, row in df_full.iterrows()
    ]

    # Use the loaded binarizer so the column order matches the trained model.
    y = mlb.transform(all_labels)
    X = df_full['review'].values

    # NOTE(review): this reproduces the training-time validation set only if
    # training used the same data, test_size=0.2 and random_state=42 — confirm.
    _, X_val, _, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    val_dataset = ReviewDataset(X_val, y_val, tokenizer)
    print(f"Successfully recreated validation set with {len(val_dataset)} samples.")

    # Step 3: Run the analysis pipeline
    print("\nStep 3: Running the analysis pipeline")
    probabilities, true_labels = get_validation_predictions(model, val_dataset)
    optimal_thresholds = tune_thresholds(probabilities, true_labels, mlb.classes_)
    final_metrics = evaluate_with_optimal_thresholds(probabilities, true_labels, optimal_thresholds, mlb.classes_)

    # Step 4: Display final results
    print("\n\n================= FINAL PERFORMANCE SUMMARY =================")
    print(f"Optimal F1 Macro (tuned thresholds):                  {final_metrics['f1_macro_optimal']:.4f}")
    print("===========================================================")

    return optimal_thresholds, final_metrics

Using device: cuda


In [None]:
# Run the full pipeline so the model is saved together with its model info.
# NOTE(review): run_pipeline is defined elsewhere in this notebook and is
# presumably long-running (training + evaluation) — confirm before re-running.

run_pipeline()

In [None]:
import joblib

# One-off repair cell: use it when the pipeline was run WITHOUT saving
# 'optimal_thresholds' into model_info.pkl.
#
# Precondition: an 'optimal_thresholds' dictionary must already exist in this
# session. If it does not, re-run the threshold-tuning analysis first.

# Fail fast, before touching the file, with a message that says how to recover.
if 'optimal_thresholds' not in globals():
    raise NameError(
        "'optimal_thresholds' is not defined in this session. "
        "Re-run the threshold-tuning analysis to compute it, then run this cell again."
    )

print(f"Loading existing model info from: {MODEL_INFO_PATH}")
model_info = joblib.load(MODEL_INFO_PATH)

print("Adding optimal thresholds to model_info...")
model_info['optimal_thresholds'] = optimal_thresholds # Add the thresholds dictionary

# Overwrite the old file with the updated dictionary
joblib.dump(model_info, MODEL_INFO_PATH)

print("\nSUCCESS: Your optimal thresholds have been permanently saved!")
print("The file 'model_info.pkl' now contains your label binarizer AND your thresholds.")

In [None]:
# Smoke test: classify a single hand-written review with the saved model.
my_review = "The movie was a bit strange and surreal. It didn't have much of a plot, but the visuals were stunning and it felt like a dream."

# Predict labels using the per-label thresholds stored in model_info.pkl
# (the model, tokenizer and thresholds are all loaded from BEST_MODEL_PATH).
final_labels = predict_with_optimal_thresholds(review_text=my_review, model_path=BEST_MODEL_PATH)

print(f"Review: '{my_review}'")
print(f"Predicted Labels: {final_labels}")

Review: 'The movie was a bit strange and surreal. It didn't have much of a plot, but the visuals were stunning and it felt like a dream.'
Predicted Labels: ('chaotic', 'circular', 'dreamy', 'episodic', 'family', 'identity', 'isolation', 'loss', 'mysterious', 'non-linear', 'surreal', 'transformation')


In [None]:
import torch
import joblib
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler # Good for batching
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Alias for the saved fine-tuned model directory; used by the test helper below.
FINAL_MODEL_DIR = BEST_MODEL_PATH

# Run inference on the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def classify_reviews_batch_custom(reviews, model_path, batch_size = 32):
    """Classify many reviews with the fine-tuned model, one mini-batch at a time.

    Args:
        reviews: Sliceable sequence of review texts (sliced as ``reviews[a:b]``
            and passed straight to the tokenizer).
        model_path: Directory holding the saved model, the tokenizer files and
            'model_info.pkl' (with 'label_binarizer' and 'optimal_thresholds').
        batch_size: Number of reviews tokenized and scored per forward pass.

    Returns:
        list: One entry per review, as produced by
        ``mlb.inverse_transform`` on the thresholded predictions.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    print("Using Custom Fine-Tuned Model for Batch Classification")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model directory not found: {model_path}")

    # Artifacts are loaded a single time and reused for every batch.
    print("Loading model and artifacts...")
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    review_tokenizer = AutoTokenizer.from_pretrained(model_path)
    artifacts = joblib.load(os.path.join(model_path, 'model_info.pkl'))
    binarizer = artifacts['label_binarizer']
    per_label_threshold = artifacts['optimal_thresholds']

    classifier.eval()
    classifier.to(device)
    # Cut-offs follow the binarizer's class order so they line up with the logits.
    cutoffs = torch.tensor(
        [per_label_threshold[name] for name in binarizer.classes_]
    ).to(device)

    decoded_labels = []
    total = len(reviews)
    print(f"Starting classification of {total} reviews with batch size {batch_size}...")

    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        chunk = reviews[start:start + batch_size]
        print(f"  Processing batch {batch_number}...")

        with torch.no_grad():
            # padding=True pads only up to the longest review in this chunk.
            tokens = review_tokenizer(
                chunk,
                truncation=True,
                padding=True,
                max_length=256,
                return_tensors='pt',
            )

            logits = classifier(
                input_ids=tokens['input_ids'].to(device),
                attention_mask=tokens['attention_mask'].to(device),
            ).logits

            # Sigmoid probabilities compared against the per-label cut-offs.
            above_cutoff = (torch.sigmoid(logits) > cutoffs).cpu().numpy().astype(int)

            # Turn the 0/1 matrix back into label names for each review.
            decoded_labels += binarizer.inverse_transform(above_cutoff)

    print("Batch classification complete.")
    return decoded_labels

def separate_labels_by_category(all_labels):
    """Split each row's predicted labels into atmosphere, theme and narrative lists.

    Membership is decided against the module-level ``atmosphere_labels``,
    ``theme_candidates`` and ``narrative_labels`` collections; a label that is
    in none of them is dropped, and the input order of labels is preserved.

    Args:
        all_labels: Iterable with one collection of label strings per row.
            Empty or ``None`` entries produce empty lists in all three outputs.

    Returns:
        tuple: ``(atmosphere_results, theme_results, narrative_results)``, three
        lists of lists, each with one entry per input row.
    """
    atmosphere_results = []
    theme_results = []
    narrative_results = []

    for labels in all_labels:
        # Treat a missing/empty row as "no labels" once, instead of per category.
        labels = labels or ()

        atmosphere_results.append([label for label in labels if label in atmosphere_labels])
        theme_results.append([label for label in labels if label in theme_candidates])
        narrative_results.append([label for label in labels if label in narrative_labels])

    return atmosphere_results, theme_results, narrative_results

def create_classified_embeddings(df, model_path: str, batch_size: int = 32, output_path=None):
    """Classify every review in ``df`` and embed the predicted labels with SBERT.

    Non-blank reviews are classified with ``classify_reviews_batch_custom``;
    blank or missing reviews get no labels. The labels are split by category
    with ``separate_labels_by_category`` and each label list is embedded as the
    mean of its labels' SBERT vectors (a zero vector when the list is empty).
    The caller's DataFrame is not modified.

    Args:
        df: DataFrame with at least the columns 'id' and 'review'.
        model_path: Directory of the fine-tuned classifier and its model_info.pkl.
        batch_size: Batch size forwarded to ``classify_reviews_batch_custom``.
        output_path: Where to pickle the result; defaults to
            CLASSIFIED_EMBEDDINGS_PATH when ``None``.

    Returns:
        pandas.DataFrame with columns 'id', 'atmosphere', 'narrative_structure',
        'themes', 'atmosphere_emb', 'narrative_emb', 'themes_emb', 'combined_emb'.
    """
    # Fill missing reviews on a local list so the caller's DataFrame stays untouched.
    reviews = df['review'].fillna('').tolist()

    # Load SentenceTransformer for embeddings
    sbert_model = SentenceTransformer(SBERT_MODEL)

    print("Starting classification with custom model...")

    # Only non-blank reviews are sent to the classifier; keep their row positions.
    valid_reviews = [(i, review) for i, review in enumerate(reviews) if review.strip()]

    if not valid_reviews:
        print("No valid reviews found for classification.")
        all_labels = [[] for _ in range(len(df))]
    else:
        review_texts = [review for _, review in valid_reviews]

        classified_labels = classify_reviews_batch_custom(review_texts, model_path, batch_size)

        # Map results back to the original row positions.
        results = {}
        for j, (orig_idx, _) in enumerate(valid_reviews):
            results[orig_idx] = list(classified_labels[j]) if classified_labels[j] else []

        # Rows that were skipped (blank review) get an empty label list.
        all_labels = [results.get(i, []) for i in range(len(df))]

    print(f"Completed classification. Processing {sum(1 for labels in all_labels if labels)} non-empty classifications.")

    # Separate labels by category. The local names deliberately differ from the
    # module-level label vocabularies so they are not shadowed inside this function.
    print("Separating labels by category...")
    atmosphere_by_row, themes_by_row, narrative_by_row = separate_labels_by_category(all_labels)

    # Ask the model for its vector size; 384 (the previously hard-coded value)
    # is kept only as a fallback when the model does not report one.
    embedding_dim = sbert_model.get_sentence_embedding_dimension() or 384

    # Encode every distinct label once and reuse the vectors for all rows,
    # instead of re-running the encoder for each row and category.
    vocabulary = sorted({label for labels in all_labels for label in labels})
    if vocabulary:
        vocabulary_vectors = sbert_model.encode(vocabulary, batch_size=32, show_progress_bar=False)
        label_vectors = dict(zip(vocabulary, vocabulary_vectors))
    else:
        label_vectors = {}

    def encode_labels(labels):
        """Return the mean SBERT vector of ``labels`` (zeros when there are none)."""
        if not labels:
            return np.zeros(embedding_dim, dtype='float32')
        return np.mean([label_vectors[label] for label in labels], axis=0).astype('float32')

    print("Encoding atmosphere labels...")
    atmosphere_emb = np.array([encode_labels(labels) for labels in atmosphere_by_row])

    print("Encoding narrative_structure labels...")
    narrative_emb = np.array([encode_labels(labels) for labels in narrative_by_row])

    print("Encoding themes labels...")
    themes_emb = np.array([encode_labels(labels) for labels in themes_by_row])

    print("Encoding combined labels...")
    combined_emb = np.array([encode_labels(labels) for labels in all_labels])

    print("Completed label encoding.")

    # Create final DataFrame
    final_df = pd.DataFrame({
        'id': df['id'],
        'atmosphere': atmosphere_by_row,
        'narrative_structure': narrative_by_row,
        'themes': themes_by_row,
        'atmosphere_emb': list(atmosphere_emb),
        'narrative_emb': list(narrative_emb),
        'themes_emb': list(themes_emb),
        'combined_emb': list(combined_emb)
    })

    print("Sample results:")
    print(final_df.head())

    non_empty_indices = [i for i, labels in enumerate(all_labels) if labels]
    if non_empty_indices:
        print(f"\nExample classifications:")
        for i, idx in enumerate(non_empty_indices[:5]):
            print(f"Example {i+1}:")
            print(f"Atmosphere: {atmosphere_by_row[idx]}")
            print(f"Themes: {themes_by_row[idx]}")
            print(f"Narrative: {narrative_by_row[idx]}")
            print(f"All labels: {all_labels[idx]}")

    if output_path is None:
        output_path = CLASSIFIED_EMBEDDINGS_PATH
    final_df.to_pickle(output_path)
    print(f"\nSaved results to: {output_path}")

    return final_df

def test_create_classified_embeddings_custom(df, title='Lost Highway'):
    """Smoke-test classification and label embedding on the rows of one movie.

    Selects the rows of ``df`` whose 'title' equals ``title``, classifies their
    reviews with the model in FINAL_MODEL_DIR, splits the labels by category and
    prints the per-category labels and the resulting embedding frame.

    Args:
        df: DataFrame with at least the columns 'id', 'title' and 'review'.
        title: Movie title to test with; defaults to the previously hard-coded
            'Lost Highway'.

    Raises:
        ValueError: If no row has the requested title.
    """
    test_df = df[df['title'] == title].copy()
    print(test_df)
    if test_df.empty:
        raise ValueError(f"Movie {title} not found in the dataframe")

    print(f"Testing with movie ID: {test_df['id'].iloc[0]}")
    test_df['review'] = test_df['review'].fillna('')
    reviews_to_classify = test_df['review'].tolist()

    # Single classification call
    all_labels = classify_reviews_batch_custom(reviews_to_classify, FINAL_MODEL_DIR)

    print("\nFiltering results into categories...")
    test_df['atmosphere'] = [[label for label in review_labels if label in atmosphere_labels] for review_labels in all_labels]
    test_df['narrative_structure'] = [[label for label in review_labels if label in narrative_labels] for review_labels in all_labels]
    test_df['themes'] = [[label for label in review_labels if label in theme_candidates] for review_labels in all_labels]

    print(f"Atmosphere results: {test_df['atmosphere'].iloc[0]}")
    print(f"Narrative structure results: {test_df['narrative_structure'].iloc[0]}")
    print(f"Themes results: {test_df['themes'].iloc[0]}")

    # Embedding
    print("\nStarting label encoding...")
    sbert_model = SentenceTransformer(SBERT_MODEL)

    # Ask the model for its vector size; 384 (the previously hard-coded value)
    # is kept only as a fallback when the model does not report one.
    embedding_dim = sbert_model.get_sentence_embedding_dimension() or 384

    def encode_labels(labels):
        """Return the mean SBERT vector of ``labels`` (zeros when there are none)."""
        if not labels:
            return np.zeros(embedding_dim, dtype='float32')
        label_embeds = sbert_model.encode(labels, batch_size=32, show_progress_bar=False)
        return np.mean(label_embeds, axis=0).astype('float32')

    atmosphere_emb = np.array([encode_labels(labels) for labels in test_df['atmosphere']])
    narrative_emb = np.array([encode_labels(labels) for labels in test_df['narrative_structure']])
    themes_emb = np.array([encode_labels(labels) for labels in test_df['themes']])

    print("Completed label encoding.")

    final_df = pd.DataFrame({
        'id': test_df['id'],
        'atmosphere_emb': list(atmosphere_emb),
        'narrative_emb': list(narrative_emb),
        'themes_emb': list(themes_emb),
    })

    print("\nFinal test results with custom model:")
    print(final_df.head())

In [None]:
# Load the pickled movie table from INITIAL_DATA_PATH.
initial_df = pd.read_pickle(INITIAL_DATA_PATH)

# Quick look at the first rows to confirm the load worked.
print(initial_df.head())

# Classify every review and embed the predicted labels; the function also
# pickles its result (by default to CLASSIFIED_EMBEDDINGS_PATH).
final_df = create_classified_embeddings(initial_df, BEST_MODEL_PATH)

print(final_df.head())

   id                title                                           overview  \
0   2                Ariel  A Finnish man goes to the city to find a job a...   
1   3  Shadows in Paradise  Nikander, a rubbish collector and would-be ent...   
2   5           Four Rooms  It's Ted the Bellhop's first night on the job....   
3   6       Judgment Night  Four young friends, while taking a shortcut en...   
4  11            Star Wars  Princess Leia is captured and held hostage by ...   

   vote_average                            genres  \
0         7.100        Comedy,Crime,Drama,Romance   
1         7.291              Comedy,Drama,Romance   
2         5.864                            Comedy   
3         6.455             Action,Crime,Thriller   
4         8.206  Action,Adventure,Science Fiction   

                                            keywords  \
0  factory worker,falling in love,helsinki, finla...   
1               garbage,helsinki, finland,salesclerk   
2  anthology,bet,hoodlum,h

In [None]:
# Spot-check the classified output for a single movie.
# NOTE(review): 13 is a hard-coded movie id — confirm which title it refers to.
test_df = final_df[final_df['id'] == 13].copy()
print(test_df)

In [None]:
# Recovery cell: run this only if the tokenizer was NOT saved next to the model.
# It reloads the model from a checkpoint, fetches a fresh tokenizer from the
# base model name, and writes both into the final model directory.

import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Destination: the same directory the inference helpers load from.
FINAL_SAVE_PATH = BEST_MODEL_PATH

# NOTE(review): checkpoint-600 is hard-coded as the best checkpoint; the saved
# directory listing also contains checkpoint-738 — confirm this is the intended one.
BEST_CHECKPOINT_PATH = '/content/drive/My Drive/movie_similarity/roberta_multilabel_improved/checkpoint-600'

# Base model identifier the tokenizer is downloaded from.
MODEL_NAME = ZSC_MODEL

print(f"Loading components")

# Check if the checkpoint path exists
if not os.path.exists(BEST_CHECKPOINT_PATH):
    raise FileNotFoundError(
        f"CRITICAL: The checkpoint path was not found: {BEST_CHECKPOINT_PATH}\n"
        "Please check the folder and update the path to the correct checkpoint directory."
    )

# Load the model weights from the specified checkpoint
print(f"Loading model from checkpoint: {BEST_CHECKPOINT_PATH}")
model = AutoModelForSequenceClassification.from_pretrained(BEST_CHECKPOINT_PATH)

# Load a fresh tokenizer from the original Hugging Face source, because the
# tokenizer files are what is missing from the saved folder.
print(f"Loading fresh tokenizer from source: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("\nSaving all components to the final directory")

# Ensure the final destination directory exists
os.makedirs(FINAL_SAVE_PATH, exist_ok=True)

# Save the model and the tokenizer into the final directory. This writes the
# checkpoint's weights into FINAL_SAVE_PATH and should create the tokenizer
# files (e.g. vocab.json, merges.txt) next to them.
model.save_pretrained(FINAL_SAVE_PATH)
tokenizer.save_pretrained(FINAL_SAVE_PATH)

print(f"\nSuccessfully saved model and tokenizer to: {FINAL_SAVE_PATH}")

print("\nVerifying the contents of the final directory ---")
# List the files (IPython shell escape) to check that everything is there now.
!ls -l "{FINAL_SAVE_PATH}"

print("\nVerification complete. You should see 'vocab.json' and 'merges.txt' in the list above.")
print("You can now safely run your 'predict_with_optimal_thresholds' or other functions.")

--- Step 1: Loading components ---
Loading model from checkpoint: /content/drive/My Drive/movie_similarity/roberta_multilabel_improved/checkpoint-600
Loading fresh tokenizer from source: FacebookAI/roberta-base

--- Step 2: Saving all components to the final directory ---

Successfully saved model and tokenizer to: /content/drive/My Drive/movie_similarity/roberta_multilabel_improved

--- Step 3: Verifying the contents of the final directory ---
total 491729
drwx------ 2 root root      4096 Jul 22 17:28 checkpoint-600
drwx------ 2 root root      4096 Jul 22 17:39 checkpoint-738
-rw------- 1 root root      1917 Jul 22 17:56 config.json
-rw------- 1 root root    456318 Jul 22 17:57 merges.txt
-rw------- 1 root root      2035 Jul 22 17:44 model_info.pkl
-rw------- 1 root root 498695876 Jul 22 17:57 model.safetensors
-rw------- 1 root root       280 Jul 22 17:57 special_tokens_map.json
-rw------- 1 root root      1246 Jul 22 17:57 tokenizer_config.json
-rw------- 1 root root   3558643 Jul 2