In [1]:
import json
import pandas as pd

# Load dataset
with open('../data/new_spans_labeled.json', 'r') as file:
    dataset = json.load(file)

# Aspects of interest
aspects = ['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]

# Aspect encoding mapping
aspect_encoding = {aspect: index for index, aspect in enumerate(aspects)}

# Prepare data for DataFrame
data = []

for review in dataset:
    review_text = review["review"]
    for aspect in aspects:
        snippets = review.get(aspect, [])  # Get the snippets for the aspect, default to an empty list
        if not snippets:  # Ensure a row even if there are no snippets
            snippets = [""]  # Use an empty string for consistency

        # Combine snippets into a single list for each aspect
        entry = {
            "review": review_text,
            "aspect": aspect,
            "snippets": [snippet.replace('"','') for snippet in snippets],  # Store snippets as a list
            "aspect_encoded": aspect_encoding[aspect]
        }
        data.append(entry)

# Convert to DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,review,aspect,snippets,aspect_encoded
0,One of the other reviewers has mentioned that ...,Cinematography,[],0
1,One of the other reviewers has mentioned that ...,Direction,[],1
2,One of the other reviewers has mentioned that ...,Story,[The first thing that struck me about Oz was i...,2
3,One of the other reviewers has mentioned that ...,Characters,"[Em City is home to many..Aryans, Muslims, gan...",3
4,One of the other reviewers has mentioned that ...,Production Design,[],4
...,...,...,...,...
345795,The first time I ever saw this movie was when ...,Story,"[The only problem I have with this movie, howe...",2
345796,The first time I ever saw this movie was when ...,Characters,"[The actors were amazing., Treat Williams is g...",3
345797,The first time I ever saw this movie was when ...,Production Design,[],4
345798,The first time I ever saw this movie was when ...,Unique Concept,[],5


In [2]:
df.to_csv("../data/formatted_data.csv")

In [3]:
new_rows=[]
for index, row in df.iterrows():
    snippets=[]
    not_found=False
    for snippet in row['snippets']:
        if snippet!='':
            if snippet.lower() in row['review'].lower():
                snippets.append(snippet.lower())
            else:
                # print(snippet)
                not_found=True
                break
    if not not_found:
        new_rows.append([row['review'].lower(),row['aspect'],snippets])

In [27]:
import json
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split

In [5]:
df = pd.DataFrame(new_rows, columns = ['review', 'aspect','snippet'])
aspect_encoder = LabelEncoder()
df['aspect_encoded'] = aspect_encoder.fit_transform(df['aspect'])

In [6]:
aspect_counts = df['aspect'].value_counts()

In [7]:
balanced_df = pd.DataFrame()

for aspect in aspect_counts.index:
    subset = df[df['aspect'] == aspect]
    if len(subset) > 20000:
        subset = subset.sample(n=20000, random_state=1)  # Random sampling
    elif len(subset) < 20000:
        # Duplicating entries if there are fewer than 20,000
        subset = subset.sample(n=20000, replace=True, random_state=1)
    balanced_df = pd.concat([balanced_df, subset])

print(balanced_df['aspect'].value_counts())


aspect
Production Design    20000
Emotions             20000
Cinematography       20000
Direction            20000
Unique Concept       20000
Characters           20000
Story                20000
Name: count, dtype: int64


In [8]:
# Define the dataset class
class ReviewAspectDataset(Dataset):
    def __init__(self, reviews, aspects, snippets, tokenizer, max_len):
        self.reviews = reviews
        self.aspects = aspects
        self.snippets = snippets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        aspect = str(self.aspects[idx])
        snippets = self.snippets[idx]

        inputs = self.tokenizer.encode_plus(
            review,
            aspect,
            add_special_tokens=False,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        tokens = self.tokenizer.tokenize(review)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        labels = [0] * len(token_ids)

        for snippet in snippets:
            snippet_tokens = self.tokenizer.tokenize(snippet)
            snippet_token_ids = self.tokenizer.convert_tokens_to_ids(snippet_tokens)

            for i in range(len(token_ids) - len(snippet_token_ids) + 1):
                if token_ids[i:i+len(snippet_token_ids)] == snippet_token_ids:
                    labels[i:i+len(snippet_token_ids)] = [1] * len(snippet_token_ids)
                    break  # Assuming one occurrence of snippet in review

        # Pad or truncate labels to match max_len
        if len(labels) < self.max_len:
            labels += [0] * (self.max_len - len(labels))
        else:
            labels = labels[:self.max_len]

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }
# Tokenizer and dataset preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 512

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
train_dataset = ReviewAspectDataset(
    reviews=train_df['review'].to_numpy(),
    aspects=train_df['aspect'].to_numpy(),
    snippets=train_df['snippet'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
)

val_dataset = ReviewAspectDataset(
    reviews=val_df['review'].to_numpy(),
    aspects=val_df['aspect'].to_numpy(),
    snippets=val_df['snippet'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
)

In [26]:
# Define the model
import os
from transformers import EarlyStoppingCallback
os.environ['WANDB_DISABLED'] = 'true'
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,  # To ensure the best model is loaded at the end
    save_total_limit=1,  # To keep only the best model
    learning_rate= 5e-05
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [11]:
import logging
logging.disable(logging.WARNING)
trainer.train()

  1%|          | 500/59314 [09:10<20:31:57,  1.26s/it]

{'loss': 0.1072, 'grad_norm': 0.5402780175209045, 'learning_rate': 5e-05, 'epoch': 0.02}


                                                      
  1%|          | 500/59314 [1:01:37<20:31:57,  1.26s/it]

{'eval_loss': 0.07571308314800262, 'eval_runtime': 3147.1905, 'eval_samples_per_second': 18.847, 'eval_steps_per_second': 2.356, 'epoch': 0.02}


  2%|▏         | 1000/59314 [1:12:36<21:28:23,  1.33s/it]   

{'loss': 0.0736, 'grad_norm': 0.20583581924438477, 'learning_rate': 4.957493113884449e-05, 'epoch': 0.03}


                                                         
  2%|▏         | 1000/59314 [2:06:15<21:28:23,  1.33s/it]

{'eval_loss': 0.0688648670911789, 'eval_runtime': 3219.6376, 'eval_samples_per_second': 18.423, 'eval_steps_per_second': 2.303, 'epoch': 0.03}


  3%|▎         | 1500/59314 [2:17:24<20:41:03,  1.29s/it]    

{'loss': 0.0686, 'grad_norm': 0.2538255453109741, 'learning_rate': 4.914986227768899e-05, 'epoch': 0.05}


                                                         
  3%|▎         | 1500/59314 [3:11:07<20:41:03,  1.29s/it]

{'eval_loss': 0.060462091118097305, 'eval_runtime': 3223.2685, 'eval_samples_per_second': 18.402, 'eval_steps_per_second': 2.3, 'epoch': 0.05}


  3%|▎         | 2000/59314 [3:21:58<21:18:06,  1.34s/it]    

{'loss': 0.0604, 'grad_norm': 0.26426786184310913, 'learning_rate': 4.872479341653348e-05, 'epoch': 0.07}


                                                         
  3%|▎         | 2000/59314 [4:11:53<21:18:06,  1.34s/it]

{'eval_loss': 0.060468945652246475, 'eval_runtime': 2995.095, 'eval_samples_per_second': 19.804, 'eval_steps_per_second': 2.476, 'epoch': 0.07}


  4%|▍         | 2500/59314 [4:20:14<15:58:37,  1.01s/it]    

{'loss': 0.0614, 'grad_norm': 0.41730356216430664, 'learning_rate': 4.8299724555377975e-05, 'epoch': 0.08}


                                                         
  4%|▍         | 2500/59314 [5:00:19<15:58:37,  1.01s/it]

{'eval_loss': 0.056259021162986755, 'eval_runtime': 2404.5265, 'eval_samples_per_second': 24.668, 'eval_steps_per_second': 3.084, 'epoch': 0.08}


  5%|▌         | 3000/59314 [5:08:40<15:39:24,  1.00s/it]    

{'loss': 0.0558, 'grad_norm': 0.8817415833473206, 'learning_rate': 4.7874655694222465e-05, 'epoch': 0.1}


                                                         
  5%|▌         | 3000/59314 [5:48:45<15:39:24,  1.00s/it]

{'eval_loss': 0.0542791448533535, 'eval_runtime': 2404.8832, 'eval_samples_per_second': 24.664, 'eval_steps_per_second': 3.083, 'epoch': 0.1}


  6%|▌         | 3500/59314 [5:57:07<15:46:28,  1.02s/it]    

{'loss': 0.0536, 'grad_norm': 0.533261239528656, 'learning_rate': 4.744958683306696e-05, 'epoch': 0.12}


                                                         
  6%|▌         | 3500/59314 [6:34:51<15:46:28,  1.02s/it]

{'eval_loss': 0.05335463210940361, 'eval_runtime': 2263.9965, 'eval_samples_per_second': 26.199, 'eval_steps_per_second': 3.275, 'epoch': 0.12}


  7%|▋         | 4000/59314 [6:41:58<12:57:38,  1.19it/s]    

{'loss': 0.0549, 'grad_norm': 0.5137315392494202, 'learning_rate': 4.702451797191145e-05, 'epoch': 0.13}


                                                         
  7%|▋         | 4000/59314 [7:17:52<12:57:38,  1.19it/s]

{'eval_loss': 0.05094146728515625, 'eval_runtime': 2153.5387, 'eval_samples_per_second': 27.543, 'eval_steps_per_second': 3.443, 'epoch': 0.13}


  8%|▊         | 4500/59314 [7:27:38<20:46:05,  1.36s/it]   

{'loss': 0.054, 'grad_norm': 0.3622477948665619, 'learning_rate': 4.659944911075594e-05, 'epoch': 0.15}


                                                         
  8%|▊         | 4500/59314 [8:12:09<20:46:05,  1.36s/it]

{'eval_loss': 0.051433391869068146, 'eval_runtime': 2671.5079, 'eval_samples_per_second': 22.202, 'eval_steps_per_second': 2.776, 'epoch': 0.15}


  8%|▊         | 5000/59314 [8:19:18<12:51:59,  1.17it/s]    

{'loss': 0.0529, 'grad_norm': 0.25286808609962463, 'learning_rate': 4.617438024960044e-05, 'epoch': 0.17}


                                                         
  8%|▊         | 5000/59314 [8:52:05<12:51:59,  1.17it/s]

{'eval_loss': 0.051521990448236465, 'eval_runtime': 1967.3512, 'eval_samples_per_second': 30.149, 'eval_steps_per_second': 3.769, 'epoch': 0.17}


  8%|▊         | 5000/59314 [8:52:07<96:20:20,  6.39s/it]

{'train_runtime': 31927.1848, 'train_samples_per_second': 14.862, 'train_steps_per_second': 1.858, 'train_loss': 0.06423976974487304, 'epoch': 0.17}





TrainOutput(global_step=5000, training_loss=0.06423976974487304, metrics={'train_runtime': 31927.1848, 'train_samples_per_second': 14.862, 'train_steps_per_second': 1.858, 'total_flos': 1.045187026944e+16, 'train_loss': 0.06423976974487304, 'epoch': 0.1685942610513538})

In [12]:
torch.cuda.empty_cache()

In [18]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from transformers import EvalPrediction

# Custom evaluation function to generate classification report
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    # Flatten the predictions and labels
    preds_flat = preds.flatten()
    labels_flat = labels.flatten()

    # Filter out the labels where the label is -100 (ignore index)
    mask = labels_flat != -100
    preds_flat = preds_flat[mask]
    labels_flat = labels_flat[mask]

    # Generate classification report
    report = classification_report(labels_flat, preds_flat, target_names=['O', 'B-SNIPPET'], digits=4)
    accuracy = accuracy_score(labels_flat, preds_flat)

    print("\nClassification Report:\n", report)
    print(f"Accuracy: {accuracy:.4f}")

    # Safely extract metrics from the report
    report_lines = report.split('\n')
    f1_scores = []
    precision_scores = []
    recall_scores = []
    
    # Extract metrics from each class, skipping the header and footer lines
    for line in report_lines[2:-3]:
        parts = line.split()
        if len(parts) >= 4:  # Ensure that the line has enough parts
            recall_scores.append(float(parts[-4]))
            precision_scores.append(float(parts[-3]))
            f1_scores.append(float(parts[-2]))

    return {
        "accuracy": accuracy,
        "f1": np.mean(f1_scores) if f1_scores else 0.0,
        "precision": np.mean(precision_scores) if precision_scores else 0.0,
        "recall": np.mean(recall_scores) if recall_scores else 0.0
    }

# Trainer with custom metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Adding the custom compute_metrics function
)

# Evaluate the model on validation dataset
trainer.evaluate()


100%|██████████| 7415/7415 [33:33<00:00,  3.68it/s]


Classification Report:
               precision    recall  f1-score   support

           O     0.9871    0.9973    0.9921  29752068
   B-SNIPPET     0.7366    0.3690    0.4917    616700

    accuracy                         0.9845  30368768
   macro avg     0.8618    0.6831    0.7419  30368768
weighted avg     0.9820    0.9845    0.9820  30368768

Accuracy: 0.9845





{'eval_loss': 0.04379426687955856,
 'eval_model_preparation_time': 0.0016,
 'eval_accuracy': 0.9845067801235796,
 'eval_f1': 0.7419,
 'eval_precision': 0.6831499999999999,
 'eval_recall': 0.86185,
 'eval_runtime': 2014.0076,
 'eval_samples_per_second': 29.451,
 'eval_steps_per_second': 3.682}

In [19]:

# Save the model
model.save_pretrained('./spans_based_bert_model')
tokenizer.save_pretrained('./spans_based_bert_model')


('./spans_based_bert_model/tokenizer_config.json',
 './spans_based_bert_model/special_tokens_map.json',
 './spans_based_bert_model/vocab.txt',
 './spans_based_bert_model/added_tokens.json')

In [22]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

import re
# Load the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained('./spans_based_bert_model')
tokenizer = BertTokenizer.from_pretrained('./spans_based_bert_model')

def remove_specific_characters(strings_list):
    # Define the characters to be removed
    characters_to_remove = {
    '\x8d', '\x8b', '\x8c', '\x8f', '\x87', '\x8e', '\x81',
    '\x8a', '\x83', '\x94', '\x95', '\x97', '\x91', '\x89',
    '\x80', '\x99', '\x9e', '\xad', '\x9d', '\x98', '\x93',
    '\x82', '\x9c', '\x9f'"®", "´", "¿", "¥",
        "\u00c3", "\u00a2", "\u00c2", "\u0080", "\u00c2", "\u0099"
    }

    cleaned_strings_list = []

    for string in strings_list:
        cleaned_string = ''.join(char for char in string if char not in characters_to_remove)
        cleaned_strings_list.append(cleaned_string)

    return cleaned_strings_list

def remove_double_spaces(strings):
    pattern = re.compile(r'\s{2,}')  # Regex to match two or more spaces
    return [pattern.sub(' ', text) for text in strings]

def remove_multiple_punctuation(strings):
    # Create patterns to find multiple occurrences of ., !, and ,
    patterns = {
        r'\.{2,}': '.',
        r'\!{2,}': '!',
        r'\,{2,}': ','
    }

    # Process each string in the list
    cleaned_strings = []
    for text in strings:
        for pattern, replacement in patterns.items():
            text = re.sub(pattern, replacement, text)
        cleaned_strings.append(text)

    return cleaned_strings



def predict_snippet(review, aspect, model, tokenizer, max_len=256):
    model.eval()

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        review,
        aspect,
        add_special_tokens=False,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=2).flatten().tolist()
    new_predictions=predictions.copy()
    # Decode the tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten().tolist())
    # print(tokens)
    snippets=[]
    snippet=[]
    i = 0
    for token, label in zip(tokens, predictions):
        if label == 1:
            new_predictions[i] = 1
            snippet.append(token)
        elif label == 0 and i > 0 and i + 1 < len(tokens) and predictions[i - 1] == 1 and predictions[i + 1] == 1:
            new_predictions[i] = 1
            snippet.append(token)
        elif len(snippet):
            snippets.append(' '.join(snippet))
            snippet = []
        i += 1

    for i in range(1, len(new_predictions) - 2):
        # Check for the pattern 1,0,0,1
        if new_predictions[i] == 0 and new_predictions[i+1] == 0 and new_predictions[i-1] == 1 and new_predictions[i+2] == 1:
            new_predictions[i] = 1
            new_predictions[i+1] = 1

    # print(snippets)
    return snippets


In [25]:
import re


def clean_text(text):
    original_review= remove_double_spaces([text])
    original_review= remove_multiple_punctuation(original_review)
    original_review = remove_specific_characters(original_review)[0]
    text = original_review.lower()
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with a space
    text = re.sub(r'\.\.+', '.', text)  # Replace multiple periods with a single period
    text=text.replace(',','')
    text=text.replace('.','')
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
    return text

def fix_special_characters(snippet):
    snippet=snippet.replace("[UNK]",'')
    snippet=snippet.replace(" ##",'')
    snippet=snippet.replace(" '","'")
    snippet=snippet.replace(" ’","’")
    snippet=snippet.replace("’ ","’")
    snippet=snippet.replace("' ","'")
    snippet=snippet.replace(" -","-")
    snippet=snippet.replace("- ","-")
    snippet=snippet.replace("/ ","/")
    snippet=snippet.replace(" /","/")
    snippet=snippet.replace(" :",":")
    snippet=snippet.replace(": ",":")
    return snippet

aspects=['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]
original_review="""The story was amazing but the cinematography wasn't it."""

#     print(aspect, new_snippets)
original_review=clean_text(original_review)

print("original review: ",original_review)

for aspect in aspects:
    predicted_snippets = predict_snippet(original_review, aspect, model, tokenizer)

    new_snippets = []
    for snippet in predicted_snippets:
        new_snippets.append(fix_special_characters(snippet))

    print(aspect, new_snippets, end='\n')
    print("\n-------------------")

original review:  the story was amazing but the cinematography wasn't it
Cinematography ["the cinematography wasn't"]

-------------------
Direction []

-------------------
Story ['the story was amazing']

-------------------
Characters []

-------------------
Production Design []

-------------------
Unique Concept []

-------------------
Emotions []

-------------------


In [29]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli")

# Example snippet
snippet = "the cinematography wasn't great"
candidate_labels = ["cinematography positive", "cinematography negative"]

# Classify the sentiment
sentiment_result = classifier(snippet, candidate_labels)
print(sentiment_result)

{'sequence': "the cinematography wasn't great", 'labels': ['cinematography negative', 'cinematography positive'], 'scores': [0.9993323683738708, 0.0006676383200101554]}
