In [None]:
import json
import pandas as pd

# Load dataset
# JSON text is UTF-8 by specification; pinning the encoding keeps the read from
# depending on the platform's default codec (the data contains non-ASCII quotes).
with open("new_spans_labeled.json", 'r', encoding="utf-8") as file:
    dataset = json.load(file)

### Clean Dataset

In [None]:

# Function to check if a snippet exists within a review and adjust the snippet length if necessary.
def find_snippet(review, snippet):
    """Check for exact or near-exact matches of a snippet in a review text.

    The snippet is first looked up verbatim. If that fails, it is retried with
    its last character dropped (e.g. a trailing punctuation mark that is not
    present in the review).

    Args:
        review (str): The review text.
        snippet (str): The snippet to find in the review.

    Returns:
        tuple: (bool, str) indicating if found, and the matched snippet
        (the original, or the original minus its last character). When nothing
        matches, the snippet is returned unchanged alongside ``False``.
    """
    if snippet in review:
        return True, snippet

    trimmed = snippet[:-1]
    # '' is a substring of every string, so a one-character snippet that is not
    # in the review must not be reported as a match on the empty string.
    if trimmed and trimmed in review:
        return True, trimmed

    return False, snippet

# Process each review and identify reviews containing all specified snippets from aspects.
def process_reviews(dataset, aspects):
    """Process a dataset of reviews to find and adjust snippets related to specified aspects.

    A review is kept only if every snippet listed under every requested aspect
    can be located in the review text by ``find_snippet``. Both sides are
    compared after normalisation (quote/bracket characters removed, lower-cased,
    surrounding whitespace stripped).

    The input is not modified: each kept review is returned as a shallow copy
    whose aspect lists hold the normalised (and possibly trimmed) snippets. The
    ``'review'`` value itself is passed through as-is, i.e. not normalised.

    Args:
        dataset (list of dict): The dataset containing review details. Each
            entry must have a ``'review'`` string; aspect keys are optional.
        aspects (list of str): List of aspects to check in each review.

    Returns:
        list: Filtered list of reviews containing all specified snippets.
    """
    # Build the character-removal table once rather than once per string.
    strip_table = str.maketrans({"'": '', '"': '', '“': '', ']': ''})

    def _normalize(text):
        """Apply the shared normalisation used for both reviews and snippets."""
        return text.translate(strip_table).lower().strip()

    reviews_list = []
    for data in dataset:
        review_text = _normalize(data['review'])
        adjusted = {}
        found = True

        for aspect in aspects:
            matched = []
            for item in data.get(aspect, []):
                is_found, adjusted_snippet = find_snippet(review_text, _normalize(item))
                if not is_found:
                    found = False
                    break
                matched.append(adjusted_snippet)
            if not found:
                break
            # Only overwrite keys the entry actually has; absent aspects stay absent.
            if aspect in data:
                adjusted[aspect] = matched

        if found:
            # Copy instead of writing into the caller's dict, so re-running the
            # cell (or rejecting a review half-way) never alters the raw data.
            reviews_list.append({**data, **adjusted})

    return reviews_list

# Create a DataFrame from processed review data.
def create_dataframe(reviews_list, aspects):
    """Expand processed reviews into one DataFrame row per (review, aspect) pair.

    Args:
        reviews_list (list of dict): The processed list of reviews; each entry
            needs a ``'review'`` key and may have one list of snippets per aspect.
        aspects (list of str): Aspects to emit a row for, in encoding order.

    Returns:
        DataFrame: Columns ``review``, ``aspect``, ``snippets`` and
        ``aspect_encoded`` (the aspect's position in ``aspects``). An aspect
        with no snippets for a review is represented by ``['']``.
    """
    codes = {name: position for position, name in enumerate(aspects)}

    def _rows():
        """Yield the row dicts in review-major, aspect-minor order."""
        for record in reviews_list:
            text = record['review']
            for name in aspects:
                yield {
                    "review": text,
                    "aspect": name,
                    # Missing key or empty list both collapse to a single empty snippet.
                    "snippets": record.get(name, []) or [''],
                    "aspect_encoded": codes[name],
                }

    return pd.DataFrame(list(_rows()))

# Example usage
# The seven annotated aspects; list order defines the integer encoding used in the frame.
aspects = [
    'Cinematography',
    'Direction',
    'Story',
    'Characters',
    "Production Design",
    "Emotions",
    "Unique Concept",
]

# Keep only reviews whose snippets can all be located, then flatten to (review, aspect) rows.
reviews_list = process_reviews(dataset, aspects)
df = create_dataframe(reviews_list, aspects)
df.head()


In [None]:

# Dictionary to hold counts
label_counts = {
    'Cinematography': 0,
    'Direction': 0,
    'Story': 0,
    'Characters': 0,
    'Production Design': 0,
    "Emotions":0,
    "Unique Concept":0
}

# Count, for each label, how many kept reviews carry at least one snippet for it.
for review in reviews_list:
    for label in label_counts:
        # .get() mirrors process_reviews/create_dataframe, which both tolerate a
        # review that lacks an aspect key; plain indexing would raise KeyError here.
        if review.get(label):
            label_counts[label] += 1

# Printing the counts
print("Label counts:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

In [None]:
import json
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split

In [None]:
# Define the dataset class
class ReviewAspectDataset(Dataset):
    """Token-classification dataset pairing each review with one aspect.

    Every item encodes ``(review, aspect)`` as a sentence pair via the
    tokenizer and carries one label per encoded position: 1 for positions
    inside an annotated snippet, 0 for everything else (special tokens, the
    aspect segment and padding included).

    Args:
        reviews: Indexable collection of review texts.
        aspects: Indexable collection of aspect names, aligned with ``reviews``.
        snippets: Indexable collection of snippet lists, aligned with ``reviews``.
        tokenizer: Tokenizer exposing ``encode_plus``, ``tokenize`` and
            ``convert_tokens_to_ids`` (a ``BertTokenizer`` in this notebook).
        max_len (int): Fixed length every example is padded/truncated to.
    """

    def __init__(self, reviews, aspects, snippets, tokenizer, max_len):
        self.reviews = reviews
        self.aspects = aspects
        self.snippets = snippets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of (review, aspect) examples."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Encode example ``idx`` and build labels aligned with ``input_ids``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (1-D tensors as produced
            by the tokenizer) and ``labels`` (1-D long tensor of ``max_len``).
        """
        review = str(self.reviews[idx])
        aspect = str(self.aspects[idx])
        snippets = self.snippets[idx]

        inputs = self.tokenizer.encode_plus(
            review,
            aspect,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_ids = inputs['input_ids'].flatten()
        ids = input_ids.tolist()
        labels = [0] * self.max_len

        # Snippets are searched inside the *encoded* review segment so that a
        # match at review token i is labelled at the position it really occupies
        # in input_ids (shifted by the leading [CLS]), and so that tokens removed
        # by truncation can never put labels on [SEP]/aspect positions.
        cls_id = getattr(self.tokenizer, 'cls_token_id', None)
        sep_id = getattr(self.tokenizer, 'sep_token_id', None)
        start = 1 if ids and ids[0] == cls_id else 0
        end = ids.index(sep_id, start) if sep_id in ids[start:] else len(ids)
        end = min(end, self.max_len)
        review_ids = ids[start:end]

        for snippet in snippets:
            snippet_ids = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(snippet)
            )
            width = len(snippet_ids)
            if width == 0:
                # Placeholder '' snippets (aspect not mentioned) label nothing.
                continue

            for i in range(len(review_ids) - width + 1):
                if review_ids[i:i + width] == snippet_ids:
                    labels[start + i:start + i + width] = [1] * width
                    break  # Assuming one occurrence of snippet in review

        return {
            'input_ids': input_ids,
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }
# Tokenizer and dataset preparation
# WordPiece tokenizer for the same 'bert-base-uncased' checkpoint the model is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Fixed length every encoded (review, aspect) pair is padded or truncated to.
max_len = 512

# 80/20 split with a fixed seed so the partition is repeatable across runs.
# NOTE(review): df has one row per (review, aspect) pair, so rows built from the same
# review text can fall on both sides of this row-level split — confirm that is intended.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
def _build_dataset(frame):
    """Wrap one split's review/aspect/snippets columns in a ReviewAspectDataset."""
    return ReviewAspectDataset(
        reviews=frame['review'].to_numpy(),
        aspects=frame['aspect'].to_numpy(),
        snippets=frame['snippets'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )


# Same construction for both splits; only the source frame differs.
train_dataset = _build_dataset(train_df)
val_dataset = _build_dataset(val_df)

# Model Training

In [None]:
import os
from transformers import EarlyStoppingCallback
import numpy as np
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score,f1_score
# Disable wandb logging
os.environ['WANDB_DISABLED'] = 'true'

# Load a BERT model pre-trained on 'bert-base-uncased' for token classification with two labels
# (in this notebook label 1 marks tokens inside an annotated snippet, label 0 everything else).

model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)


# Define a custom loss function that adjusts class weights based on specific tokens
def custom_loss_fn(outputs, labels, token):
    """Class-weighted cross-entropy over the positions whose label is not -100.

    Args:
        outputs (torch.Tensor): Logits whose last dimension is the number of
            classes; all leading dimensions are flattened together.
        labels (torch.Tensor): Integer class labels with one entry per logit
            row; entries equal to -100 are excluded from the loss.
        token (str): Token identifying the aspect of the batch. When it is one
            of "cinematography", "design", "direction", "emotions" or
            "concept", both class weights are increased.

    Returns:
        torch.Tensor: Scalar loss.
    """
    # Create the weights on whatever device the logits live on, so the function
    # works on CPU as well as on GPU instead of assuming CUDA is available.
    class_weights = torch.tensor(
        [2.520658379555701, 9.601626815691288], device=outputs.device
    )
    # Adjust weights for specific classes
    if token in ["cinematography", "design", "direction","emotions","concept"]:
        class_weights[0] += 2.4562
        class_weights[1] += 4.4562

    # Use the CrossEntropyLoss with dynamic class weights
    loss_fct = CrossEntropyLoss(weight=class_weights)

    # Take the class count from the logits themselves rather than from a global model.
    num_labels = outputs.size(-1)
    flat_labels = labels.view(-1)
    active_loss = flat_labels != -100  # Filter out the ignored positions for loss calculation
    active_logits = outputs.view(-1, num_labels)[active_loss]
    active_labels = flat_labels[active_loss]
    return loss_fct(active_logits, active_labels)


# Define metrics computation for evaluation using accuracy and weighted F1 score
def compute_metrics(pred):
    """Token-level accuracy and weighted F1 for the Trainer's evaluation loop.

    Args:
        pred: Prediction object exposing ``predictions`` (logits with the class
            dimension on axis 2) and ``label_ids`` (same leading shape).

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=2)

    # Flatten the predictions and labels for metric calculation
    preds_flat = preds.flatten()
    labels_flat = labels.flatten()

    # Skip positions marked with the -100 ignore index, matching what the loss
    # function does; this is a no-op when no label carries that value.
    keep = labels_flat != -100
    preds_flat = preds_flat[keep]
    labels_flat = labels_flat[keep]

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(labels_flat, preds_flat)
    f1 = f1_score(labels_flat, preds_flat, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1
    }

# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=5e-05,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=52,
    per_device_eval_batch_size=52,
    # Log and evaluate on the same 500-step cadence
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    # Reload the best checkpoint when training ends and cap how many are retained
    load_best_model_at_end=True,
    save_total_limit=1,
)

# Define a custom trainer class that uses the custom loss function
class CustomTrainer(Trainer):
    """Trainer whose loss is the class-weighted cross-entropy from ``custom_loss_fn``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model and compute the aspect-dependent weighted loss.

        Args:
            model: The model being trained.
            inputs (dict): Batch with ``input_ids``, ``labels`` and any other
                tensors the model accepts.
            return_outputs (bool): If true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with Trainer versions
                that pass this keyword; it is not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        NOTE(review): a single aspect token, read from the first example of the
        batch, selects the class weights for every example in the batch.
        """
        # Only the first row is needed to read the aspect token, so avoid
        # converting the entire flattened batch to strings.
        first_row = inputs['input_ids'][0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(first_row)

        # The encoded pair ends "... <aspect> [SEP]" followed by optional padding,
        # so the aspect's last token is two places before the first [PAD] — or two
        # places before the end when the sequence fills max_len with no padding.
        index = tokens.index("[PAD]") if "[PAD]" in tokens else len(tokens)
        token = tokens[index - 2]

        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = custom_loss_fn(logits, labels, token)
        return (loss, outputs) if return_outputs else loss

# Stop training once 8 consecutive evaluations bring no improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=8)

# Trainer wired to the weighted loss, both dataset splits and the metric function.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


In [None]:
import logging
# Suppress log records of severity WARNING and below so the training output stays readable.
logging.disable(logging.WARNING)
# Fine-tune the model with the CustomTrainer built in the previous cell.
trainer.train()

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before evaluation.
torch.cuda.empty_cache()

In [None]:
import numpy as np
from sklearn.metrics import classification_report
from transformers import EvalPrediction

# Custom evaluation function to generate classification report
def compute_metrics(p: EvalPrediction):
    """Print a per-class classification report and return macro-averaged metrics.

    Positions whose label is -100 are excluded. This definition replaces the
    ``compute_metrics`` declared earlier in the notebook.

    Args:
        p (EvalPrediction): Object with ``predictions`` (logits, class dimension
            on axis 2) and ``label_ids``.

    Returns:
        dict: ``accuracy`` plus the unweighted mean over both classes of
        ``f1``, ``precision`` and ``recall``.
    """
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    # Flatten the predictions and labels
    preds_flat = preds.flatten()
    labels_flat = labels.flatten()

    # Filter out the labels where the label is -100 (ignore index)
    mask = labels_flat != -100
    preds_flat = preds_flat[mask]
    labels_flat = labels_flat[mask]

    # Passing the label ids explicitly keeps the report well-defined even when
    # one of the two classes is absent from this evaluation batch.
    class_ids = [0, 1]
    class_names = ['O', 'B-SNIPPET']

    # Generate classification report
    report = classification_report(
        labels_flat, preds_flat, labels=class_ids, target_names=class_names, digits=4
    )
    accuracy = accuracy_score(labels_flat, preds_flat)

    print("\nClassification Report:\n", report)
    print(f"Accuracy: {accuracy:.4f}")

    # Read the numbers from the structured form of the report instead of parsing
    # the printed table, so each column is taken by name.
    per_class = classification_report(
        labels_flat, preds_flat, labels=class_ids, target_names=class_names, output_dict=True
    )
    f1_scores = [per_class[name]['f1-score'] for name in class_names]
    precision_scores = [per_class[name]['precision'] for name in class_names]
    recall_scores = [per_class[name]['recall'] for name in class_names]

    return {
        "accuracy": accuracy,
        "f1": np.mean(f1_scores) if f1_scores else 0.0,
        "precision": np.mean(precision_scores) if precision_scores else 0.0,
        "recall": np.mean(recall_scores) if recall_scores else 0.0
    }

# Evaluation trainer: same model, arguments and data as before, but a plain Trainer
# scored with the classification-report metrics defined in this cell.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

# Score the validation split; the returned metrics dict is the cell's output.
trainer.evaluate()



# Save the model



In [None]:
# Write the fine-tuned model and the tokenizer to the same directory; the inference
# section reloads both from './review_spans_extraction'.
model.save_pretrained('./review_spans_extraction')
tokenizer.save_pretrained('./review_spans_extraction')

# Inference

In [4]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

import re
# Load the fine-tuned model and tokenizer
# (both are read from the directory the save step wrote to; this rebinds the
# notebook-level `model` and `tokenizer` names).
model = BertForTokenClassification.from_pretrained('./review_spans_extraction')
tokenizer = BertTokenizer.from_pretrained('./review_spans_extraction')

def remove_specific_characters(strings_list):
    """Delete a fixed set of stray control/mojibake characters from each string.

    Args:
        strings_list (list of str): Strings to clean.

    Returns:
        list of str: New list with every listed character removed; all other
        characters are kept in their original order.
    """
    # Define the characters to be removed. Every entry must be a single
    # character: adjacent string literals without a comma are concatenated by
    # Python into one multi-character string that can never equal one character.
    characters_to_remove = {
        '\x8d', '\x8b', '\x8c', '\x8f', '\x87', '\x8e', '\x81',
        '\x8a', '\x83', '\x94', '\x95', '\x97', '\x91', '\x89',
        '\x80', '\x99', '\x9e', '\xad', '\x9d', '\x98', '\x93',
        '\x82', '\x9c', '\x9f', "®", "´", "¿", "¥",
        "\u00c3", "\u00a2", "\u00c2",
    }

    # A translation table that maps each character to None deletes it in one pass.
    deletion_table = str.maketrans({char: None for char in characters_to_remove})

    return [string.translate(deletion_table) for string in strings_list]

def remove_double_spaces(strings):
    """Replace every run of two or more whitespace characters with one space.

    Note that the pattern uses ``\\s``, so tabs and newlines count as whitespace
    too; a single whitespace character on its own is left untouched.
    """
    collapsed = []
    for text in strings:
        collapsed.append(re.sub(r'\s{2,}', ' ', text))
    return collapsed

def remove_multiple_punctuation(strings):
    """Collapse repeated '.', '!' and ',' into a single occurrence in each string."""
    # (pattern, replacement) pairs, applied in this order to every string.
    collapse_rules = (
        (r'\.{2,}', '.'),
        (r'\!{2,}', '!'),
        (r'\,{2,}', ','),
    )

    def _collapse(text):
        """Apply every rule to one string."""
        for regex, single in collapse_rules:
            text = re.sub(regex, single, text)
        return text

    return [_collapse(text) for text in strings]



def predict_snippet(review, aspect, model, tokenizer, max_len=512):
    """Predict the token spans of ``review`` that relate to ``aspect``.

    The (review, aspect) pair is encoded, each position is classified as 0 or 1,
    and consecutive positions labelled 1 are joined into snippets. A single 0
    sitting between two 1s is bridged so that one missed token does not split a
    span.

    Args:
        review (str): Review text (already cleaned by the caller).
        aspect (str): Aspect name used as the second segment of the pair.
        model: Token-classification model returning an object with ``logits``.
        tokenizer: Tokenizer exposing ``encode_plus`` and ``convert_ids_to_tokens``.
        max_len (int): Length the encoded pair is padded/truncated to.

    Returns:
        list of str: Snippets as space-joined tokens, in order of appearance.
        Word-piece markers and special tokens are not post-processed here.
    """
    model.eval()

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        review,
        aspect,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Keep the inputs on the same device as the model when the model reports one.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=2).flatten().tolist()

    # Padding positions hold no text, so they can never belong to a snippet.
    keep_flags = attention_mask.flatten().tolist()
    predictions = [label if keep else 0 for label, keep in zip(predictions, keep_flags)]

    # Decode the tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten().tolist())

    snippets = []
    snippet = []
    last = len(tokens) - 1
    for i, (token, label) in enumerate(zip(tokens, predictions)):
        # A lone 0 with a 1 on both sides is treated as part of the span.
        bridges_gap = (
            label == 0
            and 0 < i < last
            and predictions[i - 1] == 1
            and predictions[i + 1] == 1
        )
        if label == 1 or bridges_gap:
            snippet.append(token)
        elif snippet:
            snippets.append(' '.join(snippet))
            snippet = []

    # Emit a span that runs up to the very last position instead of dropping it.
    if snippet:
        snippets.append(' '.join(snippet))

    return snippets


In [5]:
import re


def clean_text(text):
    """Normalise a raw review before it is passed to the model.

    Steps, in order: collapse whitespace runs, collapse repeated punctuation,
    drop the stray characters handled by ``remove_specific_characters``,
    lower-case, turn newlines into spaces, collapse repeated periods, and
    finally squeeze all whitespace to single spaces and strip both ends.

    Args:
        text (str): Raw review text.

    Returns:
        str: The cleaned, lower-cased review.
    """
    # The list-based helpers expect a batch, so wrap the single review and unwrap it again.
    batch = remove_specific_characters(
        remove_multiple_punctuation(remove_double_spaces([text]))
    )
    lowered = batch[0].lower()

    without_newlines = re.sub(r'\n+', ' ', lowered)
    single_periods = re.sub(r'\.\.+', '.', without_newlines)
    return re.sub(r'\s+', ' ', single_periods).strip()

def fix_special_characters(snippet):
    """Tidy a space-joined token snippet into more natural text.

    Removes ``[UNK]`` markers, re-attaches ``##`` word pieces, and removes the
    spaces around apostrophes, hyphens, slashes and colons. The replacements
    are applied sequentially in the order listed.
    """
    replacements = (
        ("[UNK]", ''),
        (" ##", ''),
        (" '", "'"),
        (" ’", "’"),
        ("’ ", "’"),
        ("' ", "'"),
        (" -", "-"),
        ("- ", "-"),
        ("/ ", "/"),
        (" /", "/"),
        (" :", ":"),
        (": ", ":"),
    )
    for old, new in replacements:
        snippet = snippet.replace(old, new)
    return snippet

# Aspects to query the model with, one prediction pass per aspect.
aspects = ["Cinematography", "Direction", "Story", "Characters", "Production Design","Emotions","Unique Concept"]
# Sample review used to demonstrate inference. The literal opens with a triple quote
# followed by one more double quote, so the text itself starts with a '"' character.
original_review = """"This movie really surprised me. 
I had my doubts about it at first but the movie got better and better for each minute. 
It is maybe not for the action seeking audience but for those that like an explicit portrait of a very strange criminal, man, lover and husband. 
If you're not a fan of bad language or sexual content this really is not for you. <br /><br />The storyline is somewhat hard to follow sometimes, but in the end I think it made everything better. 
The ending was unexpected since you were almost fouled to think it would end otherwise. <br /><br />As for the acting I think it was good.
It will not be up for an Oscar award for long but it at least caught my eye. Gil Bellows portrait of a prison man is not always perfect but it is very entertaining. 
Shaun Parkes portrait of Bellows prison mate Clinique is great and extremely powerful. On the downside I think I will put Esai Morales portrait of Markie.
Take my advice and watch this movie"""
# Apply the same cleaning steps (lower-casing, whitespace/punctuation collapsing) before prediction.
original_review=clean_text(original_review)

print("original review: ",original_review)

# Query the model once per aspect and print the tidied spans it returns.
for aspect in aspects:
    predicted_snippets = predict_snippet(original_review, aspect, model, tokenizer)
    new_snippets = [fix_special_characters(raw_span) for raw_span in predicted_snippets]

    print(aspect, new_snippets, end='\n')
    print("\n-------------------")

original review:  "this movie really surprised me. i had my doubts about it at first but the movie got better and better for each minute. it is maybe not for the action seeking audience but for those that like an explicit portrait of a very strange criminal, man, lover and husband. if you're not a fan of bad language or sexual content this really is not for you. <br /><br />the storyline is somewhat hard to follow sometimes, but in the end i think it made everything better. the ending was unexpected since you were almost fouled to think it would end otherwise. <br /><br />as for the acting i think it was good. it will not be up for an oscar award for long but it at least caught my eye. gil bellows portrait of a prison man is not always perfect but it is very entertaining. shaun parkes portrait of bellows prison mate clinique is great and extremely powerful. on the downside i think i will put esai morales portrait of markie. take my advice and watch this movie
Cinematography []

-------