In [25]:
import json
import pandas as pd

# Load dataset
with open('../../new_spans_labeled.json', 'r') as file:
    dataset = json.load(file)

### Clean Dataset

In [26]:
# Initialize an empty list to store matching reviews
reviews_list = []

# Function to check if a snippet is present in the review and adjust its length if needed
def find_snippet(review, snippet):
    if snippet in review:
        return True, snippet
    # Check if the snippet, minus the last character, is in the review
    elif snippet[:-1] in review:
        return True, snippet[:-1]
    return False, snippet

# List of aspects to check in each review
aspects = ['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]

# Iterate over each data entry in the dataset
for data in dataset:
    review_text = data['review'].replace("'", '').replace('"', '').replace("“",'').replace(' ]', '').lower().strip()
    found = True

    # Loop through each aspect and its corresponding items
    for aspect in aspects:
        for i, item in enumerate(data[aspect]):
            snippet = item.replace("'", '').replace('"', '').replace(' ]', '').replace("“",'').lower().strip()
            is_found, adjusted_snippet = find_snippet(review_text, snippet)
            if not is_found:
                found = False
                break
            else:
                # Update the original snippet to its adjusted version if necessary
                data[aspect][i] = adjusted_snippet
        if not found:
            # print(snippet)
            break

    # If all snippets were found, add the data entry to reviews_list
    if found:
        reviews_list.append(data)


In [27]:

# Dictionary to hold counts
label_counts = {
    'Cinematography': 0,
    'Direction': 0,
    'Story': 0,
    'Characters': 0,
    'Production Design': 0,
    'Unique Concept': 0,
    'Emotions': 0
}

# Counting the number of elements for each label in each review
for review in reviews_list:
    for label in label_counts:
        if len(review[label]):
            # print(review[label])
            label_counts[label] += 1

# Printing the counts
print("Label counts:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label counts:
Cinematography: 7173
Direction: 9625
Story: 31194
Characters: 26865
Production Design: 6982
Unique Concept: 9997
Emotions: 9121


In [28]:
# Aspects of interest
aspects = ['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]

# Aspect encoding mapping
aspect_encoding = {aspect: index for index, aspect in enumerate(aspects)}

# Prepare data for DataFrame
data = []

for review in reviews_list:
    review_text = review["review"]
    for aspect in aspects:
        snippets = review.get(aspect, [])  # Get the snippets for the aspect, default to an empty list
        if not snippets:  # Ensure a row even if there are no snippets
            snippets = [""]  # Use an empty string for consistency

        # Combine snippets into a single list for each aspect
        entry = {
            "review": review_text,
            "aspect": aspect,
            "snippets": [snippet.replace('"','') for snippet in snippets],  # Store snippets as a list
            "aspect_encoded": aspect_encoding[aspect]
        }
        data.append(entry)

# Convert to DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,review,aspect,snippets,aspect_encoded
0,One of the other reviewers has mentioned that ...,Cinematography,[],0
1,One of the other reviewers has mentioned that ...,Direction,[],1
2,One of the other reviewers has mentioned that ...,Story,[the first thing that struck me about oz was i...,2
3,One of the other reviewers has mentioned that ...,Characters,"[em city is home to many..aryans, muslims, gan...",3
4,One of the other reviewers has mentioned that ...,Production Design,[],4
...,...,...,...,...
242867,The first time I ever saw this movie was when ...,Story,"[the only problem i have with this movie, howe...",2
242868,The first time I ever saw this movie was when ...,Characters,"[the actors were amazing, treat williams is gr...",3
242869,The first time I ever saw this movie was when ...,Production Design,[],4
242870,The first time I ever saw this movie was when ...,Unique Concept,[],5


In [41]:
#count where snippets are zero:
count = 0
for i in df['snippets']:
    # print(i)
    if i == ['']:
        count += 1

count

141915

In [5]:
df.to_csv("../data/formatted_data.csv")

In [6]:
new_rows=[]
for index, row in df.iterrows():
    snippets=[]
    not_found=False
    for snippet in row['snippets']:
        if snippet!='':
            if snippet.lower() in row['review'].lower():
                snippets.append(snippet.lower())
            else:
                # print(snippet)
                not_found=True
                break
    if not not_found:
        new_rows.append([row['review'].lower(),row['aspect'],snippets])

In [7]:
import json
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm
2024-08-18 00:42:07.114551: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-18 00:42:07.193805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-18 00:42:07.229675: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-18 00:42:07.239549: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-18 00:42:07.2

In [8]:
df = pd.DataFrame(new_rows, columns = ['review', 'aspect','snippet'])
aspect_encoder = LabelEncoder()
df['aspect_encoded'] = aspect_encoder.fit_transform(df['aspect'])

In [9]:
aspect_counts = df['aspect'].value_counts()
aspect_counts

aspect
Cinematography       33169
Production Design    32917
Direction            31615
Unique Concept       31163
Emotions             30956
Characters           23184
Story                19840
Name: count, dtype: int64

In [10]:
# Define the dataset class
class ReviewAspectDataset(Dataset):
    def __init__(self, reviews, aspects, snippets, tokenizer, max_len):
        self.reviews = reviews
        self.aspects = aspects
        self.snippets = snippets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        aspect = str(self.aspects[idx])
        snippets = self.snippets[idx]

        inputs = self.tokenizer.encode_plus(
            review,
            aspect,
            add_special_tokens=False,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        tokens = self.tokenizer.tokenize(review)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        labels = [0] * len(token_ids)

        for snippet in snippets:
            snippet_tokens = self.tokenizer.tokenize(snippet)
            snippet_token_ids = self.tokenizer.convert_tokens_to_ids(snippet_tokens)

            for i in range(len(token_ids) - len(snippet_token_ids) + 1):
                if token_ids[i:i+len(snippet_token_ids)] == snippet_token_ids:
                    labels[i:i+len(snippet_token_ids)] = [1] * len(snippet_token_ids)
                    break  # Assuming one occurrence of snippet in review

        # Pad or truncate labels to match max_len
        if len(labels) < self.max_len:
            labels += [0] * (self.max_len - len(labels))
        else:
            labels = labels[:self.max_len]

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }
# Tokenizer and dataset preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 512

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
train_dataset = ReviewAspectDataset(
    reviews=train_df['review'].to_numpy(),
    aspects=train_df['aspect'].to_numpy(),
    snippets=train_df['snippet'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
)

val_dataset = ReviewAspectDataset(
    reviews=val_df['review'].to_numpy(),
    aspects=val_df['aspect'].to_numpy(),
    snippets=val_df['snippet'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
)

In [12]:
# Define the model
import os
from transformers import EarlyStoppingCallback
os.environ['WANDB_DISABLED'] = 'true'
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,  # To ensure the best model is loaded at the end
    save_total_limit=1,  # To keep only the best model
    learning_rate= 5e-05
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
import logging
logging.disable(logging.WARNING)
trainer.train()

  1%|          | 500/81140 [11:02<30:11:08,  1.35s/it]

{'loss': 0.0939, 'grad_norm': 0.5681623816490173, 'learning_rate': 5e-05, 'epoch': 0.02}


                                                      
  1%|          | 500/81140 [49:15<30:11:08,  1.35s/it]

{'eval_loss': 0.05377301946282387, 'eval_runtime': 2293.0331, 'eval_samples_per_second': 17.692, 'eval_steps_per_second': 2.212, 'epoch': 0.02}


  1%|          | 1000/81140 [56:18<18:56:10,  1.18it/s]   

{'loss': 0.0527, 'grad_norm': 0.2634539306163788, 'learning_rate': 4.968998015873016e-05, 'epoch': 0.05}


                                                       
  1%|          | 1000/81140 [1:18:19<18:56:10,  1.18it/s]

{'eval_loss': 0.049038052558898926, 'eval_runtime': 1320.6064, 'eval_samples_per_second': 30.72, 'eval_steps_per_second': 3.841, 'epoch': 0.05}


  2%|▏         | 1500/81140 [1:25:22<18:36:36,  1.19it/s]   

{'loss': 0.0493, 'grad_norm': 0.47434914112091064, 'learning_rate': 4.937996031746032e-05, 'epoch': 0.07}


                                                         
  2%|▏         | 1500/81140 [1:47:25<18:36:36,  1.19it/s]

{'eval_loss': 0.04514707997441292, 'eval_runtime': 1323.4653, 'eval_samples_per_second': 30.654, 'eval_steps_per_second': 3.832, 'epoch': 0.07}


  2%|▏         | 2000/81140 [1:54:27<18:19:47,  1.20it/s]   

{'loss': 0.0416, 'grad_norm': 0.41527578234672546, 'learning_rate': 4.9069940476190476e-05, 'epoch': 0.1}


                                                         
  2%|▏         | 2000/81140 [2:16:10<18:19:47,  1.20it/s]

{'eval_loss': 0.041868217289447784, 'eval_runtime': 1303.3821, 'eval_samples_per_second': 31.126, 'eval_steps_per_second': 3.891, 'epoch': 0.1}


  3%|▎         | 2500/81140 [2:23:11<18:14:56,  1.20it/s]   

{'loss': 0.0418, 'grad_norm': 0.34335920214653015, 'learning_rate': 4.875992063492064e-05, 'epoch': 0.12}


                                                         
  3%|▎         | 2500/81140 [2:44:54<18:14:56,  1.20it/s]

{'eval_loss': 0.03882870450615883, 'eval_runtime': 1303.3097, 'eval_samples_per_second': 31.128, 'eval_steps_per_second': 3.892, 'epoch': 0.12}


  4%|▎         | 3000/81140 [2:51:54<18:14:20,  1.19it/s]   

{'loss': 0.0414, 'grad_norm': 0.2850511372089386, 'learning_rate': 4.84499007936508e-05, 'epoch': 0.15}


                                                         
  4%|▎         | 3000/81140 [3:13:37<18:14:20,  1.19it/s]

{'eval_loss': 0.04144946113228798, 'eval_runtime': 1303.1376, 'eval_samples_per_second': 31.132, 'eval_steps_per_second': 3.892, 'epoch': 0.15}


  4%|▍         | 3500/81140 [3:20:38<18:11:54,  1.19it/s]   

{'loss': 0.0396, 'grad_norm': 0.36550506949424744, 'learning_rate': 4.813988095238096e-05, 'epoch': 0.17}


                                                         
  4%|▍         | 3500/81140 [3:42:19<18:11:54,  1.19it/s]

{'eval_loss': 0.04070288687944412, 'eval_runtime': 1301.334, 'eval_samples_per_second': 31.175, 'eval_steps_per_second': 3.898, 'epoch': 0.17}


  4%|▍         | 3500/81140 [3:42:20<82:12:13,  3.81s/it]

{'train_runtime': 13340.6505, 'train_samples_per_second': 48.656, 'train_steps_per_second': 6.082, 'train_loss': 0.051476426805768694, 'epoch': 0.17}





TrainOutput(global_step=3500, training_loss=0.051476426805768694, metrics={'train_runtime': 13340.6505, 'train_samples_per_second': 48.656, 'train_steps_per_second': 6.082, 'total_flos': 7316309188608000.0, 'train_loss': 0.051476426805768694, 'epoch': 0.1725412866650234})

In [14]:
torch.cuda.empty_cache()

In [15]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from transformers import EvalPrediction

# Custom evaluation function to generate classification report
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    # Flatten the predictions and labels
    preds_flat = preds.flatten()
    labels_flat = labels.flatten()

    # Filter out the labels where the label is -100 (ignore index)
    mask = labels_flat != -100
    preds_flat = preds_flat[mask]
    labels_flat = labels_flat[mask]

    # Generate classification report
    report = classification_report(labels_flat, preds_flat, target_names=['O', 'B-SNIPPET'], digits=4)
    accuracy = accuracy_score(labels_flat, preds_flat)

    print("\nClassification Report:\n", report)
    print(f"Accuracy: {accuracy:.4f}")

    # Safely extract metrics from the report
    report_lines = report.split('\n')
    f1_scores = []
    precision_scores = []
    recall_scores = []
    
    # Extract metrics from each class, skipping the header and footer lines
    for line in report_lines[2:-3]:
        parts = line.split()
        if len(parts) >= 4:  # Ensure that the line has enough parts
            recall_scores.append(float(parts[-4]))
            precision_scores.append(float(parts[-3]))
            f1_scores.append(float(parts[-2]))

    return {
        "accuracy": accuracy,
        "f1": np.mean(f1_scores) if f1_scores else 0.0,
        "precision": np.mean(precision_scores) if precision_scores else 0.0,
        "recall": np.mean(recall_scores) if recall_scores else 0.0
    }

# Trainer with custom metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Adding the custom compute_metrics function
)

# Evaluate the model on validation dataset
trainer.evaluate()


100%|██████████| 5072/5072 [22:14<00:00,  3.80it/s]


Classification Report:
               precision    recall  f1-score   support

           O     0.9895    0.9985    0.9940  20479523
   B-SNIPPET     0.7091    0.2548    0.3749    291805

    accuracy                         0.9881  20771328
   macro avg     0.8493    0.6266    0.6844  20771328
weighted avg     0.9855    0.9881    0.9853  20771328

Accuracy: 0.9881





{'eval_loss': 0.03882870450615883,
 'eval_model_preparation_time': 0.0028,
 'eval_accuracy': 0.9880620536154453,
 'eval_f1': 0.68445,
 'eval_precision': 0.62665,
 'eval_recall': 0.8492999999999999,
 'eval_runtime': 1334.6682,
 'eval_samples_per_second': 30.396,
 'eval_steps_per_second': 3.8}

In [16]:

# Save the model
model.save_pretrained('./spans_based_bert_model')
tokenizer.save_pretrained('./spans_based_bert_model')


('./spans_based_bert_model/tokenizer_config.json',
 './spans_based_bert_model/special_tokens_map.json',
 './spans_based_bert_model/vocab.txt',
 './spans_based_bert_model/added_tokens.json')

In [17]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

import re
# Load the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained('./spans_based_bert_model')
tokenizer = BertTokenizer.from_pretrained('./spans_based_bert_model')

def remove_specific_characters(strings_list):
    # Define the characters to be removed
    characters_to_remove = {
    '\x8d', '\x8b', '\x8c', '\x8f', '\x87', '\x8e', '\x81',
    '\x8a', '\x83', '\x94', '\x95', '\x97', '\x91', '\x89',
    '\x80', '\x99', '\x9e', '\xad', '\x9d', '\x98', '\x93',
    '\x82', '\x9c', '\x9f'"®", "´", "¿", "¥",
        "\u00c3", "\u00a2", "\u00c2", "\u0080", "\u00c2", "\u0099"
    }

    cleaned_strings_list = []

    for string in strings_list:
        cleaned_string = ''.join(char for char in string if char not in characters_to_remove)
        cleaned_strings_list.append(cleaned_string)

    return cleaned_strings_list

def remove_double_spaces(strings):
    pattern = re.compile(r'\s{2,}')  # Regex to match two or more spaces
    return [pattern.sub(' ', text) for text in strings]

def remove_multiple_punctuation(strings):
    # Create patterns to find multiple occurrences of ., !, and ,
    patterns = {
        r'\.{2,}': '.',
        r'\!{2,}': '!',
        r'\,{2,}': ','
    }

    # Process each string in the list
    cleaned_strings = []
    for text in strings:
        for pattern, replacement in patterns.items():
            text = re.sub(pattern, replacement, text)
        cleaned_strings.append(text)

    return cleaned_strings



def predict_snippet(review, aspect, model, tokenizer, max_len=256):
    model.eval()

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        review,
        aspect,
        add_special_tokens=False,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=2).flatten().tolist()
    new_predictions=predictions.copy()
    # Decode the tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten().tolist())
    # print(tokens)
    snippets=[]
    snippet=[]
    i = 0
    for token, label in zip(tokens, predictions):
        if label == 1:
            new_predictions[i] = 1
            snippet.append(token)
        elif label == 0 and i > 0 and i + 1 < len(tokens) and predictions[i - 1] == 1 and predictions[i + 1] == 1:
            new_predictions[i] = 1
            snippet.append(token)
        elif len(snippet):
            snippets.append(' '.join(snippet))
            snippet = []
        i += 1

    for i in range(1, len(new_predictions) - 2):
        # Check for the pattern 1,0,0,1
        if new_predictions[i] == 0 and new_predictions[i+1] == 0 and new_predictions[i-1] == 1 and new_predictions[i+2] == 1:
            new_predictions[i] = 1
            new_predictions[i+1] = 1

    # print(snippets)
    return snippets


In [18]:
import re


def clean_text(text):
    original_review= remove_double_spaces([text])
    original_review= remove_multiple_punctuation(original_review)
    original_review = remove_specific_characters(original_review)[0]
    text = original_review.lower()
    text = re.sub(r'\n+', ' ', text)  # Replace newlines with a space
    text = re.sub(r'\.\.+', '.', text)  # Replace multiple periods with a single period
    text=text.replace(',','')
    text=text.replace('.','')
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
    return text

def fix_special_characters(snippet):
    snippet=snippet.replace("[UNK]",'')
    snippet=snippet.replace(" ##",'')
    snippet=snippet.replace(" '","'")
    snippet=snippet.replace(" ’","’")
    snippet=snippet.replace("’ ","’")
    snippet=snippet.replace("' ","'")
    snippet=snippet.replace(" -","-")
    snippet=snippet.replace("- ","-")
    snippet=snippet.replace("/ ","/")
    snippet=snippet.replace(" /","/")
    snippet=snippet.replace(" :",":")
    snippet=snippet.replace(": ",":")
    return snippet

aspects=['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]
original_review="""The story was amazing but the cinematography wasn't it."""

#     print(aspect, new_snippets)
original_review=clean_text(original_review)

print("original review: ",original_review)

for aspect in aspects:
    predicted_snippets = predict_snippet(original_review, aspect, model, tokenizer)

    new_snippets = []
    for snippet in predicted_snippets:
        new_snippets.append(fix_special_characters(snippet))

    print(aspect, new_snippets, end='\n')
    print("\n-------------------")

original review:  the story was amazing but the cinematography wasn't it
Cinematography []

-------------------
Direction []

-------------------
Story []

-------------------
Characters []

-------------------
Production Design []

-------------------
Unique Concept []

-------------------
Emotions []

-------------------


In [19]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli")

# Example snippet
snippet = "the cinematography wasn't great"
candidate_labels = ["cinematography positive", "cinematography negative"]

# Classify the sentiment
sentiment_result = classifier(snippet, candidate_labels)
print(sentiment_result)

{'sequence': "the cinematography wasn't great", 'labels': ['cinematography negative', 'cinematography positive'], 'scores': [0.9993323683738708, 0.0006676383200101554]}
