In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

import re

def remove_specific_characters(strings_list):
    """Remove a hard-coded set of unwanted characters from every string.

    The removed characters are mostly code points in the ``\\x80``-``\\x9f``
    range, the soft hyphen (``\\xad``) and a handful of Latin-1 symbols
    (presumably leftovers of mis-decoded text -- confirm against the data).

    Args:
        strings_list: Iterable of strings to clean.

    Returns:
        A new list holding one cleaned string per input string, in order.
    """
    # Each entry MUST be a single character and MUST be followed by a comma:
    # membership is tested character by character, and two adjacent string
    # literals without a comma are silently concatenated by Python, which
    # would yield one multi-character entry that can never match.
    characters_to_remove = frozenset({
        '\x8d', '\x8b', '\x8c', '\x8f', '\x87', '\x8e', '\x81',
        '\x8a', '\x83', '\x94', '\x95', '\x97', '\x91', '\x89',
        '\x80', '\x99', '\x9e', '\xad', '\x9d', '\x98', '\x93',
        '\x82', '\x9c', '\x9f', "®", "´", "¿", "¥",
        "\u00c3", "\u00a2", "\u00c2",
    })

    return [
        ''.join(char for char in string if char not in characters_to_remove)
        for string in strings_list
    ]

def remove_double_spaces(strings):
    """Collapse each run of two or more whitespace characters into one space.

    The pattern uses ``\\s``, so tabs and newlines count as whitespace too; a
    single whitespace character on its own is left untouched.

    Args:
        strings: Iterable of strings.

    Returns:
        A new list with the collapsed strings, in input order.
    """
    collapsed = []
    for item in strings:
        collapsed.append(re.sub(r'\s{2,}', ' ', item))
    return collapsed

def remove_multiple_punctuation(strings):
    """Collapse repeated ``.``, ``!`` and ``,`` into a single occurrence.

    The three substitutions are applied to every string in the order
    periods, exclamation marks, commas.

    Args:
        strings: Iterable of strings.

    Returns:
        A new list with the processed strings, in input order.
    """
    # (regex, replacement) pairs, applied top to bottom.
    rules = (
        (r'\.{2,}', '.'),
        (r'\!{2,}', '!'),
        (r'\,{2,}', ','),
    )

    def _collapse(text):
        for regex, single in rules:
            text = re.sub(regex, single, text)
        return text

    return [_collapse(text) for text in strings]



def _group_labeled_tokens(tokens, labels):
    """Join runs of tokens labelled 1 into space-separated snippet strings.

    A token belongs to a run when its label is 1, or when its label is 0 but
    both of its immediate neighbours are labelled 1 (a single-token gap is
    bridged). Any other token closes the run that is currently open.
    """
    snippets = []
    current = []
    last_index = len(tokens) - 1

    for i, (token, label) in enumerate(zip(tokens, labels)):
        bridges_gap = (
            label == 0
            and 0 < i < last_index
            and labels[i - 1] == 1
            and labels[i + 1] == 1
        )
        if label == 1 or bridges_gap:
            current.append(token)
        elif current:
            snippets.append(' '.join(current))
            current = []

    # A run that reaches the very last token has nothing after it to close it,
    # so it has to be flushed here or it would be lost.
    if current:
        snippets.append(' '.join(current))

    return snippets


def predict_snippet(review, aspect, model, tokenizer, max_len=256):
    """Extract the spans of ``review`` that the model tags for ``aspect``.

    ``review`` and ``aspect`` are encoded together as a text pair (without
    special tokens, padded/truncated to ``max_len``), the token-classification
    model is run once, and consecutive tokens whose arg-max label is 1 are
    joined with spaces into snippets. A single 0-labelled token sandwiched
    between two 1-labelled tokens is kept as part of the snippet.

    Args:
        review: Review text.
        aspect: Aspect name passed as the second segment of the pair.
        model: Token-classification model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with ``.logits``.
        tokenizer: Tokenizer providing ``encode_plus`` and
            ``convert_ids_to_tokens``.
        max_len: Length the encoded pair is padded/truncated to.

    Returns:
        List of snippet strings (raw tokens joined by single spaces), in order
        of appearance. Empty when no token is labelled 1.
    """
    model.eval()

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        review,
        aspect,
        add_special_tokens=False,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Keep the inputs on the same device as the model when the model exposes
    # one; otherwise leave them where the tokenizer created them.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # Make predictions
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    predictions = torch.argmax(logits, dim=2).flatten().tolist()

    # Decode the tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten().tolist())

    # NOTE(review): padding positions are not filtered out with the attention
    # mask, so a padding token labelled 1 would end up in a snippet -- confirm
    # whether that can happen with the trained model.
    return _group_labeled_tokens(tokens, predictions)

def clean_text(text):
    """Normalise a raw review string before it is fed to the tokenizer.

    Steps, in order: collapse whitespace runs, collapse repeated ``.``/``!``/``,``,
    drop the unwanted characters, lower-case, turn newlines into spaces,
    remove every comma and period, then squeeze whitespace and trim the ends.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lower-cased string.
    """
    # The list-based helpers are run on a one-element batch, in this order.
    batch = [text]
    for stage in (remove_double_spaces, remove_multiple_punctuation, remove_specific_characters):
        batch = stage(batch)

    lowered = batch[0].lower()
    lowered = re.sub(r'\n+', ' ', lowered)  # newlines become a single space
    lowered = re.sub(r'\.\.+', '.', lowered)  # runs of periods become one period
    without_marks = lowered.replace(',', '').replace('.', '')
    # Squeeze any whitespace run down to one space and trim both ends.
    return re.sub(r'\s+', ' ', without_marks).strip()

def fix_special_characters(snippet):
    """Tidy a space-joined token snippet into more natural text.

    Applies a fixed sequence of literal substitutions: drops ``[UNK]``,
    glues ``##`` word-piece continuations onto the preceding token, and
    removes the spaces around apostrophes, hyphens, slashes and colons.

    Args:
        snippet: Snippet string made of tokens joined by spaces.

    Returns:
        The snippet after all substitutions have been applied in order.
    """
    # (old, new) pairs; order matters because later rules see earlier results.
    substitutions = (
        ("[UNK]", ''),
        (" ##", ''),
        (" '", "'"),
        (" ’", "’"),
        ("’ ", "’"),
        ("' ", "'"),
        (" -", "-"),
        ("- ", "-"),
        ("/ ", "/"),
        (" /", "/"),
        (" :", ":"),
        (": ", ":"),
    )
    for old, new in substitutions:
        snippet = snippet.replace(old, new)
    return snippet


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Load models and tokenizer
aspect_model = BertForTokenClassification.from_pretrained('./spans_based_bert_model')
aspect_tokenizer = BertTokenizer.from_pretrained('./spans_based_bert_model')
sentiment_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli")

# Define aspects
aspects = ['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]

# Reviews list
reviews = ["The story was amazing but the cinematography wasn't it.", "Another insightful but poorly directed film."]

# Process reviews: for every (review, aspect) pair, extract the snippets the
# span model tags for that aspect and score each snippet's sentiment on 0-10.
for review in reviews:
    cleaned_review = clean_text(review)
    print("Original Review:", review)
    print("Cleaned Review:", cleaned_review)

    for aspect in aspects:
        snippets = predict_snippet(cleaned_review, aspect, aspect_model, aspect_tokenizer)
        snippets = [fix_special_characters(snip) for snip in snippets]

        positive_label = f"{aspect} positive"
        negative_label = f"{aspect} negative"

        for snippet in snippets:
            # Clean-up can leave nothing behind (e.g. a snippet that was only
            # "[UNK]"); the classifier cannot score an empty sequence.
            if not snippet.strip():
                continue

            sentiment_result = sentiment_classifier(snippet, [positive_label, negative_label])

            # The pipeline returns labels sorted by descending score, so the
            # scores must be looked up by label rather than by position.
            score_by_label = dict(zip(sentiment_result['labels'], sentiment_result['scores']))
            positive_score = score_by_label[positive_label]
            negative_score = score_by_label[negative_label]
            scaled_score = ((positive_score - negative_score + 1) / 2) * 10  # Scale to 0-10

            print(f"Aspect: {aspect}")
            print(f"Snippet: {snippet}")
            print(f"Sentiment Score: {scaled_score:.2f}\n")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Original Review: The story was amazing but the cinematography wasn't it.
Cleaned Review: the story was amazing but the cinematography wasn't it
Original Review: Another insightful but poorly directed film.
Cleaned Review: another insightful but poorly directed film
