In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from tqdm import tqdm
import re
# The fine-tuned token-classification weights and the matching tokenizer
# are both read from the same local directory.
MODEL_DIR = './model'
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForTokenClassification.from_pretrained(MODEL_DIR)

def remove_specific_characters(strings_list):
    """Strip a fixed set of unwanted characters from every string in a list.

    Args:
        strings_list: iterable of strings to clean.

    Returns:
        A new list with one cleaned string per input string, in the same
        order. Characters not in the removal set are kept untouched.
    """
    # Every entry must be a single character: membership is tested per
    # character below, so a multi-character entry could never match.
    # (The original was missing a comma between '\x9f' and "®", which fused
    # them into one two-character string and left both characters in place.)
    characters_to_remove = {
        '\x8d', '\x8b', '\x8c', '\x8f', '\x87', '\x8e', '\x81',
        '\x8a', '\x83', '\x94', '\x95', '\x97', '\x91', '\x89',
        '\x80', '\x99', '\x9e', '\xad', '\x9d', '\x98', '\x93',
        '\x82', '\x9c', '\x9f',
        "®", "´", "¿", "¥",
        "\u00c3", "\u00a2", "\u00c2",
    }

    return [
        ''.join(char for char in string if char not in characters_to_remove)
        for string in strings_list
    ]

def remove_double_spaces(strings):
    """Collapse each run of two or more whitespace characters into one space.

    The pattern uses ``\\s``, so tabs and newlines count as whitespace too,
    not only the space character. Returns a new list in the input order.
    """
    collapsed = []
    for text in strings:
        collapsed.append(re.sub(r'\s{2,}', ' ', text))
    return collapsed

def remove_multiple_punctuation(strings):
    """Squeeze repeated '.', '!' and ',' down to a single occurrence.

    Each rule only touches runs of two or more of the *same* mark; single
    marks and other punctuation are left alone. Returns a new list with one
    cleaned string per input string.
    """
    # (regex, replacement) pairs, applied in this order to every string.
    rules = (
        (re.compile(r'\.{2,}'), '.'),
        (re.compile(r'\!{2,}'), '!'),
        (re.compile(r'\,{2,}'), ','),
    )

    def _squeeze(text):
        for regex, single in rules:
            text = regex.sub(single, text)
        return text

    return [_squeeze(text) for text in strings]

def fix_special_characters(snippet):
    """Tidy a space-joined token string back into readable text.

    Drops "[UNK]" markers, glues " ##" continuation pieces onto the previous
    piece, and removes the spaces around apostrophes, hyphens, slashes and
    colons. The substitutions run strictly in the order listed below.
    """
    substitutions = (
        ("[UNK]", ''),
        (" ##", ''),
        (" '", "'"),
        (" ’", "’"),
        ("’ ", "’"),
        ("' ", "'"),
        (" -", "-"),
        ("- ", "-"),
        ("/ ", "/"),
        (" /", "/"),
        (" :", ":"),
        (": ", ":"),
    )
    for needle, replacement in substitutions:
        snippet = snippet.replace(needle, replacement)
    return snippet


def predict_snippet(review, aspect, model, tokenizer, max_len=512):
    """Extract the spans of `review` that the model tags for `aspect`.

    The review and aspect are encoded as a text pair, the model labels every
    token position, small gaps between tagged positions are filled in, and
    each resulting run of label-1 tokens is joined and cleaned into a string.

    Args:
        review: review text (first sequence of the pair).
        aspect: aspect name (second sequence of the pair).
        model: token-classification model exposing `.logits` on its output.
        tokenizer: tokenizer providing `encode_plus` and
            `convert_ids_to_tokens`.
        max_len: length the encoded pair is padded / truncated to.

    Returns:
        (snippets, new_predictions): the list of extracted snippet strings
        and the per-position label list after gap filling.

    NOTE(review): padded positions are not filtered out, so a padded position
    predicted as 1 would end up inside a snippet -- confirm whether intended.
    """
    # Use the GPU when one is present; otherwise run on CPU instead of
    # crashing on a hard-coded "cuda" device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        review,
        aspect,
        add_special_tokens=False,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first'
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=2).flatten().tolist()
    new_predictions = predictions.copy()
    # Decode the tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten().tolist())

    # Pass 1: fill a single 0 sandwiched between two 1s (pattern 1,0,1).
    # Neighbours are read from the raw predictions so fills do not cascade.
    for i in range(1, len(predictions) - 1):
        if predictions[i] == 0 and predictions[i - 1] == 1 and predictions[i + 1] == 1:
            new_predictions[i] = 1

    # Pass 2: fill a double gap (pattern 1,0,0,1), working on the already
    # gap-filled labels in place.
    for i in range(1, len(new_predictions) - 2):
        if (new_predictions[i] == 0 and new_predictions[i + 1] == 0
                and new_predictions[i - 1] == 1 and new_predictions[i + 2] == 1):
            new_predictions[i] = 1
            new_predictions[i + 1] = 1

    # Pass 3: turn every run of label-1 tokens into one cleaned snippet.
    snippets = []
    snippet = []
    for token, label in zip(tokens, new_predictions):
        if label == 1:
            snippet.append(token)
        elif snippet:
            snippets.append(fix_special_characters(' '.join(snippet)))
            snippet = []
    # Flush a run that reaches the very last position; without this the
    # final snippet would be silently discarded.
    if snippet:
        snippets.append(fix_special_characters(' '.join(snippet)))

    return snippets, new_predictions


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import json
# from tqdm import tqdm
# import pandas as pd
# from collections import defaultdict
# with open('new_spans_labeled.json','r') as file:
#     dataset=json.load(file)

In [3]:
# resultant={"Cinematography":[],"Direction":[],"Story":[],"Characters":[],"Production Design":[],"Unique Concept":[],"Emotions":[]}

# def contains_alpha(s):
#     return any(char.isalpha() for char in s)
# for data in dataset:
#     for key,value in data.items():
#         if key!='review':
#             if data[key]!=[]:
#                 for item in data[key]:
#                     if contains_alpha(item):
#                         resultant[key].append(data)
#                         break

# resultant['Cinematography']=resultant['Cinematography'][:5000]

# resultant['Direction']=resultant['Direction'][:5000]

# resultant['Story']=resultant['Story'][:5000]

# resultant['Characters']=resultant['Characters'][:5000]

# resultant['Production Design']=resultant['Production Design'][:5000]

# resultant['Unique Concept']=resultant['Unique Concept'][:5000]

# resultant['Emotions']=resultant['Emotions'][:5000]


In [4]:

# def labels_and_tokens(review, snippet,max_len=512):
#     tokens = tokenizer.tokenize(review)
#     token_ids = tokenizer.convert_tokens_to_ids(tokens)
#     labels = [0] * len(token_ids)
#     for snp in snippet:
#         snippet_tokens = tokenizer.tokenize(snp)
#         snippet_token_ids = tokenizer.convert_tokens_to_ids(snippet_tokens)
 
 
#         for i in range(len(token_ids)):
#             if token_ids[i:i+len(snippet_token_ids)] == snippet_token_ids:
#                 labels[i:i+len(snippet_token_ids)] = [1] * len(snippet_token_ids)
#                 break  # Assuming one occurrence of snippet in review
       
#         # Pad labels
#         if len(labels) < max_len:
#             labels = labels + [0] * (max_len - len(labels))
#         else:
#             labels = labels[:max_len]
 
#     return labels,len(tokens)
 

In [5]:
# test_set=[]
# for result in resultant:
#     for data in tqdm(resultant[result]):
#         for key,value in data.items():
#             if key!="review":
#                 labels,length_tokens=labels_and_tokens(data['review'],data[key])
#                 if data[key]==[]:
#                     test_set.append([data['review'],key,[],labels,0])
#                 else:
#                     test_set.append([data['review'],key,data[key],labels,length_tokens])

In [11]:
# columns=['review','aspect','snippets','labels','length']
# test_set_save=pd.DataFrame(test_set,columns=columns)
# test_set_save.to_csv('test_file.csv')

In [2]:
import pandas as pd

# Read the labelled spans, then shuffle all rows with a fixed seed so the
# row order is the same on every run.
dataset = (
    pd.read_json('new_spans_labeled.json')
    .sample(frac=1, random_state=40)
)

In [3]:
def jaccard_similarity_char(sentence1, sentence2):
    """Case-insensitive Jaccard similarity over the *sets of characters*.

    Both inputs are lower-cased and reduced to their set of distinct
    characters; the result is |intersection| / |union|. Character order and
    repetition are ignored.

    Args:
        sentence1: first string.
        sentence2: second string.

    Returns:
        A float in [0, 1]. Two empty strings are treated as identical and
        score 1.0 (the usual convention for two empty sets) instead of
        raising ZeroDivisionError.
    """
    chars1 = set(sentence1.lower())
    chars2 = set(sentence2.lower())
    union = chars1 | chars2
    if not union:
        # Both inputs are empty: nothing differs between them.
        return 1.0
    return len(chars1 & chars2) / len(union)


In [6]:
import logging

# Suppress the warning
logging.disable(logging.WARNING)
# NOTE(review): unused in this cell; kept only in case another cell reads it.
cimematography=[]

aspects = ['Cinematography', 'Direction', 'Story', 'Characters', "Production Design", "Unique Concept", "Emotions"]
# The original `found > 500` check stopped after the 501st labelled review,
# so the cap is kept at 501 to leave the evaluated sample unchanged.
MAX_LABELLED_REVIEWS_PER_ASPECT = 501
# NOTE(review): never incremented anywhere in this cell, so it always prints 0.
not_found=0

for aspect in aspects:
    average_sim=[]
    # Count per aspect. Previously the counter was reset only when the cap
    # was hit, so an aspect with fewer labelled reviews leaked its count into
    # the next aspect and shrank that aspect's sample.
    found=0

    for review,data in zip(dataset['review'],dataset[aspect]):
        # Only reviews that carry at least one gold snippet for this aspect
        # are scored.
        if data!=[]:
            snippets,new_predictions = predict_snippet(review,aspect,model,tokenizer)

            # For each predicted snippet keep its best match against any
            # gold snippet (0 when nothing overlaps).
            similarities=[
                max((jaccard_similarity_char(label, snippet) for label in data), default=0)
                for snippet in snippets
            ]

            # A review with no predicted snippet contributes a score of 0.
            if similarities:
                average_sim.append(sum(similarities)/len(similarities))
            else:
                average_sim.append(0)

            found+=1
            if found>=MAX_LABELLED_REVIEWS_PER_ASPECT:
                break

    # Guard against an aspect with no labelled reviews, which would
    # otherwise divide by zero.
    if average_sim:
        print(aspect,sum(average_sim)/len(average_sim))
    else:
        print(aspect,"no labelled reviews evaluated")
print(not_found)

Cinematography 0.6117647892383771
Direction 0.5890670131153023
Story 0.7512071800118716
Characters 0.7238620121034917
Production Design 0.5069140887725108
Unique Concept 0.17493709018535902
Emotions 0.07063709478100669
0
