<a href="https://colab.research.google.com/github/madhurima5978/new/blob/main/bert_base_uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import xml.etree.ElementTree as ET

# Assuming you have a class to load your XML dataset into a suitable format
class ABSADataset(Dataset):
    """Token-classification dataset built from an ABSA XML file.

    Each ``<sentence>`` element contributes one example: its ``<text>`` is
    tokenized to ``max_seq_length`` positions and every token that overlaps an
    ``<aspectTerm>`` span receives that term's polarity class id. All other
    positions (special tokens, padding, tokens outside any aspect) keep id 0.

    The ``from``/``to`` attributes are treated as character offsets into the
    sentence text with an exclusive end (the SemEval-2014 Task 4 convention --
    confirm for other corpora). They are mapped onto tokens through the
    tokenizer's offset mapping, so a "fast" tokenizer is required.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.
        tokenizer: Callable tokenizer supporting ``return_offsets_mapping``.
        max_seq_length: Fixed length every example is padded/truncated to.
    """

    # Polarity string -> class id; id 0 is the default for everything else.
    LABEL_MAPPING = {"positive": 1, "negative": 2, "neutral": 3}

    def __init__(self, xml_file, tokenizer, max_seq_length=128):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.data = self._load_data(xml_file)

    def _load_data(self, xml_file):
        """Parse the XML file into a list of ``{"text", "aspect_terms"}`` dicts."""
        root = ET.parse(xml_file).getroot()

        data = []
        for review in root.findall(".//sentence"):
            # An empty <text/> element yields None; fall back to "" so the
            # tokenizer still receives a string.
            text = review.findtext("text") or ""
            aspect_terms = [
                {
                    "term": aspect_term.get("term"),
                    "polarity": aspect_term.get("polarity"),
                    "from": int(aspect_term.get("from")),
                    "to": int(aspect_term.get("to")),
                }
                for aspect_term in review.findall(".//aspectTerm")
            ]
            data.append({"text": text, "aspect_terms": aspect_terms})

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"input_ids", "labels"}`` tensors of length ``max_seq_length``."""
        item = self.data[index]

        encoding = self.tokenizer(
            item["text"],
            return_tensors="pt",
            truncation=True,
            max_length=self.max_seq_length,
            padding="max_length",
            return_offsets_mapping=True,
        )
        # One (char_start, char_end) pair per token position.
        offsets = encoding["offset_mapping"].squeeze(0).tolist()[: self.max_seq_length]

        labels = torch.zeros(self.max_seq_length, dtype=torch.long)
        for aspect_term in item["aspect_terms"]:
            label_id = self.map_label_to_int(aspect_term["polarity"])
            span_start = aspect_term["from"]
            span_end = aspect_term["to"]
            for position, (token_start, token_end) in enumerate(offsets):
                # Special and padding tokens carry an empty (start == end) span.
                if token_start == token_end:
                    continue
                # Label the token when its characters overlap the aspect span.
                if token_start < span_end and token_end > span_start:
                    labels[position] = label_id

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "labels": labels
        }

    def map_label_to_int(self, label):
        """Map a polarity string to its class id; unknown polarities map to 0."""
        return self.LABEL_MAPPING.get(label, 0)


def collate_fn(batch):
    """Assemble a list of dataset items into one padded batch.

    Items are ordered longest-first by ``input_ids`` length, then the
    ``input_ids`` and ``labels`` tensors are stacked with right-padding
    (pad token id for inputs, 0 for labels). The attention mask is 1 wherever
    the padded input differs from the pad token id. Reads the notebook-level
    ``tokenizer`` for the pad token id.
    """
    pad_id = tokenizer.pad_token_id

    # Longest sequence first
    ordered = sorted(batch, key=lambda sample: len(sample["input_ids"]), reverse=True)

    token_rows = []
    label_rows = []
    for sample in ordered:
        token_rows.append(sample["input_ids"])
        label_rows.append(sample["labels"])

    # Right-pad every row to the longest row in the batch
    padded_tokens = pad_sequence(token_rows, batch_first=True, padding_value=pad_id)
    padded_labels = pad_sequence(label_rows, batch_first=True, padding_value=0)

    collated = {"input_ids": padded_tokens, "labels": padded_labels}
    collated["attention_mask"] = (padded_tokens != pad_id).int()
    return collated

# Build the training dataset from the XML file using the bert-base-uncased tokenizer.
# `tokenizer` is also read as a global by collate_fn.
xml_file = "Laptop_Train_v2.xml"  # Replace with your actual file path
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset = ABSADataset(xml_file, tokenizer)

# Load the pre-trained model with a 4-way token-classification head
model_name = "bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = 4  # ids 1-3 are positive/negative/neutral; id 0 is what ABSADataset assigns everywhere else
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)

# Batches of 8, reshuffled every epoch and assembled by collate_fn
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Specify training parameters, optimizer, and loss function
num_epochs = 5
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE(review): `criterion` is never used below; the loop takes the loss from `outputs.loss`.
criterion = torch.nn.CrossEntropyLoss()

# Training loop: run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of per-batch losses for this epoch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass; passing `labels` makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

# Save the fine-tuned model and its tokenizer to the same directory so later
# cells can reload both from "fine_tuned_model"
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 381/381 [01:17<00:00,  4.94it/s]


Epoch 1, Average Loss: 0.23168213573330892


Epoch 2: 100%|██████████| 381/381 [01:17<00:00,  4.92it/s]


Epoch 2, Average Loss: 0.16762905571245518


Epoch 3: 100%|██████████| 381/381 [01:16<00:00,  4.96it/s]


Epoch 3, Average Loss: 0.1299502490860779


Epoch 4: 100%|██████████| 381/381 [01:16<00:00,  4.97it/s]


Epoch 4, Average Loss: 0.10687568473288657


Epoch 5: 100%|██████████| 381/381 [01:16<00:00,  4.96it/s]


Epoch 5, Average Loss: 0.09045776272231708


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Display the Hugging Face login widget in the notebook.
# NOTE(review): none of the visible cells upload anything afterwards -- confirm this login is still needed.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
from tqdm import tqdm
import xml.etree.ElementTree as ET


# Reload the model and tokenizer from the "fine_tuned_model" directory written
# by the training cell; this rebinds the notebook-level `model` and `tokenizer`.
model = AutoModelForTokenClassification.from_pretrained("fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model")


In [None]:
class ValidationDataset(Dataset):
    """Validation counterpart of the training dataset for ABSA XML files.

    Each ``<sentence>`` becomes one example: the text is tokenized to a fixed
    length and every token overlapping an ``<aspectTerm>`` span is labelled
    with that term's polarity id (positive=1, negative=2, neutral=3). All
    remaining positions keep id 0.

    ``from``/``to`` are treated as character offsets into the sentence text
    with an exclusive end (the SemEval-2014 Task 4 convention -- confirm for
    other corpora) and are projected onto tokens with the tokenizer's offset
    mapping, which requires a "fast" tokenizer.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.
        tokenizer: Callable tokenizer supporting ``return_offsets_mapping``.
        max_seq_length: Fixed example length; defaults to the previously
            hard-coded 128.
    """

    def __init__(self, xml_file, tokenizer, max_seq_length=128):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.data = self._load_data(xml_file)

    def _load_data(self, xml_file):
        """Parse the XML file into a list of ``{"text", "aspect_terms"}`` dicts."""
        root = ET.parse(xml_file).getroot()

        data = []
        for sentence in root.findall(".//sentence"):
            # An empty <text/> element yields None; use "" so tokenization works.
            text = sentence.findtext("text") or ""
            aspect_terms = []
            for aspect_term in sentence.findall(".//aspectTerm"):
                aspect_terms.append({
                    "term": aspect_term.get("term"),
                    "polarity": aspect_term.get("polarity"),
                    "from": int(aspect_term.get("from")),
                    "to": int(aspect_term.get("to"))
                })

            data.append({
                "text": text,
                "aspect_terms": aspect_terms
            })

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"input_ids", "labels"}`` tensors of length ``max_seq_length``."""
        item = self.data[index]

        inputs = self.tokenizer(
            item["text"],
            return_tensors="pt",
            truncation=True,
            max_length=self.max_seq_length,
            padding="max_length",
            return_offsets_mapping=True,
        )
        # One (char_start, char_end) pair per token position.
        offsets = inputs["offset_mapping"].squeeze(0).tolist()[: self.max_seq_length]

        labels = torch.zeros(self.max_seq_length, dtype=torch.long)
        for aspect_term in item["aspect_terms"]:
            label_id = self.map_label_to_int(aspect_term["polarity"])
            for position, (token_start, token_end) in enumerate(offsets):
                # Special and padding tokens have an empty character span.
                if token_start == token_end:
                    continue
                # Character-level overlap between the token and the aspect span.
                if token_start < aspect_term["to"] and token_end > aspect_term["from"]:
                    labels[position] = label_id

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "labels": labels
        }

    def map_label_to_int(self, label):
        """Map a polarity string to its class id; unknown polarities map to 0."""
        label_mapping = {"positive": 1, "negative": 2, "neutral": 3}
        return label_mapping.get(label, 0)

# Define your collate function
def collate_fn(batch):
    """Turn a list of dataset items into padded batch tensors.

    Items are sorted longest-first by ``input_ids`` length; inputs are
    right-padded with the pad token id of the notebook-level ``tokenizer`` and
    labels with 0. The attention mask marks every position whose input id is
    not the pad token id.
    """
    pad_id = tokenizer.pad_token_id
    by_length = sorted(batch, key=lambda entry: len(entry["input_ids"]), reverse=True)

    padded_inputs = pad_sequence(
        [entry["input_ids"] for entry in by_length],
        batch_first=True,
        padding_value=pad_id,
    )
    padded_labels = pad_sequence(
        [entry["labels"] for entry in by_length],
        batch_first=True,
        padding_value=0,
    )
    mask = (padded_inputs != pad_id).int()

    return dict(input_ids=padded_inputs, labels=padded_labels, attention_mask=mask)

# Load the fine-tuned model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model")

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create the validation dataset and data loader
validation_dataset = ValidationDataset("laptops-trial.xml", tokenizer)
validation_loader = DataLoader(validation_dataset, batch_size=8, collate_fn=collate_fn)

# Validation loop: token-level accuracy over non-padding positions only.
# Counting padded positions (all labelled 0 and trivially predicted as 0)
# would inflate the score.
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in tqdm(validation_loader, desc="Validation"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass; only the logits are needed, so no labels are passed
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Keep the batch dimension (no squeeze) so shapes always match `labels`
        predicted_labels = torch.argmax(logits, dim=-1)

        # Count correct predictions on attended (non-padding) tokens
        real_tokens = attention_mask.bool()
        total_correct += ((predicted_labels == labels) & real_tokens).sum().item()
        total_samples += real_tokens.sum().item()

# Calculate accuracy (guard against an empty validation set)
accuracy = total_correct / total_samples if total_samples else 0.0
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation: 100%|██████████| 13/13 [00:00<00:00, 15.87it/s]

Validation Accuracy: 97.80%





In [None]:
# Sanity check: show how many output classes the reloaded model's config declares
print(model.config.num_labels)


4


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

import torch  # this cell calls torch directly; import it here so the cell runs on its own

# Reload the fine-tuned model and tokenizer and encode one example sentence
model = AutoModelForTokenClassification.from_pretrained("fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model")
input_sentence = "I don't like the laptop design but its battery life is good"
inputs = tokenizer(input_sentence, return_tensors="pt")
print(inputs)

with torch.no_grad():
    outputs = model(**inputs)

print(outputs.logits)

# Class ids produced by the model -> polarity names (id 0 means "no aspect")
label_mapping = {1: "positive", 2: "negative", 3: "neutral"}

# One class id per encoded position, special tokens included
predicted_labels = torch.argmax(outputs.logits, dim=2).squeeze(0).tolist()
# These are class ids, not vocabulary ids, so they are mapped with
# label_mapping rather than decoded by the tokenizer.
predicted_labels_decoded = [label_mapping.get(label_id, "O") for label_id in predicted_labels]

print(predicted_labels_decoded)

# Tokens taken from the very ids fed to the model, so they line up one-to-one
# with predicted_labels (tokenizer.tokenize() would omit the special tokens).
encoded_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
special_tokens = set(tokenizer.all_special_tokens)

# Extract aspects and sentiments
aspects = []
sentiments = []

current_aspect = None
current_sentiment = None
for token, label_id in zip(encoded_tokens, predicted_labels):
    label = None if token in special_tokens else label_mapping.get(label_id)

    if label is None:
        # An unlabelled token ends the running aspect, so separate spans that
        # share a sentiment are not merged together.
        if current_aspect is not None:
            aspects.append(current_aspect)
            sentiments.append(current_sentiment)
            current_aspect = None
            current_sentiment = None
        continue
    if current_aspect is None:
        current_aspect = token
        current_sentiment = label
    elif label == current_sentiment:
        current_aspect += f" {token}"
    else:
        aspects.append(current_aspect)
        sentiments.append(current_sentiment)
        current_aspect = token
        current_sentiment = label

if current_aspect is not None:
    aspects.append(current_aspect)
    sentiments.append(current_sentiment)

# Print or use the extracted aspects and sentiments
for aspect, sentiment in zip(aspects, sentiments):
    print(f"Aspect: {aspect}\nSentiment: {sentiment}")


{'input_ids': tensor([[  101,  1045,  2123,  1005,  1056,  2066,  1996, 12191,  2640,  2021,
          2049,  6046,  2166,  2003,  2204,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[[ 7.7998, -2.7625, -2.7599, -2.4968],
         [ 7.6707, -2.8084, -2.8279, -2.3984],
         [ 7.7099, -2.8112, -2.8755, -2.5009],
         [ 7.7191, -2.8092, -2.8058, -2.5985],
         [ 7.7378, -2.8216, -2.8980, -2.5429],
         [ 7.5982, -2.7552, -2.8988, -2.5706],
         [ 7.5434, -2.6632, -2.5479, -2.6122],
         [ 7.7524, -2.7728, -2.6132, -2.5914],
         [ 7.3784, -2.3706, -2.6503, -2.3804],
         [ 7.5770, -2.5950, -2.5655, -2.6020],
         [ 7.0114, -1.9806, -2.5963, -2.4753],
         [ 6.4649, -1.4167, -2.7726, -2.3893],
         [ 5.7148, -0.8626, -2.8013, -2.4827],
         [ 6.3804, -1.6774, -2.6779, -2.5248],
         [ 6.0397, -1.2942, -2.7517, -2

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the fine-tuned model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model")
# Replace 'Your new input sentence here.' with your actual input sentence
input_sentence = 'The laptops performance is good but battery life is bad'

# Tokenize and encode the input sentence
inputs = tokenizer(input_sentence, return_tensors="pt")
# Perform inference using the model
with torch.no_grad():
    outputs = model(**inputs)

# One predicted class id per encoded position (special tokens included)
logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=2).squeeze(0).tolist()

# Map class ids to names. They are not vocabulary ids, so passing them to
# tokenizer.batch_decode would turn every 0 into the "[PAD]" vocabulary entry.
id_to_label = {0: "O", 1: "positive", 2: "negative", 3: "neutral"}
predicted_labels_decoded = [id_to_label.get(label_id, "O") for label_id in predicted_labels]

# Tokens taken from the ids fed to the model, so they align one-to-one with
# the predictions (tokenizer.tokenize() would omit the special tokens).
encoded_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
special_tokens = set(tokenizer.all_special_tokens)

# Extract aspects and sentiments from the decoded labels
aspects = []
sentiments = []

for token, label in zip(encoded_tokens, predicted_labels_decoded):
    # "O" marks tokens outside aspect terms; special tokens are never aspects
    if token in special_tokens or label == "O":
        continue
    aspects.append(token)
    sentiments.append(label)

# Print or use the extracted aspects and sentiments
print("Aspects:", aspects)
print("Sentiments:", sentiments)


Aspects: ['the', 'laptop', '##s', 'performance', 'is', 'good', 'but', 'battery', 'life', 'is', 'bad']
Sentiments: ['[pad]', '[pad]', '[pad]', '[pad]', '[pad]', '[pad]', '[pad]', '[pad]', '[pad]', '[pad]', '[pad]']


In [None]:
# Class ids -> polarity names; any other id (including 0) means "no aspect"
label_mapping = {1: "positive", 2: "negative", 3: "neutral"}

# Initialize variables to store aspects and sentiments
current_aspect = None
current_sentiment = None
aspects = []
sentiments = []

# `predicted_labels` comes from encoding `input_sentence` with special tokens,
# so re-encode the sentence the same way to get a token list of matching
# length (tokenizer.tokenize() alone would be shifted by the leading special token).
encoded_tokens = tokenizer.convert_ids_to_tokens(tokenizer(input_sentence)["input_ids"])
special_tokens = set(tokenizer.all_special_tokens)

# Iterate over tokens and labels
for token, label_id in zip(encoded_tokens, predicted_labels):
    label = None if token in special_tokens else label_mapping.get(label_id)

    # An unlabelled or special token closes the running aspect so that two
    # separate spans with the same sentiment are not merged into one.
    if label is None:
        if current_aspect is not None:
            aspects.append(current_aspect)
            sentiments.append(current_sentiment)
            current_aspect = None
            current_sentiment = None
        continue

    # Check if the current token is part of the same aspect term
    if current_aspect is None:
        current_aspect = token
        current_sentiment = label
    elif label == current_sentiment:
        current_aspect += f" {token}"
    else:
        # Save the current aspect and sentiment
        aspects.append(current_aspect)
        sentiments.append(current_sentiment)

        # Reset variables for the next aspect term
        current_aspect = token
        current_sentiment = label

# Check if there is an aspect term remaining
if current_aspect is not None:
    aspects.append(current_aspect)
    sentiments.append(current_sentiment)

# Print or use the extracted aspects and sentiments
for aspect, sentiment in zip(aspects, sentiments):
    print(f"{aspect}: {sentiment}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import xml.etree.ElementTree as ET
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Load the seq2seq checkpoint and its tokenizer by checkpoint name.
# This rebinds the notebook-level `tokenizer` and `model` that earlier cells
# used for the token-classification model.
tokenizer = AutoTokenizer.from_pretrained("kevinscaria/joint_tk-instruct-base-def-pos-neg-neut-combined")
model = AutoModelForSeq2SeqLM.from_pretrained("kevinscaria/joint_tk-instruct-base-def-pos-neg-neut-combined")

# Define a simple dataset class
class SentimentDataset(Dataset):
    """Pairs of raw text and label, served as ``{'text', 'label'}`` dicts.

    ``texts`` and ``labels`` are stored as given and indexed with the same
    ``idx``; the dataset length is the length of ``texts``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]
        return dict(text=sample_text, label=sample_label)

# Fine-tuning parameters
num_epochs = 10
learning_rate = 1e-5

# Load and parse the XML dataset
tree = ET.parse('Laptop_Train_v2.xml')
root = tree.getroot()

# One (text, polarity) row per aspect term; a sentence with several aspect
# terms therefore appears several times.
texts = []
labels = []

# Iterate through each 'sentence' element in the XML
for sentence in root.findall('.//sentence'):
    # An empty <text/> element yields None; treat it as an empty string
    text = (sentence.findtext('text') or '').strip()

    # Check if the 'aspectTerms' element exists
    aspect_terms = sentence.find('aspectTerms')
    if aspect_terms is not None:
        # Iterate through each 'aspectTerm' element
        for aspect_term in aspect_terms.findall('aspectTerm'):
            # A missing polarity attribute defaults to 'neutral'
            polarity = aspect_term.get('polarity', 'neutral')

            # Append to the lists
            texts.append(text)
            labels.append(polarity)

# Create a DataFrame
df = pd.DataFrame({'text': texts, 'label': labels})

# Split the dataset into training and validation sets (first 80% / last 20%).
# Plain lists are handed to SentimentDataset: a sliced Series keeps its
# original index, so `series[0]` on the validation slice would be a label
# lookup and raise KeyError.
train_size = int(0.8 * len(df))
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:]
train_data = SentimentDataset(train_df['text'].tolist(), train_df['label'].tolist())
val_data = SentimentDataset(val_df['text'].tolist(), val_df['label'].tolist())

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_data, batch_size=2, shuffle=True)
val_loader = DataLoader(val_data, batch_size=2, shuffle=False)

# Set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
# NOTE(review): `criterion` is never used; the loop takes the loss from `output.loss`.
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_text = batch['text']

        # Encode the padded inputs once and keep the attention mask so the
        # model does not attend to padding positions.
        encoded_inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

        # Encode the target strings; padding positions are set to -100 so the
        # loss ignores them. A separate name avoids overwriting the
        # notebook-level `labels` list.
        encoded_targets = tokenizer(batch['label'], return_tensors="pt", padding=True)
        target_ids = encoded_targets["input_ids"].masked_fill(
            encoded_targets["attention_mask"] == 0, -100
        )

        # Perform forward pass with specified 'labels'
        output = model(
            input_ids=encoded_inputs["input_ids"],
            attention_mask=encoded_inputs["attention_mask"],
            labels=target_ids,
        )

        # Retrieve the loss from the output
        loss = output.loss

        # Perform backward pass and optimization step
        loss.backward()
        optimizer.step()
    print(epoch)



# Save the fine-tuned model
# NOTE(review): this is the same directory the token-classification cells save
# to and load from, so this call overwrites that checkpoint -- consider a
# separate directory.
model.save_pretrained("fine_tuned_model")

# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_model")

# Perform sentiment analysis on a sample text
sample_text = "I love the performance of this laptop."
# Tokenize the sample itself (not `input_text`, which is the last training batch)
tokenized_text = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True)
output = model.generate(tokenized_text.input_ids, attention_mask=tokenized_text.attention_mask)
result = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the sentiment analysis result
print("Sentiment Analysis Result:")
print(result)


KeyboardInterrupt: ignored