<a href="https://colab.research.google.com/github/pai24rohit/semper8project/blob/main/Semper8Proj2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Install necessary libraries
!pip install transformers datasets -q


In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd

In [18]:
from datasets import load_dataset

# Load the financial dataset with a specific configuration
dataset = load_dataset("financial_phrasebank", "sentences_allagree", split="train")

# Print a sample of the dataset
print(dataset[0])


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

{'sentence': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', 'label': 1}


In [19]:
from transformers import AutoTokenizer

# Load a tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [20]:
from datasets import DatasetDict

# Split the dataset into train and validation
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Format datasets for PyTorch or TensorFlow
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


In [21]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained model with the required number of labels
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,  # Set to True if using Hugging Face Hub
)




In [23]:
from transformers import Trainer

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3612,0.319723
2,0.106,0.132733
3,0.0388,0.128907


TrainOutput(global_step=342, training_loss=0.26485044652955575, metrics={'train_runtime': 174.0688, 'train_samples_per_second': 31.212, 'train_steps_per_second': 1.965, 'total_flos': 1429495198516224.0, 'train_loss': 0.26485044652955575, 'epoch': 3.0})

In [24]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.12890729308128357, 'eval_runtime': 3.2568, 'eval_samples_per_second': 139.093, 'eval_steps_per_second': 8.904, 'epoch': 3.0}


In [25]:
# Save the model and tokenizer
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json',
 './sentiment_model/tokenizer.json')

In [50]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [51]:
import os
model_path = '/content/drive/MyDrive/sentiment_model'
os.makedirs(model_path, exist_ok=True)


In [52]:
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('/content/drive/MyDrive/sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/sentiment_model/vocab.txt',
 '/content/drive/MyDrive/sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/sentiment_model/tokenizer.json')

In [53]:
!ls /content/drive/MyDrive/sentiment_model


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [57]:
print(dataset[0])  # This will show the first sentence and its label


{'sentence': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', 'label': 1}


In [58]:
from transformers import pipeline

# Load the fine-tuned model
sentiment_analyzer = pipeline("text-classification", model="./sentiment_model", tokenizer="./sentiment_model")

# Define a mapping for the labels
labels_map = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive"
}

# Test with a news headline
news_headline = "Global markets rally as inflation fears fade."
result = sentiment_analyzer(news_headline)

# Extract the label and map it
predicted_label = result[0]['label']
predicted_score = result[0]['score']

# Print the sentiment label and score
print(f"Sentiment: {labels_map.get(predicted_label, predicted_label)}")
print(f"Confidence score: {predicted_score:.2f}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentiment: positive
Confidence score: 0.71
