In [1]:
import os
%pwd

'c:\\Users\\91787\\Programming\\Projects\\Sentiment-Analysis_Roberta\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\91787\\Programming\\Projects\\Sentiment-Analysis_Roberta'

In [3]:
from dataclasses import dataclass
from pathlib import Path 
@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path

In [4]:
from sentimentanalysis.constants import *
from sentimentanalysis.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir= config.root_dir,
            data_path = config.data_path,
            model_ckpt = config.model_ckpt
        )

        return model_trainer_config

In [6]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from scipy.special import softmax

In [7]:
import os
import pandas as pd
import numpy as np 

In [10]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        

    def train(self):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")

        # Read in data
        df = pd.read_csv("C:\\Users\\91787\\Programming\\Projects\\Sentiment-Analysis_Roberta\\data\\Reviews.csv")
        df = df.head(500)

        # Step 1: VADER Sentiment Scoring
        sia = SentimentIntensityAnalyzer()

        # Run the polarity score on the entire dataset
        res = {}
        for i, row in tqdm(df.iterrows(), total=len(df)):
            text = row['Text']
            myid = row['Id']
            res[myid] = sia.polarity_scores(text)

        vaders = pd.DataFrame(res).T
        vaders = vaders.reset_index().rename(columns={'index': 'Id'})
        vaders = vaders.merge(df, how='left')

        # Step 3: Roberta Pretrained Model
        MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL)
        model.to(device)

        def polarity_scores_roberta(example):
            encoded_text = tokenizer(example, return_tensors='pt')
            encoded_text = encoded_text.to(device)
            
            output = model(**encoded_text)
            scores = output.logits.detach().cpu().numpy()
            scores = softmax(scores, axis=1)  # Ensure softmax is applied along the correct axis
            
            scores_dict = {
                'roberta_neg' : scores[0, 0],  # Adjust indexing
                'roberta_neu' : scores[0, 1],  # Adjust indexing
                'roberta_pos' : scores[0, 2],  # Adjust indexing
            }
            return scores_dict

        res = {}
        for i, row in tqdm(df.iterrows(), total=len(df)):
            try:
                text = row['Text']
                myid = row['Id']
                vader_result = sia.polarity_scores(text)
                vader_result_rename = {}
                for key, value in vader_result.items():
                    vader_result_rename[f"vader_{key}"] = value
                roberta_result = polarity_scores_roberta(text)
                both = {**vader_result_rename, **roberta_result}
                res[myid] = both
            except RuntimeError:
                print(f'Broke for id {myid}')

        results_df = pd.DataFrame(res).T
        results_df = results_df.reset_index().rename(columns={'index': 'Id'})
        results_df = results_df.merge(df, how='left')

        # Save the tokenizer and model
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
        model.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))


In [11]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception as e:
    raise e


[2024-02-10 14:45:49,872: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-10 14:45:49,885: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-10 14:45:49,887: INFO: common: created directory at: artifacts]
[2024-02-10 14:45:49,890: INFO: common: created directory at: artifacts/model_trainer]
Using device: cpu


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Broke for id 83
Broke for id 187
