In [44]:
import os

In [None]:
%ls

 Volume in drive C is Windows-SSD
 Volume Serial Number is 5824-84FE

 Directory of c:\Users\prass\OneDrive\Desktop\practise\new_env



02/03/2025  10:10 PM    <DIR>          .
02/03/2025  06:58 PM    <DIR>          ..
02/03/2025  10:02 PM    <DIR>          .github
02/17/2025  03:23 PM    <DIR>          text-classification-using-BERT
               0 File(s)              0 bytes
               4 Dir(s)  256,057,806,848 bytes free


In [32]:
os.chdir('text-classification-using-BERT')

In [45]:
%pwd

'c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT'

In [46]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    datasets_dir: Path  # Directory where datasets are saved
    output_dir: Path  # Directory to save training outputs
    model_save_path: Path  # Directory to save the trained model
    num_train_epochs: int  # Number of training epochs
    per_device_train_batch_size: int  # Training batch size per device
    per_device_eval_batch_size: int  # Evaluation batch size per device
    warmup_steps: int  # Number of warmup steps
    weight_decay: float  # Weight decay rate
    max_steps: int  # Maximum number of training steps
    save_steps: int  # Save model every `save_steps` steps
    logging_steps: int  # Log metrics every `logging_steps` steps


In [47]:
from pathlib import Path

CONFIG_FILE_PATH = Path("c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT/config/config.yaml")
PARAMS_FILE_PATH = Path("c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT/params.yaml")

In [48]:
from textClassifier.constants import *
from textClassifier.utils.commons import read_yaml, create_directories

In [49]:
import os
from pathlib import Path

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
        # Debug: Print the config and params objects
        print("Config:", self.config)
        print("Params:", self.params)

    def get_training_config(self) -> TrainingConfig:
        """
        Returns the configuration for model training.
        """
        training = self.config.feature_engineering
        print("i am printing training", training)
        params = self.params
        training_data = os.path.join(self.config.artifacts_root, 'feature_engineering')
        model_save_path = os.path.join(self.config.artifacts_root, 'training')

        create_directories([training.root_dir, training.model_save_path])

        training_config = TrainingConfig(
            datasets_dir=Path(training_data),
            output_dir=Path(training.root_dir),
            # model_save_path=Path(training.model_save_path),
            params_num_train_epochs=params.num_train_epochs,
            params_per_device_train_batch_size=params.per_device_train_batch_size,
            params_per_device_eval_batch_size=params.per_device_eval_batch_size,
            params_warmup_steps=params.warmup_steps,
            params_weight_decay=params.weight_decay,
            params_max_steps=params.max_steps,
            params_save_steps=params.save_steps,
            params_logging_steps=params.logging_steps
            
        )

        return training_config

In [50]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """
        Returns the configuration for model training.
        """
        training = self.config.training
        params = self.params

        # Create directories if they don't exist
        create_directories([
            Path(training.root_dir),
            Path(training.model_save_path)
        ])

        training_config = TrainingConfig(
            datasets_dir=Path(self.config.feature_engineering.training_cleansed_data),  # Update this to point to feature_engineering
            output_dir=Path(training.root_dir),
            model_save_path=Path(training.model_save_path),
            num_train_epochs=params.num_train_epochs,
            per_device_train_batch_size=params.per_device_train_batch_size,
            per_device_eval_batch_size=params.per_device_eval_batch_size,
            warmup_steps=params.warmup_steps,
            weight_decay=params.weight_decay,
            max_steps=params.max_steps,
            save_steps=params.save_steps,
            logging_steps=params.logging_steps
        )

        return training_config

This below code is for BERT but we are using distllBERT

In [None]:
#  this is bert model or bert-based-uncased but we downgraded to distillBert for resources bottleneks
from transformers import (
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

from pathlib import Path
import torch
from textClassifier import logger


class ModelTraining:
    def __init__(self, config: TrainingConfig):
        """
        Initializes the ModelTraining class.

        Args:
            config (ModelTrainingConfig): Configuration for model training.
        """
        self.config = config

    def load_datasets(self):
        """
        Loads the train, validation, and test datasets from the specified directory.

        Returns:
            train_dataset, val_dataset, test_dataset: Loaded datasets.
        """
        datasets_dir = Path(self.config.feature_engineering)
        train_dataset = torch.load(datasets_dir / "train_dataset.pt")
        val_dataset = torch.load(datasets_dir / "val_dataset.pt")
        test_dataset = torch.load(datasets_dir / "test_dataset.pt")

        logger.info(f"Datasets loaded from {datasets_dir}")
        return train_dataset, val_dataset, test_dataset

    def train(self):
        """
        Trains the model using the loaded datasets.
        """
        # Load datasets
        train_dataset, val_dataset, _ = self.load_datasets()

        # Initialize the model
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=3
        )

        # Set up training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            max_steps=self.config.max_steps,
            save_steps=self.config.save_steps,
            logging_steps=self.config.logging_steps,
            load_best_model_at_end=True
        )

        # Initialize the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )

        # Start training
        trainer.train()

        # Save the trained model
        self.save_model(trainer)

    def save_model(self, trainer):
        """
        Saves the trained model and tokenizer to the specified directory.

        Args:
            trainer (Trainer): The Trainer object containing the trained model.
        """
        save_path = Path(self.config.model_save_path)
        save_path.mkdir(parents=True, exist_ok=True)

        trainer.save_model(save_path)
        logger.info(f"Model saved to {save_path}")

    def evaluate(self):
        """
        Evaluates the model on the test dataset.
        """
        # Load the test dataset
        _, _, test_dataset = self.load_datasets()

        # Load the trained model
        model = BertForSequenceClassification.from_pretrained(self.config.model_save_path)

        # Set up training arguments for evaluation
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            logging_dir=self.config.logging_dir
        )

        # Initialize the Trainer for evaluation
        trainer = Trainer(
            model=model,
            args=training_args,
            # train_dataset=train_dataset, # The dataset used for training the model
            # eval_dataset=val_dataset
        )

        # Evaluate the model
        results = trainer.evaluate(test_dataset)
        logger.info("Evaluation Results:")
        logger.info(f"  - Loss: {results['eval_loss']:.4f}")
        logger.info(f"  - Runtime: {results['eval_runtime']:.2f} seconds")
        logger.info(f"  - Samples per Second: {results['eval_samples_per_second']:.2f}")
        logger.info(f"  - Steps per Second: {results['eval_steps_per_second']:.2f}")
        logger.info(f"  - Epoch: {results['epoch']:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# num_train_epochs: 1
# per_device_train_batch_size: 5
# per_device_eval_batch_size: 5
# warmup_steps: 10
# weight_decay: 0.01
# max_steps: 10
# save_steps: 2
# logging_steps: 2

In [5]:
import os

In [6]:
%pwd

'c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT'

Actual Training starts here!

In [52]:
from pathlib import Path

CONFIG_FILE_PATH = Path("c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT/config/config.yaml")
PARAMS_FILE_PATH = Path("c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT/params.yaml")

In [53]:
from textClassifier.constants import *
from textClassifier.utils.commons import read_yaml, create_directories

In [61]:
from transformers import (
    BertForSequenceClassification,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from pathlib import Path
import torch
from textClassifier import logger


class ModelTraining:
    def __init__(self, config: TrainingConfig):
        """
        Initializes the ModelTraining class.

        Args:
            config (ModelTrainingConfig): Configuration for model training.
        """
        self.config = config

    def load_datasets(self):
        """
        Loads the train, validation, and test datasets from the specified directory.

        Returns:
            train_dataset, val_dataset, test_dataset: Loaded datasets.
        """
        datasets_dir = Path(self.config.datasets_dir)
        train_dataset = torch.load(datasets_dir / "train_dataset.pt", weights_only=False)
        val_dataset = torch.load(datasets_dir / "val_dataset.pt", weights_only=False)
        test_dataset = torch.load(datasets_dir / "test_dataset.pt", weights_only=False)

        logger.info(f"Datasets loaded from {datasets_dir}")
        return train_dataset, val_dataset, test_dataset

    def train(self):
        """
        Trains the model using the loaded datasets.
        """
        # Load datasets
        train_dataset, val_dataset, _ = self.load_datasets()

        # Initialize the model
        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3
            )

        #model = BertForSequenceClassification.from_pretrained(
        #    'bert-base-uncased', num_labels=3
        #)

        # Set up training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            max_steps=self.config.max_steps,
            save_steps=self.config.save_steps,
            evaluation_strategy="epoch",
            # load_best_model_at_end=True
        )

        # Initialize the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )

        # Start training
        trainer.train()

        # Save the trained model
        self.save_model(trainer)

    def save_model(self, trainer):
        """
        Saves the trained model and tokenizer to the specified directory.

        Args:
            trainer (Trainer): The Trainer object containing the trained model.
        """
        save_path = Path(self.config.model_save_path)
        save_path.mkdir(parents=True, exist_ok=True)

        trainer.save_model(save_path)
        logger.info(f"Model saved to {save_path}")

    def evaluate(self):
        """
        Evaluates the model on the test dataset.
        """
        # Load the test dataset
        _, _, test_dataset = self.load_datasets()

        # Load the trained model
        # model = AutoModelForSequenceClassification.from_pretrained(self.config.model_save_path)
        model = AutoModelForSequenceClassification.from_pretrained(self.config.model_save_path)


        # Set up training arguments for evaluation
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size
        )

        # Initialize the Trainer for evaluation
        trainer = Trainer(
            model=model,
            args=training_args
        )

        # Evaluate the model
        results = trainer.evaluate(test_dataset)

        logger.info("Evaluation Results:")
        logger.info("Evaluation Results:", results)
        logger.info(f"  - Loss: {results['eval_loss']:.4f}")
        logger.info(f"  - Runtime: {results['eval_runtime']:.2f} seconds")
        logger.info(f"  - Samples per Second: {results['eval_samples_per_second']:.2f}")
        logger.info(f"  - Steps per Second: {results['eval_steps_per_second']:.2f}")
        logger.info(f"  - Epoch: {results.get('epoch', 'N/A')}")

In [79]:
# from textClassifier.components import SentimentDataset  # Import the SentimentDataset class
# import torch

# # Load your data and create datasets
# train_dataset = SentimentDataset(train_encodings, train_labels)
# val_dataset = SentimentDataset(val_encodings, val_labels)
# test_dataset = SentimentDataset(test_encodings, test_labels)

# # Save the datasets
# torch.save(train_dataset, "artifacts/feature_engineering/train_dataset.pt")
# torch.save(val_dataset, "artifacts/feature_engineering/val_dataset.pt")
# torch.save(test_dataset, "artifacts/feature_engineering/test_dataset.pt")

In [62]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

def convert_to_dicts(tokenized_texts):
    """
    Converts a list of tokenized texts into a dictionary of input IDs and attention masks.

    Args:
        tokenized_texts (list): List of tokenized texts (each is a dictionary with 'input_ids' and 'attention_mask').

    Returns:
        dict: A dictionary containing 'input_ids' and 'attention_mask' as lists.
    """
    input_ids = [d['input_ids'].squeeze(0) for d in tokenized_texts]  # Remove batch dimension
    attention_masks = [d['attention_mask'].squeeze(0) for d in tokenized_texts]  # Remove batch dimension
    return {'input_ids': input_ids, 'attention_mask': attention_masks}


def create_datasets(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels):
    """
    Creates SentimentDataset objects for train, validation, and test splits.

    Args:
        train_texts (list): List of tokenized texts for the training set.
        train_labels (list): List of labels for the training set.
        val_texts (list): List of tokenized texts for the validation set.
        val_labels (list): List of labels for the validation set.
        test_texts (list): List of tokenized texts for the test set.
        test_labels (list): List of labels for the test set.

    Returns:
        train_dataset, val_dataset, test_dataset: SentimentDataset objects for each split.
    """
    # Convert tokenized texts to encodings
    train_encodings = convert_to_dicts(train_texts)
    val_encodings = convert_to_dicts(val_texts)
    test_encodings = convert_to_dicts(test_texts)

    # Create SentimentDataset objects
    train_dataset = SentimentDataset(train_encodings, train_labels)
    val_dataset = SentimentDataset(val_encodings, val_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    return train_dataset, val_dataset, test_dataset

def save_datasets(self, train_dataset, val_dataset, test_dataset):
        """
        Saves the train, validation, and test datasets to the specified directory.

        Args:
            train_dataset: Training dataset.
            val_dataset: Validation dataset.
            test_dataset: Test dataset.
        """
        # Create the directory if it doesn't exist
        datasets_dir = Path(self.config.datasets_dir)
        datasets_dir.mkdir(parents=True, exist_ok=True)

        # Save the datasets
        torch.save(train_dataset, datasets_dir / "train_dataset.pt")
        torch.save(val_dataset, datasets_dir / "val_dataset.pt")
        torch.save(test_dataset, datasets_dir / "test_dataset.pt")

        logger.info(f"Datasets saved to {datasets_dir}")

In [66]:
from accelerate import PartialState
accelerator_state_kwargs = {"enabled": True, "use_configured_state": False}

# Initialize PartialState

from textClassifier.components import sentimentsDataset
if __name__ == "__main__":
    # import mlflow
    # mlflow.set_tracking_uri(None)
    # Initialize ConfigurationManager
    config_manager = ConfigurationManager()
    # Get the model training config
    training_config = config_manager.get_training_config()

    # Initialize ModelTraining
    model_training = ModelTraining(config=training_config)

    # Train the model
    model_training.train()

    # Evaluate the model
    model_training.evaluate()
    
    partial_state = PartialState()
    
    

[2025-03-04 21:54:36,547: INFO: commons: yaml file: c:\Users\prass\OneDrive\desktop\practise\new_env\text-classification-using-BERT\config\config.yaml loaded successfully]
[2025-03-04 21:54:36,555: INFO: commons: yaml file: c:\Users\prass\OneDrive\desktop\practise\new_env\text-classification-using-BERT\params.yaml loaded successfully]
[2025-03-04 21:54:36,558: INFO: commons: created directory at: artifacts]


[2025-03-04 21:54:36,559: INFO: commons: created directory at: artifacts\training]
[2025-03-04 21:54:36,563: INFO: commons: created directory at: artifacts\training\trained_model]
[2025-03-04 21:54:40,234: INFO: 2927273006: Datasets loaded from artifacts\feature_engineering]


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
0,No log,0.976229


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[2025-03-04 21:59:15,344: INFO: 2927273006: Model saved to artifacts\training\trained_model]
[2025-03-04 21:59:20,081: INFO: 2927273006: Datasets loaded from artifacts\feature_engineering]


[2025-03-04 22:03:26,398: INFO: 2927273006: Evaluation Results:]
[2025-03-04 22:03:26,399: INFO: 2927273006: Evaluation Results:]
[2025-03-04 22:03:26,400: INFO: 2927273006:   - Loss: 0.9623]
[2025-03-04 22:03:26,402: INFO: 2927273006:   - Runtime: 246.15 seconds]
[2025-03-04 22:03:26,404: INFO: 2927273006:   - Samples per Second: 8.92]
[2025-03-04 22:03:26,406: INFO: 2927273006:   - Steps per Second: 4.46]
[2025-03-04 22:03:26,408: INFO: 2927273006:   - Epoch: N/A]


In [69]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from pathlib import Path

class SentimentInference:
    def __init__(self, model_path: str):
        """
        Initializes the SentimentInference class.

        Args:
            model_path (str): Path to the trained model and tokenizer.
        """
        self.model_path = Path(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def predict_sentiment(self, text: str) -> str:
        """
        Predicts the sentiment of the input text.

        Args:
            text (str): Input text for sentiment prediction.

        Returns:
            str: Predicted sentiment ('negative', 'neutral', 'positive').
        """
        # Tokenize the input text
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Get model predictions
        outputs = self.model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=1).item()
        
        # Map the predicted class to sentiment
        return self.sentiment_map[predicted_class]

In [None]:
os.

 Volume in drive C is Windows-SSD
 Volume Serial Number is 5824-84FE

 Directory of c:\Users\prass\OneDrive\Desktop\practise\new_env\text-classification-using-BERT

02/17/2025  03:23 PM    <DIR>          .
02/03/2025  10:10 PM    <DIR>          ..
02/03/2025  10:15 PM    <DIR>          .github


02/05/2025  11:14 AM             3,597 .gitignore
03/04/2025  09:54 PM    <DIR>          artifacts
02/03/2025  10:15 PM    <DIR>          config
02/03/2025  10:15 PM                 0 dvc.yaml
02/03/2025  09:43 PM             1,087 LICENSE
02/03/2025  10:50 PM    <DIR>          logs
02/05/2025  11:31 AM               440 main.py
03/04/2025  08:59 PM               170 params.yaml
02/03/2025  09:43 PM                32 README.md
02/03/2025  10:26 PM               248 requirements.txt
03/04/2025  09:08 PM    <DIR>          research
02/03/2025  10:16 PM               825 setup.py
02/03/2025  10:28 PM    <DIR>          src
02/07/2025  10:25 AM             1,344 template.py
02/05/2025  11:31 AM    <DIR>          templates
03/03/2025  04:07 PM    <DIR>          wandb
               9 File(s)          7,743 bytes
              10 Dir(s)  251,240,882,176 bytes free


In [74]:
if __name__ == "__main__":
    model_path = 'C:\Users\prass\OneDrive\desktop\practise\new_env\text-classification-using-BERT\artifacts\training'
    pred = SentimentInference(model_path)
    test = pred.predict_sentiment('I hate flying with airlines')

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1975116703.py, line 2)

In [76]:
from pathlib import Path
if __name__ == "__main__":
    # Provide the path to your trained model
    model_path = Path("C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/trained_model")
    
    # Initialize the SentimentInference class
    pred = SentimentInference(model_path)
    
    # Test the prediction
    test = pred.predict_sentiment('I hate flying with airlines')
    print(f"Predicted Sentiment: {test}")

OSError: Can't load tokenizer for 'C:\Users\prass\OneDrive\desktop\practise\new_env\text-classification-using-BERT\artifacts\training\trained_model'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'C:\Users\prass\OneDrive\desktop\practise\new_env\text-classification-using-BERT\artifacts\training\trained_model' is the correct path to a directory containing all relevant files for a DistilBertTokenizerFast tokenizer.

In [None]:
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# # Load the model and tokenizer
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# # Save the model and tokenizer
# model.save_pretrained("C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir")
# tokenizer.save_pretrained("C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir\\tokenizer_config.json',
 'C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir\\special_tokens_map.json',
 'C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir\\vocab.txt',
 'C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir\\added_tokens.json',
 'C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir\\tokenizer.json')

In [101]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from pathlib import Path

class SentimentInference:
    def __init__(self, model_path: str):
        """
        Initializes the SentimentInference class.

        Args:
            model_path (str): Path to the trained model and tokenizer.
        """
        self.model_path = Path(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def predict_sentiment(self, text: str) -> str:
        """
        Predicts the sentiment of the input text.

        Args:
            text (str): Input text for sentiment prediction.

        Returns:
            str: Predicted sentiment ('negative', 'neutral', 'positive').
        """
        # Tokenize the input text
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Get model predictions
        outputs = self.model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=1).item()
        
        # Map the predicted class to sentiment
        return self.sentiment_map[predicted_class]

# if __name__ == "__main__":
#     # Provide the path to your trained model and tokenizer
#     model_path = "C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir"
    
#     # Initialize the SentimentInference class
#     pred = SentimentInference(model_path)
    
#     # Test the prediction
#     test = pred.predict_sentiment('I hate flying with airlines')
#     print(f"Predicted Sentiment: {test}")

In [85]:
# def predict_sentiment(text):
#     '''Function to predict the sentiment of a given text using a pre-trained BERT model.
#     Args: the input text for sentiment prediction.
#     Returns: the predicted sentiment ('negative', 'neutral', 'positive').
#     '''

#     inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
#     outputs = model(**inputs)
#     predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
#     predicted_class = torch.argmax(predictions, dim=1).item()
#     sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}
#     return sentiment[predicted_class]

# # Example prediction
# example_text = "I hate flying with this airline!"
# predicted_sentiment = predict_sentiment(example_text)
# print(f"Predicted Sentiment: {predicted_sentiment}")

In [None]:
# import accelerate
# import transformers

# print("Accelerate version:", accelerate.__version__)
# print("Transformers version:", transformers.__version__)

Accelerate version: 0.26.0
Transformers version: 4.49.0


In [99]:
from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin
# Initialize the Flask app
app = Flask(__name__)
# app = Flask(__name__)
CORS(app)

# Initialize the SentimentInference class
# model_path = "artifacts/trained_model"
model_path = "C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir"
sentiment_inference = SentimentInference(model_path)

@app.route("/predict", methods=["POST"])
def predict():
    """
    API endpoint for sentiment prediction.
    Expects a JSON payload with a 'text' field.
    """
    # Get the input text from the request
    data = request.json
    text = data.get("text")
    
    # Validate the input
    if not text:
        return jsonify({"error": "No text provided"}), 400
    
    # Predict the sentiment
    try:
        sentiment = sentiment_inference.predict_sentiment(text)
        return jsonify({"sentiment": sentiment})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    # Run the Flask app
    app.run(host="0.0.0.0", port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.117:5000]
[2025-03-09 14:07:14,066: INFO: _internal: [33mPress CTRL+C to quit[0m]


In [102]:
from flask import Flask, request, jsonify, render_template_string

# Initialize the Flask app
app = Flask(__name__)

# Dummy SentimentInference class for demonstration
class SentimentInference:
    def __init__(self, model_path):
        self.model_path = model_path

    def predict_sentiment(self, text):
        # Dummy sentiment prediction logic
        # if "good" in text.lower():
        #     return "Positive"
        # elif "bad" in text.lower():
        #     return "Negative"
        # else:
        #     return "Neutral"
        try:
            sentiment = sentiment_inference.predict_sentiment(text)
            return jsonify({"sentiment": sentiment})
        except Exception as e:
            return jsonify({"error": str(e)}), 500

        
    

# Initialize the SentimentInference class
model_path = "C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir"
sentiment_inference = SentimentInference(model_path)

# Home page route
@app.route("/", methods=["GET"])
def home():
    return """
    <h1>Welcome to Sentiment Analysis</h1>
    <p>Click <a href="/predict">here</a> to go to the prediction page.</p>
    """

# Predict page route
@app.route("/predict", methods=["GET", "POST"])
def predict():
    if request.method == "GET":
        # Render a basic HTML form for text input
        return render_template_string('''
        <h1>Sentiment Prediction</h1>
        <form method="POST">
            <label for="text">Enter your text:</label><br>
            <textarea id="text" name="text" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Predict">
        </form>
        ''')
    elif request.method == "POST":
        # Get the input text from the form
        text = request.form.get("text")
        
        # Validate the input
        if not text:
            return jsonify({"error": "No text provided"}), 400
        
        # Predict the sentiment
        try:
            sentiment = sentiment_inference.predict_sentiment(text)
            return jsonify({"sentiment": sentiment})
        except Exception as e:
            return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    # Run the Flask app
    app.run(host="0.0.0.0", port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.117:5000]
[2025-03-09 14:26:25,829: INFO: _internal: [33mPress CTRL+C to quit[0m]


In [103]:
from flask import Flask, request, jsonify, render_template_string
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from pathlib import Path

# Initialize the Flask app
app = Flask(__name__)

# Your SentimentInference class
class SentimentInference:
    def __init__(self, model_path: str):
        """
        Initializes the SentimentInference class.

        Args:
            model_path (str): Path to the trained model and tokenizer.
        """
        self.model_path = Path(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def predict_sentiment(self, text: str) -> str:
        """
        Predicts the sentiment of the input text.

        Args:
            text (str): Input text for sentiment prediction.

        Returns:
            str: Predicted sentiment ('negative', 'neutral', 'positive').
        """
        # Tokenize the input text
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Get model predictions
        outputs = self.model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=1).item()
        
        # Map the predicted class to sentiment
        return self.sentiment_map[predicted_class]

# Initialize the SentimentInference class
model_path = "C:/Users/prass/OneDrive/desktop/practise/new_env/text-classification-using-BERT/artifacts/training/testing_dir"
sentiment_inference = SentimentInference(model_path)

# Home page route
@app.route("/", methods=["GET"])
def home():
    return """
    <h1>Welcome to Sentiment Analysis</h1>
    <p>Click <a href="/predict">here</a> to go to the prediction page.</p>
    """

# Predict page route
@app.route("/predict", methods=["GET", "POST"])
def predict():
    if request.method == "GET":
        # Render a basic HTML form for text input
        return render_template_string('''
        <h1>Sentiment Prediction</h1>
        <form method="POST">
            <label for="text">Enter your text:</label><br>
            <textarea id="text" name="text" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Predict">
        </form>
        ''')
    elif request.method == "POST":
        # Get the input text from the form
        text = request.form.get("text")
        
        # Validate the input
        if not text:
            return jsonify({"error": "No text provided"}), 400
        
        # Predict the sentiment
        try:
            sentiment = sentiment_inference.predict_sentiment(text)
            return jsonify({"sentiment": sentiment})
        except Exception as e:
            return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    # Run the Flask app
    app.run(host="0.0.0.0", port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.117:5000]
[2025-03-09 14:33:46,912: INFO: _internal: [33mPress CTRL+C to quit[0m]
[2025-03-09 14:34:04,194: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:04] "GET / HTTP/1.1" 200 -]
[2025-03-09 14:34:04,499: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:04] "GET / HTTP/1.1" 200 -]
[2025-03-09 14:34:07,187: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:07] "GET /predict HTTP/1.1" 200 -]
[2025-03-09 14:34:18,667: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:18] "POST /predict HTTP/1.1" 200 -]
[2025-03-09 14:34:37,061: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:37] "GET / HTTP/1.1" 200 -]
[2025-03-09 14:34:38,810: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:38] "GET /predict HTTP/1.1" 200 -]
[2025-03-09 14:34:51,884: INFO: _internal: 127.0.0.1 - - [09/Mar/2025 14:34:51] "POST /predict HTTP/1.1" 200 -]
[2

In [96]:
import requests

url = "http://127.0.0.1:5000/predict"
data = {"text": "This is a sample text for sentiment analysis."}
response = requests.post(url, json=data)

print(response.json())

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000023395A1EDF0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
# import torch
# print(torch.cuda.is_available())

False


In [None]:
# from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
os.chdir('feature_engineering')

In [109]:
# os.chdir('prepare_model')
%pwd
# %ls

'c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT\\artifacts\\prepare_model'

In [111]:
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    datasets_dir: str = "c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT\\artifacts\\feature_engineering"  # Directory containing train_dataset.pt, val_dataset.pt, test_dataset.pt
    output_dir: str = "c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT\\artifacts\\training"      # Directory to save training outputs
    model_save_path: str = "c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT\\artifacts\\training\\trained_model"  # Directory to save the trained model
    num_train_epochs: int = 1              # Number of training epochs
    per_device_train_batch_size: int = 16   # Batch size for training
    per_device_eval_batch_size: int = 16    # Batch size for evaluation
    warmup_steps: int = 500                 # Number of warmup steps
    weight_decay: float = 0.01              # Weight decay
    max_steps: int = 1000                   # Maximum number of training steps
    save_steps: int = 500                   # Save model every `save_steps`
    logging_dir: str = "c:\\Users\\prass\\OneDrive\\desktop\\practise\\new_env\\text-classification-using-BERT\\logs"       # Directory for logs

# Initialize TrainingConfig
config = TrainingConfig()

# Initialize ModelTraining
model_training = ModelTraining(config)

In [1]:
import os
%os._exit(00)

UsageError: Line magic function `%os._exit(00)` not found.


In [25]:
import accelerate

accelerate.__version__

'0.26.0'