### Installing required packages

### Importing necessary libraries

In [None]:
import numpy as np # Numerical operations
import pandas as pd # Data manipulation and analysis

# Loading the datasets

### Loading training dataset and showing first 5 rows

In [None]:
# Read the training split from its CSV file, then preview the first five rows.
train_set = pd.read_csv(filepath_or_buffer='SentimentAnalysis/Train.csv')
train_set.head(n=5)

### Loading test dataset and showing first 5 rows

In [None]:
# Read the test split from its CSV file, then preview the first five rows.
test_set = pd.read_csv(filepath_or_buffer='SentimentAnalysis/Test.csv')
test_set.head(n=5)

### Loading validation dataset and showing first 5 rows

In [None]:
# Read the validation split from its CSV file, then preview the first five rows.
val_set = pd.read_csv(filepath_or_buffer='SentimentAnalysis/Valid.csv')
val_set.head(n=5)

# Data preprocessing


### Converting all text to lowercase

In [None]:
def lowered(text):
    """Return ``text`` with all cased characters converted to lowercase.

    ``text`` must provide a ``lower()`` method (e.g. a ``str``); the result of
    that call is returned unchanged.
    """
    lowercase_text = text.lower()
    return lowercase_text
# Lowercase the 'text' column of every split, writing the result back into the
# same DataFrame objects.
for split_frame in (train_set, test_set, val_set):
    split_frame['text'] = split_frame['text'].apply(lowered)

### Removing duplicate entries based on text content

In [None]:
# Within each split, drop rows whose 'text' value repeats an earlier row.
# inplace=True mutates the existing DataFrame instead of returning a copy.
for frame in (train_set, test_set, val_set):
    frame.drop_duplicates(subset=['text'], inplace=True)

### Converting DataFrames to Hugging Face Datasets for compatibility with the training API

In [None]:
from datasets import Dataset

# Dropping duplicate rows can leave each DataFrame with a gapped index that is
# no longer a plain RangeIndex. By default from_pandas stores such an index as
# an extra '__index_level_0__' column; preserve_index=False keeps only the
# real data columns in the resulting Dataset.
train_dataset = Dataset.from_pandas(train_set, preserve_index=False)
test_dataset = Dataset.from_pandas(test_set, preserve_index=False)
val_dataset = Dataset.from_pandas(val_set, preserve_index=False)

### Showing information about each dataset

In [None]:
# Evaluates to a tuple of the three datasets so the notebook displays all of
# them as this cell's output.
train_dataset, test_dataset, val_dataset

# Tokenization

### Loading BERT tokenizer

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "bert-base-cased" checkpoint.
# NOTE(review): the 'text' columns were lowercased earlier in this notebook,
# while this checkpoint's name indicates a case-sensitive vocabulary — confirm
# whether the lowercasing step or an uncased checkpoint is what is intended
# (the model cell must use the same checkpoint as the tokenizer).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

### Tokenization function

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the global tokenizer.

    Truncation is enabled and every sequence is padded with the
    ``'max_length'`` strategy, so all encoded sequences share one length.

    Returns the tokenizer's output for ``examples["text"]``.
    """
    encoded_batch = tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
    )
    return encoded_batch

### Applying tokenization function to all datasets

In [None]:
# Tokenize the three splits in order (train, test, validation), passing
# examples to preprocess_function in batches.
train_tokenized, test_tokenized, val_tokenized = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

### Shuffling dataset to improve generalization

In [None]:
# Dataset.shuffle returns a new, shuffled dataset rather than reordering the
# existing one in place, so the result has to be bound back to the name for
# the shuffle to have any effect on later cells.
train_tokenized = train_tokenized.shuffle(seed=42)
# Keep the dataset as the cell's last expression so the notebook still shows it.
train_tokenized

# Model setup

### Loading pre-trained model and configuring for binary classification

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained "bert-base-cased" weights with a sequence-classification
# head sized for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

### Setting up training configuration

In [None]:
from transformers import TrainingArguments
# Default training configuration writing outputs to SentimentAnalysis/test_trainer.
# NOTE(review): `training_args` is reassigned in the "Training parameters" cell
# before the Trainer is built, so this object is never used for training.
training_args = TrainingArguments(output_dir="SentimentAnalysis/test_trainer")

# Evaluation metrics setup

### Loading evaluation metrics



In [None]:
import evaluate

# Load the four metric objects used by the evaluation function, in the same
# order as the names on the left-hand side.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

### Function to compute multiple evaluation metrics

In [None]:
def eval_metric(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. The last three use ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy.compute(
            predictions=predictions, references=labels
        )["accuracy"],
    }
    # The remaining metrics share the same call signature and averaging mode;
    # each result dict is keyed by the metric's own name.
    for key, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        result = metric.compute(
            predictions=predictions, references=labels, average="weighted"
        )
        scores[key] = result[key]
    return scores

# Training

### Training parameters

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration actually passed to the Trainer.
training_args = TrainingArguments(
    output_dir="SentimentAnalysis/test_trainer",
    num_train_epochs=2,
    eval_strategy="epoch",  # run evaluation once per epoch
    save_steps=3000,  # checkpoint interval, in steps
)

### Initializing trainer

In [None]:
# Wire the model, configuration, data splits and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    # Training uses the tokenized train split; evaluation uses the validation split.
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    # Called with (logits, labels) to produce the reported metrics.
    compute_metrics=eval_metric,
)

### Training the model

In [None]:
# Run fine-tuning with the configured Trainer; as the cell's last expression,
# the call's return value is shown as the cell output.
trainer.train()

### Model evaluation on validation data

In [None]:
# Evaluate on the Trainer's eval_dataset (the tokenized validation split) and
# display the returned metrics as the cell output.
trainer.evaluate()

# Model saving

### Fine-tuned model

In [None]:
# Persist the fine-tuned model to disk.
model.save_pretrained(save_directory="SentimentAnalysis/model")

### Tokenizer

In [None]:
# Persist the tokenizer files next to the saved model so both can be reloaded together.
tokenizer.save_pretrained(save_directory="SentimentAnalysis/tokenizer")