In [7]:
import pandas as pd

In [8]:
df = pd.read_csv(r"C:\Users\HP\Downloads\YoutubeCommentsDataSet.csv")

In [9]:
df = df.dropna(subset=['Comment','Sentiment'])

In [10]:
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df['label'] = df['Sentiment'].map(label_map)

In [11]:
print(df.head)

<bound method NDFrame.head of                                                  Comment Sentiment  label
0      lets not forget that apple pay in 2014 require...   neutral      1
1      here in nz 50 of retailers don’t even have con...  negative      2
2      i will forever acknowledge this channel with t...  positive      0
3      whenever i go to a place that doesn’t take app...  negative      2
4      apple pay is so convenient secure and easy to ...  positive      0
...                                                  ...       ...    ...
18403  i really like the point about engineering tool...  positive      0
18404  i’ve just started exploring this field and thi...  positive      0
18405  excelente video con una pregunta filosófica pr...   neutral      1
18406  hey daniel just discovered your channel a coup...  positive      0
18407  this is great focus is key a playful approach ...  positive      0

[18364 rows x 3 columns]>


In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

In [13]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Comment'].tolist(), df['label'].tolist(), test_size=0.2
)

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [15]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [16]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [17]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4586,0.394866
2,0.2785,0.53759
3,0.1324,0.685518


TrainOutput(global_step=5511, training_loss=0.3054946356645147, metrics={'train_runtime': 1787.8643, 'train_samples_per_second': 24.651, 'train_steps_per_second': 3.082, 'total_flos': 2899049414881536.0, 'train_loss': 0.3054946356645147, 'epoch': 3.0})

In [18]:
trainer.save_model('./sentiment-model')
tokenizer.save_pretrained('./sentiment-model')

('./sentiment-model\\tokenizer_config.json',
 './sentiment-model\\special_tokens_map.json',
 './sentiment-model\\vocab.txt',
 './sentiment-model\\added_tokens.json')