In [115]:
import warnings

# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides library deprecation notices — consider narrowing it.
warnings.filterwarnings(action="ignore")

# Importing Basic libraries

In [116]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [117]:
# Load the labelled comments from disk and preview the first rows.
csv_path = './YoutubeCommentsDataSet.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [118]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

Comment      44
Sentiment     0
dtype: int64

In [119]:
# Discard, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [120]:
# Share of fully duplicated rows, expressed as a percentage with one decimal.
# Scale to percent *before* rounding: rounding the fraction first and then
# multiplying by 100 can surface float artifacts such as 2.8000000000000003.
duplicate_pct = round(100 * df.duplicated().sum() / len(df), 1)
print(f"Percent of duplication in the dataset: {duplicate_pct}%")

Percent of duplication in the dataset: 2.7%


Since duplicated rows make up less than 3% of the dataset, we can simply drop them.

In [121]:
# Remove repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [122]:
# Preview the frame after removing missing and duplicated rows.
df.head(n=5)

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [123]:
# Class balance: number of comments per sentiment label.
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts

Sentiment
positive    11054
neutral      4503
negative     2317
Name: count, dtype: int64

In [124]:
# Encode the sentiment labels as integers: positive -> 1, negative -> -1, neutral -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Sentiment']: that form mutates an intermediate
# Series (chained assignment), which pandas deprecates and which leaves `df`
# untouched when Copy-on-Write is enabled.
df['Sentiment'] = df['Sentiment'].replace({'positive': 1,
                                           'negative': -1,
                                           'neutral': 0})

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,0
1,here in nz 50 of retailers don’t even have con...,-1
2,i will forever acknowledge this channel with t...,1
3,whenever i go to a place that doesn’t take app...,-1
4,apple pay is so convenient secure and easy to ...,1


## Data Cleaning

1. Lowercase all the text

In [125]:
# Normalise case so that "Good" and "good" are treated as the same word.
lowercased_comments = df['Comment'].str.lower()
df['Comment'] = lowercased_comments

2. Tokenization 

In [126]:
from nltk.tokenize import word_tokenize

# Turn every comment string into a list of tokens.
tokenized_comments = df['Comment'].apply(word_tokenize)
df['Comment'] = tokenized_comments

3. Removing Punctuation

In [127]:
import re

# Replace every non-word character with a space, then split on whitespace so
# that punctuation-only tokens disappear and no token keeps embedded or
# padding spaces ("." -> dropped, "well-known" -> "well", "known").
# Substituting without re-splitting leaves tokens such as " " or " s" in the
# list, and those can never match an exact-membership stopword filter.
df['Comment'] = df['Comment'].apply(
    lambda text: [piece for i in text for piece in re.sub(r'\W', ' ', i).split()]
)
# keeps words and numbers, removes punctuation

4. Removing common English stopwords (negation words are kept)

In [128]:
from nltk.corpus import stopwords

# Negation words to retain: dropping them would flip the meaning of a comment
# ("not good" -> "good"), which is critical for sentiment analysis.
negation_words = {
    "not", "no", "nor", "never", "none", "nobody", "nothing", "neither",
    "nowhere", "don't", "isn't", "aren't", "wasn't", "weren't", "hasn't",
    "haven't", "hadn't", "won't", "wouldn't", "shan't", "shouldn't", "mightn't",
    "mustn't", "needn't", "couldn't", "n't", "against"
}

# Start from NLTK's default English stopwords, minus the negations kept above.
custom_stopwords = set(stopwords.words('english')) - negation_words

# Possessive/contraction fragments should be removed from the text as well, so
# they are ADDED to the stopword set (set union). Subtracting them, as this cell
# previously did, keeps them in the comments — the opposite of the intent.
non_critical_words = {"'s", "'m", "'re", "'d"}
custom_stopwords = custom_stopwords | non_critical_words

# Keep only the tokens that are not in the custom stopword set.
df['Comment'] = df['Comment'].apply(
    lambda tokens: [word for word in tokens if word not in custom_stopwords]
)

# MODEL TIME!!!

### Transforming the comments feature

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, capped at the 5000 features selected by max_features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Convert each token list back into a single string for the vectorizer.
# The isinstance guard keeps the cell safe to re-run: joining an
# already-joined string would insert a space between every character.
df['Comment'] = df['Comment'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else tokens
)
X = vectorizer.fit_transform(df['Comment'])

# Reuse df's index. Rows were dropped earlier without resetting the index, so
# a fresh 0..n-1 index would no longer line up with df['Sentiment'] in any
# label-aligned operation (concat, join, boolean masks built from df).
X = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

print("TF-IDF Features:")
print(X.head())


TF-IDF Features:
   000  000 intro  0000  0018  0018 joonie   10  10 years  100  1000  10000  \
0  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
1  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
2  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
3  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
4  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   

   ...  zoom  çok   đi   để   ơn   за  очень  спасибо  タッピング   너무  
0  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
1  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
2  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
3  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
4  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  

[5 rows x 5000 columns]


In [134]:
from transformers import BertTokenizer, BertForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier
# (use "distilbert-base-uncased" for DistilBERT).
model_name = "bert-base-uncased"
NUM_SENTIMENT_CLASSES = 3  # negative, neutral, positive

tokenizer = BertTokenizer.from_pretrained(model_name)

# Pre-trained encoder with a sequence-classification head sized for the
# sentiment classes; it must be fine-tuned before it is used for prediction.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_SENTIMENT_CLASSES,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [135]:
import torch
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The tokenizer is called with padding and truncation enabled, a maximum
    length of 128, and PyTorch tensors ("pt") as the return format.

    Args:
        texts: The text input(s) forwarded unchanged to ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns for these inputs and options.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Tokenize the comments
train_encodings = tokenize_function(df['Comment'].tolist())

# The classification head is built with num_labels=3, so class ids must lie in
# [0, 2]. The Sentiment column holds -1/0/1, and feeding -1 to the loss is out
# of range. Map the values to contiguous ids: negative -> 0, neutral -> 1,
# positive -> 2. An unexpected value raises KeyError instead of training on
# corrupt targets.
SENTIMENT_TO_CLASS_ID = {-1: 0, 0: 1, 1: 2}
train_labels = torch.tensor(
    [SENTIMENT_TO_CLASS_ID[sentiment] for sentiment in df['Sentiment'].tolist()]
)

In [136]:
from torch.utils.data import Dataset

class CommentDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection;
    ``labels`` is an indexable collection whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized comments and their labels for use by the trainer.
train_dataset = CommentDataset(encodings=train_encodings, labels=train_labels)

In [141]:
import inspect

import torch
from torch.utils.data import random_split
from transformers import Trainer, TrainingArguments

# NOTE: using Trainer with PyTorch requires `accelerate>=0.26.0`
# (`pip install 'accelerate>=0.26.0'` or `pip install transformers[torch]`).

# Per-epoch evaluation needs data to evaluate on, and none was being passed to
# the Trainer. Hold out 10% of the examples, with a fixed seed for reproducibility.
eval_size = max(1, int(0.1 * len(train_dataset)))
train_split, eval_split = random_split(
    train_dataset,
    [len(train_dataset) - eval_size, eval_size],
    generator=torch.Generator().manual_seed(42),
)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# pick whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_kwarg: "epoch"},
)

# Initialize Trainer with separate training and evaluation subsets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Fine-tune the model
trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

Accuracy

1. RF w/o class weights- 73.11%
2. RF w class weights - 73.56%
3. XGBoost - 74.20%
4. SVM(Linear) - 75.1%
5. SVM (Poly) - 64.3%
6. SVM (rbf) - 64.3%
7. SVM (sigmoid) - 74.68%
8. LightGBM - 74%
9. CatBoost - 72%
10. Logistic Regression - 75%
11. Naive Bayes - 69%