In [31]:
import warnings
warnings.filterwarnings("ignore")

# Importing Basic libraries

In [32]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [33]:
df = pd.read_csv('./YoutubeCommentsDataSet.csv')
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [34]:
df.duplicated().sum()

531

In [35]:
df.isnull().sum()

Comment      44
Sentiment     0
dtype: int64

In [36]:
df.dropna(inplace=True)

In [44]:
print(f"Percent of missing values in the dataset: {round(44 / len(df), 3) * 100}%")

Percent of missing values in the dataset: 0.2%


Since the duplication is less than 1% of the original dataset, we can just drop it

In [38]:
df.drop_duplicates(inplace=True)

In [39]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [40]:
df['Sentiment'].value_counts()

Sentiment
positive    11054
neutral      4503
negative     2317
Name: count, dtype: int64

In [41]:
df['Sentiment'].replace({'positive': 1, 
                         'negative': -1,
                         'neutral': 0}, inplace=True)

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,0
1,here in nz 50 of retailers don’t even have con...,-1
2,i will forever acknowledge this channel with t...,1
3,whenever i go to a place that doesn’t take app...,-1
4,apple pay is so convenient secure and easy to ...,1


## Data Cleaning

1. LowerCase all the text

In [13]:
df['Comment'] = df['Comment'].str.lower()

2. Tokenization 

In [14]:
from nltk.tokenize import word_tokenize
df['Comment'] = df['Comment'].apply(word_tokenize)

3. Removing Punctuation

In [15]:
import re

df['Comment'] = df['Comment'].apply(lambda text: [re.sub(r'\W', ' ', i) for i in text]) 
# keeps words, numbers and spaces, removes punctuations

4. Removing basic english words

In [16]:
from nltk.corpus import stopwords

# Define negation words to retain (critical for sentiment analysis)
negation_words = {
    "not", "no", "nor", "never", "none", "nobody", "nothing", "neither",
    "nowhere", "don't", "isn't", "aren't", "wasn't", "weren't", "hasn't",
    "haven't", "hadn't", "won't", "wouldn't", "shan't", "shouldn't", "mightn't",
    "mustn't", "needn't", "couldn't", "n't", "against"
}

# Load default English stopwords and remove negation words
custom_stopwords = set(stopwords.words('english')) - negation_words

# Remove additional non-critical words (optional)
non_critical_words = {"'s", "'m", "'re", "'d"}  # Remove possessives/contractions
custom_stopwords = custom_stopwords - non_critical_words

# Apply custom stopwords to the DataFrame
df['Comment'] = df['Comment'].apply(
    lambda tokens: [word for word in tokens if word not in custom_stopwords]
)

# MODEL TIME!!!

### Transforming the comments feature

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Unigrams + bigrams

df['Comment'] = df['Comment'].apply(lambda tokens: ' '.join(tokens)) # convert list of tokens to string
X = vectorizer.fit_transform(df['Comment'])

X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

print("TF-IDF Features:")
print(X.head())


TF-IDF Features:
   000  000 intro  0000  0018  0018 joonie   10  10 years  100  1000  10000  \
0  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
1  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
2  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
3  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   
4  0.0        0.0   0.0   0.0          0.0  0.0       0.0  0.0   0.0    0.0   

   ...  zoom  çok   đi   để   ơn   за  очень  спасибо  タッピング   너무  
0  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
1  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
2  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
3  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  
4  ...   0.0  0.0  0.0  0.0  0.0  0.0    0.0      0.0    0.0  0.0  

[5 rows x 5000 columns]


### Fine Tuning BERT Model

In [18]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"  # Use "distilbert-base-uncased" for DistilBERT
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes: -1, 0, 1





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def tokenize_function(texts):
    return tokenizer(
        texts, padding=True, truncation=True, max_length=128, return_tensors="tf"
    )

# Tokenize the comments
train_encodings = tokenize_function(df['Comment'].tolist())
train_labels = tf.constant(df['Sentiment'].tolist())

In [20]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),  # Input features
    train_labels            # Labels
)).shuffle(1000).batch(16)  # Batch size = 16

In [21]:
# Define optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [22]:
# Fine-tune the model
history = model.fit(
    train_dataset,
    epochs=3,  # Number of epochs
    batch_size=16,  # Batch size
)

Epoch 1/3



InvalidArgumentError: Graph execution error:

Detected at node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\mayan\anaconda3\Lib\asyncio\base_events.py", line 641, in run_forever

  File "c:\Users\mayan\anaconda3\Lib\asyncio\base_events.py", line 1987, in _run_once

  File "c:\Users\mayan\anaconda3\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\mayan\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\mayan\AppData\Local\Temp\ipykernel_8420\3659685789.py", line 2, in <module>

  File "c:\Users\mayan\anaconda3\Lib\site-packages\transformers\modeling_tf_utils.py", line 1229, in fit

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1804, in fit

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1381, in step_function

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step

  File "c:\Users\mayan\anaconda3\Lib\site-packages\transformers\modeling_tf_utils.py", line 1706, in train_step

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\engine\compile_utils.py", line 277, in __call__

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\losses.py", line 143, in __call__

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\losses.py", line 270, in call

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\losses.py", line 2454, in sparse_categorical_crossentropy

  File "c:\Users\mayan\anaconda3\Lib\site-packages\tf_keras\src\backend.py", line 5777, in sparse_categorical_crossentropy

Received a label value of -1 which is outside the valid range of [0, 3).  Label values: -1 1 0 -1 1 -1 1 -1 0 1 1 0 1 0 -1 -1
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_34488]

In [None]:
# Tokenize test data
test_encodings = tokenize_function(test_df['Comment'].tolist())
test_labels = tf.constant(test_df['Sentiment'].tolist())

# Create test dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),  # Input features
    test_labels            # Labels
)).batch(16)  # Batch size = 16

# Evaluate
results = model.evaluate(test_dataset)
print("Test Accuracy:", results[1])

Accuracy

1. RF w/o class weights- 73.11%
2. RF w class weights - 73.56%
3. XGBoost - 74.20%
4. SVM(Linear) - 75.1%
5. SVM (Poly) - 64.3%
6. SVM (rbf) - 64.3%
7. SVM (sigmoid) - 74.68%
8. LGMBoost - 74%
9. CatBoost - 72%
9. Logistic Regression - 75%
11. Naive Bayes - 69%