In [140]:
import warnings
warnings.filterwarnings("ignore")

# Importing Basic libraries

In [141]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [142]:
df = pd.read_csv('./YoutubeCommentsDataSet.csv')
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [143]:
df.duplicated().sum()

531

In [144]:
df.isnull().sum()

Comment      44
Sentiment     0
dtype: int64

In [145]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [146]:
print(f"Percent of values to drop in the dataset: {round((44+531) / len(df), 3) * 100}%")

Percent of values to drop in the dataset: 3.2%


Since the missing values are just less than 4% of the original dataset, we can just drop it

In [147]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [148]:
df['Sentiment'].replace({'positive': 2, 
                         'negative': 0,
                         'neutral': 1}, inplace=True)

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,1
1,here in nz 50 of retailers don’t even have con...,0
2,i will forever acknowledge this channel with t...,2
3,whenever i go to a place that doesn’t take app...,0
4,apple pay is so convenient secure and easy to ...,2


# MODEL TIME!!!

### Splitting the dataset

In [150]:
from sklearn.model_selection import train_test_split as tts 

x_train, x_test, y_train, y_test = tts(df['Comment'], df['Sentiment'], test_size=0.2, random_state=42)

In [126]:
df['Sentiment'].value_counts()

Sentiment
positive    11054
neutral      4503
negative     2317
Name: count, dtype: int64

### Applying class weights 

In [173]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight="balanced", classes = np.unique(y_train), y = y_train)

class_weights_dict = {i: round(class_weights[i],3) for i in range(len(class_weights))}

print(f"Weights for classes: {class_weights_dict}")

Weights for classes: {0: 2.561, 1: 1.325, 2: 0.539}


### Tokenizing BERT Model

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(
    x_train.tolist(), truncation=True, padding=True, max_length=512, return_tensors="tf"
)

test_encodings = tokenizer(
    x_test.tolist(), truncation=True, padding=True, max_length=512, return_tensors="tf"
)

### Converting to Tensorflow dataset

In [193]:
import tensorflow as tf

# Tokenize the dataset
def tokenize_data(texts, labels, tokenizer, max_length=128):
    encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length)
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    return dataset

# Convert train & test data into tokenized format
train_dataset = tokenize_data(x_train, y_train, tokenizer)
test_dataset = tokenize_data(x_test, y_test, tokenizer)

In [194]:
from transformers import AutoTokenizer, TFBertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AdamWeightDecay
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Define optimizer and loss function
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

# Compile the model
model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

# Train the model
model.fit(train_dataset.batch(16), validation_data=test_dataset.batch(16), epochs=3)


Epoch 1/3
