In [22]:
import warnings
warnings.filterwarnings("ignore")

# Importing Basic libraries

In [23]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [24]:
df = pd.read_csv('./YoutubeCommentsDataSet.csv')
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don‚Äôt even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn‚Äôt take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [25]:
df.isnull().sum()

Comment      44
Sentiment     0
dtype: int64

In [26]:
df.dropna(inplace=True)

In [27]:
print(f"Percent of duplication in the dataset: {round(df.duplicated().sum() / len(df), 3) * 100}%")

Percent of duplication in the dataset: 2.7%


Since the duplication is less than 3% of the original dataset, we can just drop it

In [28]:
df.drop_duplicates(inplace=True)

In [29]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don‚Äôt even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn‚Äôt take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [30]:
df['Sentiment'].value_counts()

Sentiment
positive    11054
neutral      4503
negative     2317
Name: count, dtype: int64

In [31]:
df['Sentiment'].replace({'positive': 2, 
                         'negative': 0,
                         'neutral': 1}, inplace=True)

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,1
1,here in nz 50 of retailers don‚Äôt even have con...,0
2,i will forever acknowledge this channel with t...,2
3,whenever i go to a place that doesn‚Äôt take app...,0
4,apple pay is so convenient secure and easy to ...,2


## Data Cleaning

1. LowerCase all the text

In [32]:
df['Comment'] = df['Comment'].str.lower()

2. Tokenization 

In [33]:
from nltk.tokenize import word_tokenize
df['Comment'] = df['Comment'].apply(word_tokenize)

3. Removing Punctuation

In [34]:
import re

df['Comment'] = df['Comment'].apply(lambda text: [re.sub(r'\W', ' ', i) for i in text]) 
# keeps words, numbers and spaces, removes punctuations

4. Removing basic english words

In [35]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))  
df['Comment'] = df['Comment'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# MODEL TIME!!!

### Transforming the comments feature

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(max_features=10000)

df['Comment'] = df['Comment'].apply(lambda tokens: ' '.join(tokens)) # convert list of tokens to string
X = vector.fit_transform(df['Comment'])

X = pd.DataFrame(X.toarray(), columns=vector.get_feature_names_out())

print("TF-IDF Features:")
print(X.head())


TF-IDF Features:
   000  0000  000000  0001  001  0018   01  010   02  0212  ...  Ïò§ÎäòÏùÄ  ÏùåÏãùÏù¥  \
0  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
1  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
2  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
3  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
4  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   

    Ï†ïÎßê   ÏßÑÏßú  Ìé∏ÏùòÏ†ê  ùóßùóøùó≤ùóªùó±ùó∂ùóªùó¥  ùóÆùó±ùòÇùóπùòÅ  ùóºùóªùóπùòÜ  ùóΩùóºùóøùóª  ùòÉùó∂ùó±ùó≤ùóºùòÄ  
0  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
1  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
2  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
3  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
4  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  

[5 rows x 10000 columns]


In [37]:
from sklearn.model_selection import train_test_split as tts 

x_train, x_test, y_train, y_test = tts(X, df['Sentiment'], test_size=0.2, random_state=42)

In [38]:
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='mlogloss')

model.fit(x_train, y_train)

In [39]:
pred = model.predict(x_test)

In [40]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(f"Accuracy: {accuracy_score(y_test, pred)}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

Accuracy: 0.742097902097902

Confusion Matrix:
 [[ 116  167  173]
 [  42  602  262]
 [  28  250 1935]]

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.25      0.36       456
           1       0.59      0.66      0.63       906
           2       0.82      0.87      0.84      2213

    accuracy                           0.74      3575
   macro avg       0.68      0.60      0.61      3575
weighted avg       0.73      0.74      0.73      3575



Accuracy

1. RF w/o class weights- 73.11%
2. RF w class weights - 73.56%
3. XGBoost - 74.20%