In [235]:
import warnings
warnings.filterwarnings("ignore")

# Importing Basic libraries

In [236]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [237]:
df = pd.read_csv('./YoutubeCommentsDataSet.csv')
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [238]:
df.isnull().sum()

Comment      44
Sentiment     0
dtype: int64

In [239]:
df.dropna(inplace=True)

In [240]:
print(f"Percent of duplication in the dataset: {round(df.duplicated().sum() / len(df), 3) * 100}%")

Percent of duplication in the dataset: 2.7%


Since the duplication is less than 3% of the original dataset, we can just drop it

In [241]:
df.drop_duplicates(inplace=True)

In [242]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [243]:
df['Sentiment'].value_counts()

Sentiment
positive    11054
neutral      4503
negative     2317
Name: count, dtype: int64

In [244]:
df['Sentiment'].replace({'positive': 1, 
                         'negative': -1,
                         'neutral': 0}, inplace=True)

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,0
1,here in nz 50 of retailers don’t even have con...,-1
2,i will forever acknowledge this channel with t...,1
3,whenever i go to a place that doesn’t take app...,-1
4,apple pay is so convenient secure and easy to ...,1


## Data Cleaning

1. LowerCase all the text

In [245]:
df['Comment'] = df['Comment'].str.lower()

2. Tokenization 

In [246]:
from nltk.tokenize import word_tokenize
df['Comment'] = df['Comment'].apply(word_tokenize)

3. Removing Punctuation

In [247]:
import re

df['Comment'] = df['Comment'].apply(lambda text: [re.sub(r'\W', ' ', i) for i in text]) 
# keeps words, numbers and spaces, removes punctuations

4. Removing basic english words

In [248]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))  
df['Comment'] = df['Comment'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# MODEL TIME!!!

### Transforming the comments feature

In [249]:
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(max_features=10000)

df['Comment'] = df['Comment'].apply(lambda tokens: ' '.join(tokens)) # convert list of tokens to string
X = vector.fit_transform(df['Comment'])

X = pd.DataFrame(X.toarray(), columns=vector.get_feature_names_out())

print("TF-IDF Features:")
print(X.head())


TF-IDF Features:
   000  0000  000000  0001  001  0018   01  010   02  0212  ...  오늘은  음식이  \
0  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
1  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
2  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
3  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   
4  0.0   0.0     0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0  ...  0.0  0.0   

    정말   진짜  편의점  𝗧𝗿𝗲𝗻𝗱𝗶𝗻𝗴  𝗮𝗱𝘂𝗹𝘁  𝗼𝗻𝗹𝘆  𝗽𝗼𝗿𝗻  𝘃𝗶𝗱𝗲𝗼𝘀  
0  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
1  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
2  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
3  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  
4  0.0  0.0  0.0       0.0    0.0   0.0   0.0     0.0  

[5 rows x 10000 columns]


In [250]:
from sklearn.model_selection import train_test_split as tts 

x_train, x_test, y_train, y_test = tts(X, df['Sentiment'], test_size=0.2, random_state=42)

In [251]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=1000, random_state=42)

model.fit(x_train, y_train)
pred = model.predict(x_test)

In [253]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(f"Accuracy: {accuracy_score(y_test, pred)}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

Accuracy: 0.7311888111888112

Confusion Matrix:
 [[  77  124  255]
 [  26  512  368]
 [  12  176 2025]]

Classification Report:
               precision    recall  f1-score   support

          -1       0.67      0.17      0.27       456
           0       0.63      0.57      0.60       906
           1       0.76      0.92      0.83      2213

    accuracy                           0.73      3575
   macro avg       0.69      0.55      0.57      3575
weighted avg       0.72      0.73      0.70      3575

