# Import Libraries

In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer # chuyển đổi các tài liệu văn bản thô thành ma trận tf-idf, đánh giá mức
# độ quan trọng của từ trong một tài liệu liên quan đến toàn bộ bộ sưu tập
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

import spacy

import warnings
warnings.filterwarnings('ignore')

# Explore Data

### Dataset: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

In [88]:
col = ['id','country','Label','Text']
data = pd.read_csv("dataset/twitter_training.csv", names=col)

In [89]:
data.head()

Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
data.tail()

Unnamed: 0,id,country,Label,Text
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [21]:
data.shape

(74682, 4)

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   Label    74682 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [8]:
data['Label'].value_counts()

Label
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [90]:
print(f"{data['Text'][555]} -> {data['Label'][555]}")

Trying it -> Neutral


# Preprocessing

In [91]:
data_copy = data.copy()
data_copy.dropna(inplace=True)

In [92]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73996 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       73996 non-null  int64 
 1   country  73996 non-null  object
 2   Label    73996 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [93]:
# Preprocess Function
nlp = spacy.load("en_core_web_sm") 
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [35]:
count=0
for text in data_copy['Text']:
    doc = nlp(text)  # Phân tích văn bản cho từng chuỗi
    for token in doc:
        if token.is_stop or token.is_punct:
            print(f"Token: {token.text}, is_stop: {token.is_stop}, is_punct: {token.is_punct}")
            count+=1
            if count == 20:  # Dừng sau khi in 20 token
                break
    if count==20:
        break


Token: i, is_stop: True, is_punct: False
Token: on, is_stop: True, is_punct: False
Token: and, is_stop: True, is_punct: False
Token: i, is_stop: True, is_punct: False
Token: will, is_stop: True, is_punct: False
Token: you, is_stop: True, is_punct: False
Token: all, is_stop: True, is_punct: False
Token: ,, is_stop: False, is_punct: True
Token: I, is_stop: True, is_punct: False
Token: am, is_stop: True, is_punct: False
Token: to, is_stop: True, is_punct: False
Token: the, is_stop: True, is_punct: False
Token: and, is_stop: True, is_punct: False
Token: I, is_stop: True, is_punct: False
Token: will, is_stop: True, is_punct: False
Token: you, is_stop: True, is_punct: False
Token: all, is_stop: True, is_punct: False
Token: ,, is_stop: False, is_punct: True
Token: i, is_stop: True, is_punct: False
Token: on, is_stop: True, is_punct: False


In [94]:
data_copy['Preprocessed Text'] = data_copy['Text'].apply(preprocess) 

In [95]:
data_copy

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [96]:
# Encoding target column
le = LabelEncoder()
data_copy['Label'] = le.fit_transform(data_copy['Label'])

In [97]:
data_copy

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,3,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,3,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,3,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,3,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [None]:
# Hiển thị các nhãn gốc tương ứng với các giá trị số
print("Label mapping:", dict(zip(le.classes_, range(len(le.classes_)))))




Label mapping: {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}


In [99]:
# Split data into train and test

x_train, x_test, y_train, y_test = train_test_split(data_copy['Preprocessed Text'], data_copy['Label'], 
                                                    test_size=0.2, random_state=42, stratify=data_copy['Label'])

In [47]:
x_test.shape

(14800,)

In [48]:
x_train.shape

(59196,)

# ML Model

#### Naive Bayes 

In [100]:
# classifier
# tfid: chuyển văn bản thành vector số , naive_bayes được huấn luyện để phân biệt
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (MultinomialNB()))         
])

In [101]:
clf.fit(x_train, y_train)

In [102]:
y_pred = clf.predict(x_test)

In [103]:
print(accuracy_score(y_test, y_pred))

0.7322972972972973


In [104]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.46      0.62      2575
           1       0.65      0.90      0.76      4472
           2       0.84      0.64      0.72      3622
           3       0.71      0.81      0.76      4131

    accuracy                           0.73     14800
   macro avg       0.79      0.70      0.71     14800
weighted avg       0.77      0.73      0.72     14800



In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Giả sử 'data_copy' là DataFrame của bạn và đã có cột 'Preprocessed Text' chứa văn bản đã qua xử lý
# Bạn có thể thay đổi cột 'Preprocessed Text' thành cột văn bản bạn muốn

# Khởi tạo TfidfVectorizer
vectorizer = TfidfVectorizer()

# Áp dụng TfidfVectorizer lên dữ liệu văn bản đã được tiền xử lý
X = vectorizer.fit_transform(data_copy['Preprocessed Text'].head(1))

# Hiển thị các vector số học tương ứng
print(X.toarray())  # Chuyển đổi thành mảng số học để dễ dàng hiển thị


[[0.57735027 0.57735027 0.57735027]]


#### Random Forest

In [106]:
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (RandomForestClassifier()))         
])

In [107]:
clf.fit(x_train, y_train)

In [108]:
y_pred = clf.predict(x_test)

In [109]:
print(accuracy_score(y_test, y_pred))

0.9110135135135136


In [62]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91      2575
           1       0.93      0.93      0.93      4472
           2       0.94      0.89      0.92      3622
           3       0.85      0.94      0.90      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.91      0.91     14800
weighted avg       0.92      0.91      0.91     14800



# Testing

In [110]:
test_data = pd.read_csv('dataset/twitter_validation.csv', names=col)

In [111]:
test_data.head()

Unnamed: 0,id,country,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [112]:
test_txt = test_data['Text'][555]
print(f"{test_txt} ===> {test_data['Label'][355]}")

@Rainbow6Game servers.... where they at????? Tired of getting disconnected from every single game. #RainbowSixSiege pic.twitter.com/2lS3iMC8Ki ===> Negative


In [113]:
# Apply preprocess

test_txt_processed = [preprocess(test_txt)]
test_txt_processed

['@Rainbow6Game server tired get disconnected single game RainbowSixSiege pic.twitter.com/2lS3iMC8Ki']

In [114]:
# Get Prediction

test_txt = clf.predict(test_txt_processed)

In [115]:
print(test_txt[0])

1


In [117]:
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {test_data['Label'][35]}")
print(f'Predict Label: {classes[test_txt[0]]}')

True Label: Neutral
Predict Label: Natural
