### Import the required libraries and load the data

In [1]:
import numpy as np
import pandas as pd
from underthesea import word_tokenize
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# filename = 'train_nor_811.csv' # file name
filename = 'twitter_training.csv'
df = pd.read_csv(filename)
print(df.shape)
df.head(5)

(74682, 2)


Unnamed: 0,Sentiment,Content
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# Devide data into 2 parts: X (features) and y (labels)
MAX_DATA_NUM = 10000
# X = df['Sentence'][:MAX_DATA_NUM].astype(str) # texts
# y = df['Emotion'][:MAX_DATA_NUM] # labels
X = df['Content'][:MAX_DATA_NUM].astype(str) # texts
y = df['Sentiment'][:MAX_DATA_NUM] # labels
for i in range(5):
    print(X[i],end='\t')
    print(y[i])

im getting on borderlands and i will murder you all ,	Positive
I am coming to the borders and I will kill you all,	Positive
im getting on borderlands and i will kill you all,	Positive
im coming on borderlands and i will murder you all,	Positive
im getting on borderlands 2 and i will murder you me all,	Positive


### Preprocessing

In [4]:
X_tokenized = [word_tokenize(sent, format="text").lower() for sent in X]
for i in range(5):
    print(X_tokenized[i], y[i])

im getting on_borderlands and i will murder you all , Positive
i am_coming to the_borders and i will kill you all , Positive
im getting on_borderlands and i will kill you all , Positive
im coming on_borderlands and i will murder you all , Positive
im getting on borderlands 2 and i will murder you me all , Positive


In [5]:
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
def get_X_encoded(X_tokenized, mode, ngram_range=(1, 2)):
    # if mode == 'binary':
    #     vectorizer = CountVectorizer(binary=True, ngram_range=ngram_range)
    # elif mode == 'count':
    #     vectorizer = CountVectorizer(ngram_range=ngram_range)
    # elif mode == 'freq':
    #     vectorizer = TfidfVectorizer(use_idf=False, ngram_range=ngram_range)
    # elif mode == 'tfidf':
    #     vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    # else:
    #     raise ValueError('Invalid mode: %s' % mode)
    # X_encoded = vectorizer.fit_transform(X_tokenized)

    t = Tokenizer()
    t.fit_on_texts(X_tokenized)
    X_encoded = t.texts_to_matrix(X_tokenized, mode=mode)
    return X_encoded, t

def train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test):
    print("=== %s ===" % clf_name)
    print("Training %s ..." % clf_name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f" % accuracy)
    print("Classification report:")
    print(classification_report(y_test, y_pred))
    return ("%.2f" % accuracy, clf)

### Training the model

In [6]:
modes = ['binary', 'count', 'freq', 'tfidf']
# Choose 1 mode randomly
mode = modes[np.random.randint(0, len(modes))]
X_encoded, vectorizer = get_X_encoded(X_tokenized, mode)
# X_encoded = X_encoded.toarray()
print('Mode:', mode)

Mode: count


In [7]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=42)

In [8]:
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'kNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear')
}
models = {name : None for name in classifiers.keys()}

#### Train and evaluate the Naive Bayes model

In [9]:
clf_name = 'Naive Bayes'
clf = classifiers[clf_name]
models[clf_name] = train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test)

=== Naive Bayes ===
Training Naive Bayes ...
Accuracy: 0.81
Classification report:
              precision    recall  f1-score   support

  Irrelevant       0.86      0.72      0.78       446
    Negative       0.79      0.84      0.82       589
     Neutral       0.89      0.74      0.81       662
    Positive       0.75      0.90      0.82       803

    accuracy                           0.81      2500
   macro avg       0.82      0.80      0.81      2500
weighted avg       0.82      0.81      0.81      2500



#### Logistic Regression

In [10]:
clf_name = 'Logistic Regression'
clf = classifiers[clf_name]
models[clf_name] = train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test)

=== Logistic Regression ===
Training Logistic Regression ...


Accuracy: 0.88
Classification report:
              precision    recall  f1-score   support

  Irrelevant       0.91      0.85      0.88       446
    Negative       0.88      0.87      0.88       589
     Neutral       0.91      0.88      0.90       662
    Positive       0.85      0.91      0.88       803

    accuracy                           0.88      2500
   macro avg       0.89      0.88      0.88      2500
weighted avg       0.89      0.88      0.88      2500



#### SVM

In [13]:
clf_name = 'SVM'
clf = classifiers[clf_name]
models[clf_name] = train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test)

=== SVM ===
Training SVM ...
Accuracy: 0.88
Classification report:
              precision    recall  f1-score   support

  Irrelevant       0.88      0.87      0.87       446
    Negative       0.88      0.85      0.87       589
     Neutral       0.90      0.87      0.89       662
    Positive       0.86      0.91      0.88       803

    accuracy                           0.88      2500
   macro avg       0.88      0.87      0.88      2500
weighted avg       0.88      0.88      0.88      2500



#### Random Forest

In [14]:
clf_name = 'RandomForest'
clf = classifiers[clf_name]
models[clf_name] = train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test)

=== RandomForest ===
Training RandomForest ...
Accuracy: 0.93
Classification report:
              precision    recall  f1-score   support

  Irrelevant       0.96      0.89      0.93       446
    Negative       0.94      0.91      0.92       589
     Neutral       0.91      0.96      0.93       662
    Positive       0.92      0.94      0.93       803

    accuracy                           0.93      2500
   macro avg       0.93      0.93      0.93      2500
weighted avg       0.93      0.93      0.93      2500



#### Decision Tree

In [15]:
clf_name = 'Decision Tree'
clf = classifiers[clf_name]
models[clf_name] = train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test)

=== Decision Tree ===
Training Decision Tree ...
Accuracy: 0.78
Classification report:
              precision    recall  f1-score   support

  Irrelevant       0.77      0.73      0.75       446
    Negative       0.79      0.76      0.78       589
     Neutral       0.76      0.82      0.79       662
    Positive       0.80      0.79      0.79       803

    accuracy                           0.78      2500
   macro avg       0.78      0.78      0.78      2500
weighted avg       0.78      0.78      0.78      2500



#### k Nearest Neighbors

In [16]:
clf_name = 'kNN'
clf = classifiers[clf_name]
models[clf_name] = train_and_evaluate(clf_name, clf, X_train, X_test, y_train, y_test)

=== kNN ===
Training kNN ...
Accuracy: 0.86
Classification report:
              precision    recall  f1-score   support

  Irrelevant       0.85      0.84      0.85       446
    Negative       0.86      0.86      0.86       589
     Neutral       0.82      0.90      0.86       662
    Positive       0.91      0.83      0.87       803

    accuracy                           0.86      2500
   macro avg       0.86      0.86      0.86      2500
weighted avg       0.86      0.86      0.86      2500



In [25]:
question = input() # I really like this book
X_input = vectorizer.texts_to_matrix([question],mode=mode)
print(X_input)
for clf_name, clf in models.items():
    print(f'Predicted sentiment by {clf_name}: {clf[1].predict(X_input)}')

[[0. 0. 1. ... 0. 0. 0.]]
Predicted sentiment by Logistic Regression: ['Positive']
Predicted sentiment by Decision Tree: ['Positive']
Predicted sentiment by kNN: ['Positive']
Predicted sentiment by RandomForest: ['Positive']
Predicted sentiment by Naive Bayes: ['Positive']
Predicted sentiment by SVM: ['Positive']


In [20]:
models

{'Logistic Regression': ('0.88', LogisticRegression(max_iter=1000)),
 'Decision Tree': ('0.78', DecisionTreeClassifier()),
 'kNN': ('0.86', KNeighborsClassifier()),
 'RandomForest': ('0.93', RandomForestClassifier()),
 'Naive Bayes': ('0.81', MultinomialNB()),
 'SVM': ('0.88', SVC(kernel='linear'))}

##### Validate models on accuracy and show the graph and choose the best model
> Accuracy (Độ chính xác) của các model lần lượt là:
- Decision Tree: 0.78
- Naive Bayes: 0.81
- kNN: 0.86
- SVM: 0.88
- Logistic Regression: 0.88
- Random Forest: 0.93

Như vậy, Random Forest là model có độ chính xác cao nhất với 93%.