# Import Libraries

In [2]:
from google.colab import drive

# Mount Google Drive at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [26]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn import svm
import warnings

# Silence only upgrade-noise categories. A blanket filterwarnings("ignore")
# would also hide warnings that signal real problems, e.g. a solver that did
# not converge or a metric that is undefined for a never-predicted class.
for _noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

In [27]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus the CUDA generators when a GPU is available), and configures
    cuDNN for deterministic behaviour.

    Args:
        seed_value (int): Seed applied to every generator.
    """
    # Local import: the notebook's import cell does not bring ``random`` in.
    import random

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune and choose kernels at run time,
        # which works against the deterministic flag above; keep it off so
        # repeated runs select the same algorithms.
        torch.backends.cudnn.benchmark = False


seed_everything(86)

In [29]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

EPOCHS = 10

# Dataset locations on the mounted Drive.
train_path = "/content/drive/MyDrive/Colab Notebooks/CLB AI/TEMPO RUN/2023_01/train/train.txt"
# NOTE(review): val_path is the same file as train_path — confirm whether a
# separate validation file was intended.
val_path = "/content/drive/MyDrive/Colab Notebooks/CLB AI/TEMPO RUN/2023_01/train/train.txt"
test_path = "/content/drive/MyDrive/Colab Notebooks/CLB AI/TEMPO RUN/2023_01/private_test"

# Prepare Dataset

In [44]:
def labelencoder(text):
    """Map a sentiment label string to its integer class id.

    Surrounding whitespace and letter case are ignored, so ``" Positive "``
    encodes the same as ``"positive"``.

    Args:
        text: Label as read from the data file.

    Returns:
        int: 2 for ``positive``, 1 for ``neutral`` and 0 for anything else
        (``negative``, unrecognised labels and non-string input).
    """
    if isinstance(text, str):
        # Stray spaces or capitalisation in the label column would otherwise
        # fall through silently to class 0.
        text = text.strip().lower()
    if text == 'positive':
        return 2
    if text == 'neutral':
        return 1
    return 0

# Read the training file.
def get_train_data(file_path):
  """Load tab-separated training samples from ``file_path``.

  Each line is split on tab characters: the first field is the label and the
  second field is the text. The text is tokenised with ``simple_preprocess``
  and re-joined with single spaces; the label is encoded with
  ``labelencoder``. Lines with fewer than two fields are skipped.

  Args:
    file_path (str): Path of the UTF-8 encoded training file.

  Returns:
    list: ``[train_text, train_label]`` — two lists of equal length holding
    the preprocessed texts and the integer labels.
  """
  train_text = []
  train_label = []
  with open(file_path, "r", encoding="utf-8") as f:
    for line in f.read().splitlines():
      fields = line.split("\t")
      if len(fields) < 2:
        # Malformed line (no tab separator): skip it explicitly instead of
        # relying on a catch-all ``except`` that would also hide real errors.
        continue
      # Compute both values before appending so the two lists cannot get
      # out of step if either call fails.
      text = ' '.join(simple_preprocess(fields[1]))
      label = labelencoder(fields[0])
      train_text.append(text)
      train_label.append(label)

  return [train_text, train_label]

# Read the test files.
def get_test_data(file_path):
  """Load test samples from the ``input.txt`` / ``labels.txt`` pair in a directory.

  Every line of ``input.txt`` is tokenised with ``simple_preprocess`` and
  re-joined with single spaces; every line of ``labels.txt`` is encoded with
  ``labelencoder``.

  Args:
    file_path (str): Directory containing ``input.txt`` and ``labels.txt``.

  Returns:
    list: ``[test_text, test_label]`` — two lists of equal length.

  Raises:
    ValueError: If the two files do not contain the same number of lines,
      since texts and labels could then no longer be paired up.
  """
  with open(file_path + "/input.txt", "r", encoding="utf-8") as f:
    test_text = [' '.join(simple_preprocess(line)) for line in f.read().splitlines()]

  with open(file_path + "/labels.txt", "r", encoding="utf-8") as f:
    test_label = [labelencoder(line) for line in f.read().splitlines()]

  # Fail here rather than return silently misaligned texts and labels.
  if len(test_text) != len(test_label):
    raise ValueError(
        "input.txt has %d lines but labels.txt has %d lines"
        % (len(test_text), len(test_label))
    )

  return [test_text, test_label]

# Build and Train the Model

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Load the training split: texts and their integer labels.
train_data = get_train_data(train_path)
X_train, y_train = train_data

# Token-count features (vocabulary capped at 100 terms) feeding a linear SVM
# that uses balanced class weights.
vectorizer = CountVectorizer(max_features=100)
classifier = LinearSVC(C=1.0, class_weight="balanced")
pipeline_steps = [("vectorizer", vectorizer), ("classifier", classifier)]
model = Pipeline(pipeline_steps)

model.fit(X_train, y_train)


# Evaluate the Model

In [47]:
# Load the test split and predict a class id for every test sentence.
test_data = get_test_data(test_path)
X_test, y_test = test_data
y_predict = model.predict(X_test)

In [48]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Precision, recall and F1 are averaged across classes, weighted by support.
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict, average='weighted')
recall = recall_score(y_test, y_predict, average='weighted')
f1 = f1_score(y_test, y_predict, average='weighted')

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.57
Precision: 0.4465563792560161
Recall: 0.57
F1 Score: 0.4876118791602662


# Inference

In [50]:
# Class id -> sentiment name, used to turn predictions back into text.
label_map = dict(enumerate(('negative', 'neutral', 'positive')))

In [57]:
# Apply the same simple_preprocess normalisation that the training and test
# texts received, so the vectorizer sees input tokenised the same way.
X_new = [' '.join(simple_preprocess("Thầy giảng hay lắm"))]
y_new_pred = model.predict(X_new)
y_new_pred_str = label_map[y_new_pred[0]]
print(y_new_pred_str)

positive


In [58]:
# Apply the same simple_preprocess normalisation that the training and test
# texts received, so the vectorizer sees input tokenised the same way.
X_new = [' '.join(simple_preprocess("Khá chán"))]
y_new_pred = model.predict(X_new)
y_new_pred_str = label_map[y_new_pred[0]]
print(y_new_pred_str)

negative
