# Import dataset

In [None]:
import pandas as pd
import numpy as np

# Every dataset artefact lives under this one Drive folder; keep the path in one place.
DATA_DIR = "/content/drive/MyDrive/Dataset/DS102_Project_Dataset_/Toxic_comments_dataset"

# Each split is an Excel sheet from which the "comment" and "label" columns are used.
X_train = pd.read_excel(f"{DATA_DIR}/train.xlsx")
X_test = pd.read_excel(f"{DATA_DIR}/test.xlsx")
X_dev = pd.read_excel(f"{DATA_DIR}/dev.xlsx")

# Labels are pulled out as plain numpy arrays before the frames are narrowed.
y_train = np.array(X_train["label"])
y_test = np.array(X_test["label"])
y_dev = np.array(X_dev["label"])

# Keep only the text column; each X_* stays a single-column DataFrame.
X_train = pd.DataFrame(X_train["comment"])
X_test = pd.DataFrame(X_test["comment"])
X_dev = pd.DataFrame(X_dev["comment"])

# Load the stop-word list as a set of strings, one entry per non-empty line.
# It used to be a DataFrame (pd.read_csv), and `word in DataFrame` tests the
# column labels rather than the cell values, so no stop word was ever matched.
# NOTE(review): tokens produced by underthesea join multi-syllable words with
# "_"; confirm the entries in vietnamese.txt use the same convention.
with open(f"{DATA_DIR}/Run_model/vietnamese.txt", encoding="utf-8-sig") as stopword_file:
    stopword = {entry.strip() for entry in stopword_file if entry.strip()}

In [None]:
pip install underthesea

# Pre-process

## Functions

In [None]:
def remove_stopwords(line):
    """Return `line` without the tokens that are found in `stopword`.

    The line is split on whitespace; every token for which
    ``token not in stopword`` holds (module-level ``stopword``) is kept.
    The kept tokens are joined back with single spaces, so leading,
    trailing and repeated whitespace does not survive.
    """
    kept_tokens = [token for token in line.split() if token not in stopword]
    return ' '.join(kept_tokens)

In [None]:
def word_tokenize(str):
  """Segment text into words with underthesea and return it as one string.

  Parameters
  ----------
  str : the raw text to segment. The name shadows the builtin ``str``; it is
      kept unchanged so existing keyword callers continue to work.

  Returns
  -------
  The value of ``underthesea.word_tokenize(str, format="text")``.
  """
  # Imported lazily (the package is installed by an earlier cell) and under an
  # alias so the library function is not confused with this wrapper.
  from underthesea import word_tokenize as _underthesea_tokenize
  # Tokenize exactly once: the previous version also made an extra call whose
  # result was thrown away, doubling the cost of the slowest pipeline step.
  return _underthesea_tokenize(str, format="text")

In [None]:
def text_preprocess(document):
  """Normalise one comment: lowercase it, strip symbols, tidy whitespace.

  Steps, in order:
    1. lowercase the whole string;
    2. replace every character that is not whitespace, a word character,
       one of the listed Vietnamese letters or "_" with a single space;
    3. collapse whitespace runs to one space and trim both ends.

  Returns the cleaned string.
  """
  import regex as re

  # Anything matched by this negated class is considered noise.
  noise_pattern = r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]'

  lowered = document.lower()
  without_noise = re.sub(noise_pattern, ' ', lowered)
  return re.sub(r'\s+', ' ', without_noise).strip()

In [None]:
def col_preprocess(data):
  """Clean the "comment" column of `data` and return the same frame.

  Every comment is passed through word_tokenize -> text_preprocess ->
  remove_stopwords, in that order. The cleaned column replaces
  ``data["comment"]`` on the frame that was passed in, and that frame is
  returned, so both ``col_preprocess(df)`` and ``df = col_preprocess(df)``
  work as before.

  Missing cells are treated as empty comments and non-string cells are
  converted with ``str`` so a stray NaN or number cannot crash the pipeline.
  """
  def _clean(comment):
    # Same step order as the original per-row loop.
    return remove_stopwords(text_preprocess(word_tokenize(comment)))

  comments = data["comment"].fillna("").astype(str)
  # Assign the whole column at once instead of writing through
  # `Series.values[i]`, which fails when pandas hands back a read-only array
  # (copy-on-write) and is not guaranteed to write into the frame at all.
  data["comment"] = comments.map(_clean)
  return data

## Run code

In [None]:
# Apply the same cleaning pipeline to every split.
# The generator is consumed left to right, so the order is train, test, dev.
X_train, X_test, X_dev = (
    col_preprocess(split) for split in (X_train, X_test, X_dev)
)

# Run models

In [None]:
# Number of cross-validation folds passed to each grid search.
CV_LogReg, CV_SVM = 3, 3

# Bounds of the n-gram range handed to both vectorizers.
ngram_range_lower = ngram_range_upper = 1

## Encoder: CountVectorizer

### Encode

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based encoder; it is fitted on the training comments only.
# squeeze() turns the single-column frame into the Series the vectorizer iterates over.
train_comments = X_train.squeeze()
encoder = CountVectorizer(
    ngram_range=(ngram_range_lower, ngram_range_upper),
)
encoder.fit(train_comments)

In [None]:
# Encode all three splits with the already-fitted encoder (train, dev, test order).
X_train_encoded, X_dev_encoded, X_test_encoded = (
    encoder.transform(split.squeeze()) for split in (X_train, X_dev, X_test)
)

### CountVectorizer + Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_encoded, y_train)

y_pred_naive_bayes_cv = model.predict(X_test_encoded)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

f1_macro_naive_bayes_cv = f1_score(y_dev, y_pred_naive_bayes_cv, average='macro')*100
accuracy_naive_bayes_cv = accuracy_score(y_dev, y_pred_naive_bayes_cv)*100
precision_naive_bayes_cv = precision_score(y_dev, y_pred_naive_bayes_cv)*100
recall_naive_bayes_cv = recall_score(y_dev, y_pred_naive_bayes_cv)*100

data = [{"F1-macro":f1_macro_naive_bayes_cv, "accuracy":accuracy_naive_bayes_cv,"precision":precision_naive_bayes_cv,"recall":recall_naive_bayes_cv}]
naive_bayes_cv = pd.DataFrame(data, index = ["Naive Bayes + CV"])
naive_bayes_cv

In [None]:
from sklearn.metrics import confusion_matrix

cf = confusion_matrix(y_test, y_pred_naive_bayes_cv)

import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df_cm = pd.DataFrame(cf, range(2), range(2))
sn.set(font_scale=1.4) 
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='d') 
plt.show()

### CountVectorizer + Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 

# Search over regularisation strength and penalty type.
# "multi_class" is no longer part of the grid: the task is binary (the
# notebook draws 2x2 confusion matrices) and the parameter is deprecated in
# recent scikit-learn releases.
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}

# liblinear supports both the l1 and the l2 penalty. The default lbfgs solver
# rejects l1, so every l1 candidate of the grid used to fail to fit.
model = LogisticRegression(solver="liblinear")
gridsearchcv=GridSearchCV(model,grid,cv=CV_LogReg)
gridsearchcv.fit(X_train_encoded, y_train)

# Predictions of the refitted best estimator on the dev split.
y_pred_logistic_regression_cv = gridsearchcv.predict(X_dev_encoded)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Dev-split scores for logistic regression on count vectors, as percentages.
# F1 is macro-averaged; precision and recall use scikit-learn's default averaging.
f1_macro_logistic_regression_cv = 100 * f1_score(y_dev, y_pred_logistic_regression_cv, average='macro')
accuracy_logistic_regression_cv = 100 * accuracy_score(y_dev, y_pred_logistic_regression_cv)
precision_logistic_regression_cv = 100 * precision_score(y_dev, y_pred_logistic_regression_cv)
recall_logistic_regression_cv = 100 * recall_score(y_dev, y_pred_logistic_regression_cv)

# One-row summary table, labelled with the model/encoder pair.
data = [{
    "F1-macro": f1_macro_logistic_regression_cv,
    "accuracy": accuracy_logistic_regression_cv,
    "precision": precision_logistic_regression_cv,
    "recall": recall_logistic_regression_cv,
}]
logistic_regression_cv = pd.DataFrame(data, index=["Logistic Regression + CV"])
logistic_regression_cv

In [None]:
from sklearn.metrics import confusion_matrix

import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the dev-split predictions from logistic regression on count vectors.
cf = confusion_matrix(y_dev, y_pred_logistic_regression_cv)

# Label both axes 0/1 and render the counts as an annotated heatmap.
df_cm = pd.DataFrame(cf, index=range(2), columns=range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})
plt.show()

### CountVectorizer + SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# RBF-kernel SVM: every (C, gamma) combination below is cross-validated.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# refit=True retrains the best candidate, so predict() below uses it;
# verbose=3 prints progress for each fit.
gridsearchcv = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=CV_SVM,
    refit=True,
    verbose=3,
)
gridsearchcv.fit(X_train_encoded, y_train)

y_pred_svm_cv = gridsearchcv.predict(X_dev_encoded)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Dev-split scores for the SVM on count vectors, as percentages.
# F1 is macro-averaged; precision and recall use scikit-learn's default averaging.
f1_macro_svm_cv = 100 * f1_score(y_dev, y_pred_svm_cv, average='macro')
accuracy_svm_cv = 100 * accuracy_score(y_dev, y_pred_svm_cv)
precision_svm_cv = 100 * precision_score(y_dev, y_pred_svm_cv)
recall_svm_cv = 100 * recall_score(y_dev, y_pred_svm_cv)

# One-row summary table, labelled with the model/encoder pair.
data = [{
    "F1-macro": f1_macro_svm_cv,
    "accuracy": accuracy_svm_cv,
    "precision": precision_svm_cv,
    "recall": recall_svm_cv,
}]
svm_cv = pd.DataFrame(data, index=["SVM + CV"])
svm_cv

In [None]:
from sklearn.metrics import confusion_matrix

import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the dev-split predictions from the SVM on count vectors.
cf = confusion_matrix(y_dev, y_pred_svm_cv)

# Label both axes 0/1 and render the counts as an annotated heatmap.
df_cm = pd.DataFrame(cf, index=range(2), columns=range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})
plt.show()

## Encoder: TfidfVectorizer

### Encode

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder; it is fitted on the training comments only.
encoder = TfidfVectorizer(ngram_range=(ngram_range_lower,ngram_range_upper))
# fit() is the last expression of the cell, so the fitted encoder is what gets
# displayed. The stray `TfidfVectorizer()` statement that used to follow only
# built a second, unfitted and unused instance.
encoder.fit(X_train.squeeze())

In [None]:
# Re-encode all three splits with the fitted TF-IDF encoder (train, dev, test order).
# This rebinds the X_*_encoded names that previously held the count-vector features.
X_train_encoded, X_dev_encoded, X_test_encoded = (
    encoder.transform(split.squeeze()) for split in (X_train, X_dev, X_test)
)

### TfidfVectorizer + Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the TF-IDF features (fit() returns the
# estimator itself), then applied to the dev split.
model = MultinomialNB().fit(X_train_encoded, y_train)
y_pred_naive_bayes_tv = model.predict(X_dev_encoded)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Dev-split scores for Naive Bayes on TF-IDF features, as percentages.
# F1 is macro-averaged; precision and recall use scikit-learn's default averaging.
f1_macro_naive_bayes_tv = 100 * f1_score(y_dev, y_pred_naive_bayes_tv, average='macro')
accuracy_naive_bayes_tv = 100 * accuracy_score(y_dev, y_pred_naive_bayes_tv)
precision_naive_bayes_tv = 100 * precision_score(y_dev, y_pred_naive_bayes_tv)
recall_naive_bayes_tv = 100 * recall_score(y_dev, y_pred_naive_bayes_tv)

# One-row summary table, labelled with the model/encoder pair.
data = [{
    "F1-macro": f1_macro_naive_bayes_tv,
    "accuracy": accuracy_naive_bayes_tv,
    "precision": precision_naive_bayes_tv,
    "recall": recall_naive_bayes_tv,
}]
naive_bayes_tv = pd.DataFrame(data, index=["Naive Bayes + TV"])
naive_bayes_tv

In [None]:
from sklearn.metrics import confusion_matrix

import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the dev-split predictions from Naive Bayes on TF-IDF features.
cf = confusion_matrix(y_dev, y_pred_naive_bayes_tv)

# Label both axes 0/1 and render the counts as an annotated heatmap.
df_cm = pd.DataFrame(cf, index=range(2), columns=range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})
plt.show()

### TfidfVectorizer + Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 

grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"], "multi_class": ["ovr","multinomial"]}

model = LogisticRegression()
gridsearchcv=GridSearchCV(model,grid,cv=CV_LogReg)
gridsearchcv.fit(X_train_encoded, y_train)

y_pred_logistic_regression_tv = gridsearchcv.predict(X_test_encoded)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

f1_macro_logistic_regression_tv = f1_score(y_dev, y_pred_logistic_regression_tv, average='macro')*100
accuracy_logistic_regression_tv = accuracy_score(y_dev, y_pred_logistic_regression_tv)*100
precision_logistic_regression_tv = precision_score(y_dev, y_pred_logistic_regression_tv)*100
recall_logistic_regression_tv = recall_score(y_dev, y_pred_logistic_regression_tv)*100

data = [{"F1-macro":f1_macro_logistic_regression_tv, "accuracy":accuracy_logistic_regression_tv,"precision":precision_logistic_regression_tv,"recall":recall_logistic_regression_tv}]
logistic_regression_tv = pd.DataFrame(data, index = ["Logistic Regression + TV"])
logistic_regression_tv

In [None]:
from sklearn.metrics import confusion_matrix

cf = confusion_matrix(y_test, y_pred_logistic_regression_tv)

import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df_cm = pd.DataFrame(cf, range(2), range(2))
sn.set(font_scale=1.4) 
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='d') 
plt.show()

### TfidfVectorizer + SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# RBF-kernel SVM on the TF-IDF features: every (C, gamma) combination below is cross-validated.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# refit=True retrains the best candidate, so predict() below uses it;
# verbose=3 prints progress for each fit.
gridsearchcv = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=CV_SVM,
    refit=True,
    verbose=3,
)
gridsearchcv.fit(X_train_encoded, y_train)

y_pred_svm_tv = gridsearchcv.predict(X_dev_encoded)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Dev-split scores for the SVM on TF-IDF features, as percentages.
# F1 is macro-averaged; precision and recall use scikit-learn's default averaging.
f1_macro_svm_tv = 100 * f1_score(y_dev, y_pred_svm_tv, average='macro')
accuracy_svm_tv = 100 * accuracy_score(y_dev, y_pred_svm_tv)
precision_svm_tv = 100 * precision_score(y_dev, y_pred_svm_tv)
recall_svm_tv = 100 * recall_score(y_dev, y_pred_svm_tv)

# One-row summary table, labelled with the model/encoder pair.
data = [{
    "F1-macro": f1_macro_svm_tv,
    "accuracy": accuracy_svm_tv,
    "precision": precision_svm_tv,
    "recall": recall_svm_tv,
}]
svm_tv = pd.DataFrame(data, index=["SVM + TV"])
svm_tv

In [None]:
from sklearn.metrics import confusion_matrix

import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Confusion matrix of the dev-split predictions from the SVM on TF-IDF features.
cf = confusion_matrix(y_dev, y_pred_svm_tv)

# Label both axes 0/1 and render the counts as an annotated heatmap.
df_cm = pd.DataFrame(cf, index=range(2), columns=range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})
plt.show()

## Result

In [None]:
# Stack the six one-row score tables into a single comparison table.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same rows in
# the same order and works on both old and new pandas versions.
result = pd.concat([
    naive_bayes_cv,
    logistic_regression_cv,
    svm_cv,
    naive_bayes_tv,
    logistic_regression_tv,
    svm_tv,
])
result

# Save and load model

In [None]:
import pickle

filename = 'SVM_no_grid_model.sav'
# Serialise whatever `gridsearchcv` currently refers to; in a top-to-bottom
# run that is the SVM grid search fitted on the TF-IDF features.
# NOTE(review): `filename` is not used to build the path below.
with open('/content/drive/MyDrive/Dataset/DS102_Project_Dataset_/Toxic_comments_dataset/Run_model/SVM_no_grid_model.sav', mode='wb') as model_file:
  pickle.dump(gridsearchcv, model_file)

In [None]:
import pickle

filename = 'SVM_no_grid_model.sav'
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open('/content/drive/MyDrive/Dataset/DS102_Project_Dataset_/Toxic_comments_dataset/Run_model/SVM_no_grid_model.sav', mode='rb') as model_file:
  loaded_model = pickle.load(model_file)

# Predict on the encoded test split and flatten the output to one dimension.
# NOTE(review): this rebinds `result`, which held the metrics table built earlier.
result = loaded_model.predict(X_test_encoded).reshape(-1)