<a href="https://colab.research.google.com/github/mungnguyen/comment-classification/blob/master/comment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import string

from pathlib import Path
from datetime import datetime
from preprocess import cleanAndPreprocess

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report as metric

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB

print("Begin Load file\n")

# dirPath = Path(os.path.dirname(__file__))

trainDir = '../data/aclImdb/train'
testDir =  '../data/aclImdb/test'

X_train = []
y_train = []
X_test = []
y_test = []
train_docs = []
test_docs = []

# Load data
if os.path.isfile('train-docs-after-preprocessing.txt'):
    train_file = open("./train-docs-after-preprocessing.txt", "r")
    train_docs = train_file.readlines()
    print(f"Train-docs length: {len(train_docs)}")

    train_label = open("./train-label-after-preprocessing.txt", "r")
    y_train = train_label.readlines()
    print(f"Train-label length: {len(y_train)}")
else:
    #Load train file
    trainset = load_files(trainDir, "r", categories=["pos", "neg"], encoding="utf-8")
    X_train, y_train = trainset.data, trainset.target

    # Clean and pre-process
    print("Begin pre-processing trainset\n")

    for x in X_train:
        train_docs.append(" ".join(cleanAndPreprocess(x)))
    
    print("End pre-processing trainset\n")

    # Print to file
    with open('train-docs-after-preprocessing.txt', 'w') as f_train:
      for item in train_docs:
          f_train.write("%s\n" % item)
      print("print train_docs to file successly")

    with open('train-label-after-preprocessing.txt', 'w') as f_train_label:
      for item in y_train:
          f_train_label.write("%s\n" % item)
      print("print train_lable to file successly")

if os.path.isfile('test-docs-after-preprocessing.txt'): 
    test_file = open("./test-docs-after-preprocessing.txt", "r")
    test_docs = test_file.readlines()
    print(f"Test-docs length: {len(test_docs)}")

    test_label = open("./test-label-after-preprocessing.txt", "r")
    y_test = test_label.readlines()
    print(f"Test-label length: {len(y_test)}")

else:
    #Load test file
    print("Begin load test file")
    testset = load_files(testDir, "r", categories=["pos", "neg"], encoding="utf-8")
    X_test, y_test = testset.data, testset.target
    print("End Load test file\n")

    # Clean and pre-process
    print("Begin pre-processing testset\n")
    for x in X_test:
        test_docs.append(" ".join(cleanAndPreprocess(x)))
    print("End pre-processing testset\n")

    # Print to file
    with open('test-docs-after-preprocessing.txt', 'w') as f_test:
      for item in test_docs:
          f_test.write("%s\n" % item)
      print("print test_docs to file successly") 

    with open('test-label-after-preprocessing.txt', 'w') as f_test_label:
      for item in y_test:
          f_test_label.write("%s\n" % item)
      print("print test_label to file successly")

Begin Load file

Train-docs length: 25000
Train-label length: 25000
Test-docs length: 25000
Test-label length: 25000


In [0]:
# Vectorize
print("Begin vectorize \n")
# IF-IDF
print("Begin tf-idf")
train_tfidf = TfidfVectorizer(max_features=10000, min_df=10, max_df=0.7, encoding='utf-8', lowercase=False)
X_train = train_tfidf.fit_transform(train_docs)

print(f"Voca len: {len(train_tfidf.vocabulary_)}")

print("Begin test tf-idf")
test_tf_idf = TfidfVectorizer(min_df=5, max_df=0.7, encoding='utf-8', lowercase=False, vocabulary=train_tfidf.vocabulary_)
X_test = test_tf_idf.fit_transform(test_docs)
print("End tf-idf")

print("End vectorize \n")

Begin vectorize 

Begin tf-idf
Voca len: 10000
Begin test tf-idf
End tf-idf
End vectorize 



In [0]:
# Training data - testing data
    # Traning with Decision Tree
print("Begin training data with decision tree")
start_time = datetime.now()

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

end_time = datetime.now()
print(f"Time to training: {end_time - start_time}")

print("End training data with decision tree")

# Testing with decision tree
print("Begin testing data with decision tree")
start_time = datetime.now()

dstree_pred = clf.predict(X_test)

result = metric(y_test, dstree_pred)

print(f"{result}\n")

end_time = datetime.now()
print(f"Time to testing: {end_time - start_time}")

print("End testing data with decision tree")

Begin training data with decision tree
Time to training: 0:00:27.212792
End training data with decision tree
Begin testing data with decision tree
              precision    recall  f1-score   support

          0
       0.71      0.72      0.72     12500
          1
       0.72      0.70      0.71     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000


Time to testing: 0:00:00.159772
End testing data with decision tree


In [0]:
 # Traning with Random Forest
print("Begin training with Random Forest")
start_time = datetime.now()

clf = RandomForestClassifier(n_estimators=1000, random_state=0)
clf.fit(X_train, y_train) 

end_time = datetime.now()
print(f"Time to training: {end_time - start_time}")

print("End training with Random Forest")

    # Tesing with Random Forest
print("Begin testing with Random Forest")
start_time = datetime.now()

rd_pred = clf.predict(X_test)

result = metric(y_test, rd_pred)

print(f"{result}\n")

end_time = datetime.now()
print(f"Time to testing: {end_time - start_time}")

print("End testing with Random Forest")

Begin training with Random Forest
Time to training: 0:08:48.912607
End training with Random Forest
Begin testing with Random Forest
              precision    recall  f1-score   support

          0
       0.85      0.86      0.86     12500
          1
       0.86      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000


Time to testing: 0:00:15.949981
End testing with Random Forest


In [0]:
# Traning with SVM
print("Begin training with SVM")
start_time = datetime.now()

clf = SVC(kernel="linear", gamma='auto')
clf.fit(X_train, y_train)

end_time = datetime.now()
print(f"Time to training: {end_time - start_time}")

print("End training with SVM")

    # Tesing with SVM
print("Begin testing with SVM")
start_time = datetime.now()

svm_pred = clf.predict(X_test)

result = metric(y_test, svm_pred)

end_time = datetime.now()
print(f"Time to testing: {end_time - start_time}")

print(f"{result}\n")

print("End testing with SVM")

Begin training with SVM
Time to training: 0:07:19.772654
End training with SVM
Begin testing with SVM
Time to testing: 0:03:13.367918
              precision    recall  f1-score   support

          0
       0.87      0.88      0.88     12500
          1
       0.88      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000


End testing with SVM


In [0]:
# Traning with Rocchio Classification
print("Begin training with Rocchio Classification")
start_time = datetime.now()

clf = NearestCentroid()
clf.fit(X_train, y_train)

end_time = datetime.now()
print(f"Time to training: {end_time - start_time}")

print("End training with Rocchio Classification")

    # Tesing with Rocchio Classification
print("Begin testing with Rocchio Classification")
start_time = datetime.now()

rc_pred = clf.predict(X_test)

result = metric(y_test, rc_pred)

print(f"{result}\n")

end_time = datetime.now()
print(f"Time to testing: {end_time - start_time}")

print("End testing with Rocchio Classification")

Begin training with Rocchio Classification
Time to training: 0:00:00.038749
End training with Rocchio Classification
Begin testing with Rocchio Classification
              precision    recall  f1-score   support

          0
       0.83      0.77      0.80     12500
          1
       0.78      0.85      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000


Time to testing: 0:00:00.161210
End testing with Rocchio Classification


In [0]:
# Traning with Naive Bayes Classifier
print("Begin training with Naive Bayes Classifier, use Gaussian Naive Bayes algorithm")
start_time = datetime.now()

clf = GaussianNB()
clf.fit(X_train.toarray(), y_train)

end_time = datetime.now()
print(f"Time to training: {end_time - start_time}")

print("End training with Naive Bayes Classifier")

    # Tesing with Naive Bayes Classifier
print("Begin testing with Naive Bayes Classifier")
start_time = datetime.now()

nb_pred = clf.predict(X_test.toarray())

result = metric(y_test, nb_pred)

end_time = datetime.now()
print(f"Time to testing: {end_time - start_time}")

print(f"{result}\n")

print("End testing with Naive Bayes Classifier")

print("End programe")

Begin training with Naive Bayes Classifier, use Gaussian Naive Bayes algorithm
Time to training: 0:00:03.963598
End training with Naive Bayes Classifier
Begin testing with Naive Bayes Classifier
Time to testing: 0:00:04.154739
              precision    recall  f1-score   support

          0
       0.64      0.78      0.71     12500
          1
       0.72      0.57      0.64     12500

    accuracy                           0.67     25000
   macro avg       0.68      0.67      0.67     25000
weighted avg       0.68      0.67      0.67     25000


End testing with Naive Bayes Classifier
End programe


In [0]:
# Traning with Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB

print("Begin training with Naive Bayes Classifier, use Bernoulli Naive Bayes algorithm")
start_time = datetime.now()

clf = BernoulliNB()
clf.fit(X_train.toarray(), y_train)

end_time = datetime.now()
print(f"Time to training: {end_time - start_time}")

print("End training with Naive Bayes Classifier")

    # Tesing with Naive Bayes Classifier
print("Begin testing with Naive Bayes Classifier")
start_time = datetime.now()

nb_pred = clf.predict(X_test.toarray())

result = metric(y_test, nb_pred)

end_time = datetime.now()
print(f"Time to testing: {end_time - start_time}")

print(f"{result}\n")

print("End testing with Naive Bayes Classifier")

print("End programe")

Begin training with Naive Bayes Classifier, use Bernoulli Naive Bayes algorithm
Time to training: 0:00:02.518697
End training with Naive Bayes Classifier
Begin testing with Naive Bayes Classifier
Time to testing: 0:00:02.662418
              precision    recall  f1-score   support

          0
       0.79      0.88      0.83     12500
          1
       0.86      0.77      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.83      0.82      0.82     25000
weighted avg       0.83      0.82      0.82     25000


End testing with Naive Bayes Classifier
End programe
