In [5]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
imdb_dataset = pd.read_csv('/content/drive/MyDrive/TEST_INTERN1/IMDB_dataset.csv', nrows=1000)
imdb_dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [24]:
from bs4 import BeautifulSoup
import re

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_dataset['review']=imdb_dataset['review'].apply(denoise_text)
imdb_dataset.head()

  soup = BeautifulSoup(text, "html.parser")


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [25]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_dataset['review']=imdb_dataset['review'].apply(remove_special_characters)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(imdb_dataset ['review'], imdb_dataset['sentiment'],
                                                   test_size=0.2, random_state=0,
                                                   shuffle=True, stratify=imdb_dataset['sentiment'])

X_train.shape, X_test.shape


((800,), (200,))

In [28]:
# model Building
clf = Pipeline([('tfidf', TfidfVectorizer()),
                ('rfc', RandomForestClassifier(n_estimators=100, n_jobs=-1))])

clf.fit(X_train, y_train)

In [29]:
y_pred = clf.predict(X_test)

In [30]:
print(classification_report(y_test, y_pred))
# Độ chính xác của mô hình RandomForest
accuracy_rfc = accuracy_score(y_test, y_pred)
print("Accuracy for RandomForest:", accuracy_rfc)

              precision    recall  f1-score   support

    negative       0.76      0.78      0.77       100
    positive       0.78      0.76      0.77       100

    accuracy                           0.77       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.77      0.77      0.77       200

Accuracy for RandomForest: 0.77


In [31]:
from sklearn.svm import SVC

clf_svm = Pipeline([('tfidf', TfidfVectorizer()),
                    ('svm', SVC())])

clf_svm.fit(X_train, y_train)


In [32]:
from sklearn.metrics import classification_report
# Sử dụng mô hình SVM đã thay thế
y_pred_svm = clf_svm.predict(X_test)
# Độ chính xác của mô hình SVM
accuracy_svc = accuracy_score(y_test, y_pred_svm)
print("Accuracy for SVM:", accuracy_svc)
# In kết quả báo cáo
print("Classification Report for SVM:")
print(classification_report(y_test, y_pred_svm))


Accuracy for SVM: 0.835
Classification Report for SVM:
              precision    recall  f1-score   support

    negative       0.87      0.79      0.83       100
    positive       0.81      0.88      0.84       100

    accuracy                           0.83       200
   macro avg       0.84      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200



In [33]:
from sklearn.naive_bayes import MultinomialNB

clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                   ('nb', MultinomialNB())])

clf_nb.fit(X_train, y_train)


In [34]:
from sklearn.metrics import classification_report

# Sử dụng mô hình Naive Bayes đã thay thế
y_pred_nb = clf_nb.predict(X_test)
# Độ chính xác của mô hình Naive Bayes
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Accuracy for Naive Bayes:", accuracy_nb)
# In kết quả báo cáo
print("Classification Report for Naive Bayes:")
print(classification_report(y_test, y_pred_nb))


Accuracy for Naive Bayes: 0.76
Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.71      0.88      0.79       100
    positive       0.84      0.64      0.73       100

    accuracy                           0.76       200
   macro avg       0.78      0.76      0.76       200
weighted avg       0.78      0.76      0.76       200



In [35]:
import pickle

In [36]:
pickle.dump(clf, open('model.pkl', 'wb'))

In [37]:
model = pickle.load(open('model.pkl', 'rb'))


In [38]:
model.predict(['you have won lottery ticket worth $1000, please click here to claim',
            'hi, how are you doing today?'])

array(['positive', 'positive'], dtype=object)