In [2]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import codecs
from pyvi import ViTokenizer
import numpy as np
import _pickle as cPickle

In [2]:
# Load the labelled reviews; the frame's first two columns are used later as
# the review text and the sentiment label.
df = pd.read_csv(
    'data/negative.csv',
    encoding='utf8',
)

In [3]:
def create_stopwordlist():
    """Load the Vietnamese stop-word list, one entry per line.

    Reads ``data/vietnamese-stopwords.txt`` (UTF-8, with or without a BOM) and
    returns the stop words in file order, with surrounding whitespace and line
    terminators removed. Blank lines are skipped.

    Returns:
        list[str]: the stop words, in the order they appear in the file.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    # The previous implementation sliced ``repr(line)`` to drop the newline,
    # which truncated a final line lacking a trailing newline, left a literal
    # ``\r`` on CRLF files, and never closed the file handle.
    with open('data/vietnamese-stopwords.txt', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]

In [4]:
# Read the raw Vietnamese stop words from the stop-word file via create_stopwordlist().
stopword_vn = create_stopwordlist()

In [5]:
# Segment every stop word with ViTokenizer so that it has the same form as the
# review tokens it is compared against later.
stopword_vn = list(map(ViTokenizer.tokenize, stopword_vn))

In [6]:
# Persist the tokenized stop words as a pickle (binary content, despite the
# .txt file name).
with open('stop_words.txt', mode='wb') as stopword_file:
    cPickle.dump(stopword_vn, stopword_file)

In [7]:
# Column names are taken by position: the first column holds the review text,
# the second holds the sentiment label.
column_names = df.columns.values
review, sentiment = column_names[0], column_names[1]
(review, sentiment)

('review', 'label')

In [8]:
from process_data import process_data

In [9]:
# Clean every raw review with the project's `process_data` helper.
# `Series.map` calls the helper exactly once per row and also works on an
# empty frame; `np.vectorize` without `otypes` makes an extra probing call on
# the first element and raises on zero-length input.
df['review'] = df[review].map(process_data)

In [10]:
def _segment_words(sentence):
    """Segment a sentence with ViTokenizer and split it into a list of tokens."""
    return ViTokenizer.tokenize(sentence).split()


tokenized_review = df['review'].apply(_segment_words)

In [11]:
import re  # no longer used by this cell; kept in case other cells rely on it

# Set membership is O(1); scanning the stop-word list for every token cost
# O(len(stopword_vn)) per lookup.
stopword_lookup = set(stopword_vn)


def _join_without_stopwords(tokens):
    """Return the tokens that are not stop words, joined by single spaces."""
    return ' '.join(token for token in tokens if token not in stopword_lookup)


# Dropping stop words outright (instead of blanking them and collapsing the
# leftover whitespace) avoids stray leading/trailing spaces, and `apply`
# removes the label-based `tokenized_review[i]` lookups that only work when
# the index is exactly 0..n-1.
tokenized_review = tokenized_review.apply(_join_without_stopwords)
df['review'] = tokenized_review

In [12]:
# Show the cleaned, stop-word-free reviews (pandas elides the middle rows).
df['review']

0        màn_hình sản_phẩm tệ giá sản_phẩm
1                   điện_thoại sản_phẩm tệ
2       notpos kiểu_dáng trông sản_phẩm tệ
3                           trải nghiệm tệ
4                                       tệ
                      ...                 
810                        cực_kì hài_lòng
811                         chất_lượng chê
812                      đồng_tiền bát gạo
813                                    tốt
814                  sản_phẩm tốt sản_phẩm
Name: review, Length: 815, dtype: object

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams, bigrams and trigrams of the cleaned reviews.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split performed in a later cell, so the vocabulary and IDF
# weights also see the held-out rows — consider fitting on the training
# portion only.
tf_idf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 3))
final_vectorized_data = tf_idf_vectorizer.fit_transform(df['review'])

# `vocabulary_` maps each n-gram to its column index, so its size is the
# number of features. Unlike `get_feature_names()`, which was removed in
# scikit-learn 1.2, it is available in old and new releases alike.
print(len(tf_idf_vectorizer.vocabulary_))

14404


In [14]:
# Save the fitted TF-IDF vectorizer to disk as a pickle.
with open('tf.pkl', mode='wb') as vectorizer_file:
    cPickle.dump(tf_idf_vectorizer, vectorizer_file)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    final_vectorized_data,
    df[sentiment],
    test_size=0.2,
    random_state=69,
)

In [16]:
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
naive_bayes = MultinomialNB()
model_naive = naive_bayes.fit(X_train, y_train)

In [17]:
# Predict labels for the held-out test split.
predicted_naive = model_naive.predict(X_test)

In [18]:
# Display the raw predicted labels for the test rows.
predicted_naive

array([0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0], dtype=int64)

In [19]:
# Sanity-check the classifier on a few hand-written reviews.
# The vectorizer was fitted on ViTokenizer-segmented text (compound words are
# joined with "_", e.g. "sản_phẩm"), so raw sentences must be segmented the
# same way before `transform`; otherwise "sản phẩm" is split into two tokens
# that do not correspond to the "sản_phẩm" feature seen during training.
text =[["sản phẩm tuyệt vời"],["sản phẩm notneg"],["dùng quá notpos"]]
for sample in text:
    segmented = [ViTokenizer.tokenize(sentence) for sentence in sample]
    test = tf_idf_vectorizer.transform(segmented)
    print(model_naive.predict(test))

[1]
[0]
[0]


In [20]:
import _pickle as cPickle

# Serialize the trained Naive Bayes model to disk.
with open('my_dumped_classifier.pkl', mode='wb') as model_file:
    cPickle.dump(model_naive, model_file)

In [21]:
# Reload the pickled classifier from disk.
# Only unpickle files you produced yourself: unpickling can run arbitrary code.
with open('my_dumped_classifier.pkl', mode='rb') as model_file:
    gnb_loaded = cPickle.load(model_file)

In [22]:
# Reload the pickled TF-IDF vectorizer from disk.
# Only unpickle files you produced yourself: unpickling can run arbitrary code.
with open('tf.pkl', mode='rb') as vectorizer_file:
    tf = cPickle.load(vectorizer_file)

In [23]:
# Score two already-segmented sample reviews with the reloaded vectorizer and
# classifier, echoing each input before its predicted label.
text =[['sản_phẩm notpos'], ['sản_phẩm tốt']]
for sample in text:
    print(sample)
    test = tf.transform(sample)
    predicted_label = gnb_loaded.predict(test)
    print(predicted_label)

['sản_phẩm notpos']
[0]
['sản_phẩm tốt']
[1]


In [25]:
# Loads tf.pkl into `tf` again; these are the same statements as the In [22] cell.
with open('tf.pkl', mode='rb') as vectorizer_file:
    tf = cPickle.load(vectorizer_file)