In [1]:
import pandas as pd
import re
import numpy as np
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# current module and keep the legacy import as a fallback for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from unidecode import unidecode
import pickle



In [2]:
# Load the labelled news-sentiment corpus and map the string labels
# (e.g. 'Negative' / 'Positive') to integer class ids.
df = pd.read_csv('sentiment-news-bahasa-v5.csv')
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df['label'])
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
# Raw-string patterns avoid invalid-escape warnings for `\S`, `\-`, `\(` ...;
# the dot after `www` is escaped so only real `www.` prefixes are stripped.
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')
_DISALLOWED_CHARS = re.compile(r'[^\'"A-Za-z\- ]+')
_TOKEN_PATTERN = re.compile(r"[\w']+|[;:\-()&.,!?\"]")


def textcleaning(string):
    """Normalise a raw sentence into lower-cased, space-separated word tokens.

    Steps, in order:
      1. drop whitespace-separated tokens containing '#' or '@';
      2. remove URLs (``http...`` and ``www. ...``);
      3. transliterate to ASCII with ``unidecode`` and pad '.' and ',' with a
         trailing space;
      4. replace every run of characters other than ASCII letters, quotes,
         hyphens and spaces with a single space;
      5. tokenise, discard tokens of length <= 1, join with spaces, lower-case.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives.
    """
    kept = [tok for tok in string.split() if '#' not in tok and '@' not in tok]
    string = _URL_PATTERN.sub('', ' '.join(kept))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = _DISALLOWED_CHARS.sub(' ', string)
    # Single-character tokens (stray letters, lone hyphens/quotes) are dropped.
    tokens = [tok for tok in _TOKEN_PATTERN.findall(string) if len(tok) > 1]
    return ' '.join(tokens).lower()

In [4]:
# Clean every document in a single pass. Addressing the column by name replaces
# the slow per-row `.iloc` assignment and the fragile positional index.
# NOTE(review): assumes 'text' is the column at position 1, which later cells
# still read via `df.iloc[:, 1]` - confirm against the CSV header.
df['text'] = df['text'].apply(textcleaning)

In [5]:
# Build the combined corpus: translated polarity sentences (0 = negative,
# 1 = positive) followed by the cleaned news sentences and their encoded labels.
# Blank lines - e.g. the empty string left by a trailing newline - are skipped
# so that no empty document receives a class label.
# NOTE(review): the polarity lines are not passed through `textcleaning`;
# presumably they are already cleaned - confirm.
with open('polarity-negative-translated.txt','r') as fopen:
    negative_texts = [line for line in fopen.read().split('\n') if line.strip()]

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = [line for line in fopen.read().split('\n') if line.strip()]

texts = negative_texts + positive_texts + df.iloc[:,1].tolist()
labels = [0] * len(negative_texts) + [1] * len(positive_texts) + Y.tolist()

# Every document must have exactly one label.
assert len(labels) == len(texts)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from malaya.text_functions import STOPWORDS

not found any version, deleting previous version models..


1MB [00:00, 1110.78MB/s]                                

downloading stopwords



Using TensorFlow backend.
1MB [00:00, 16.31MB/s]                                 

downloading ZIP rules-based





In [9]:
# Encode the labels and turn the corpus into TF-IDF features built from
# 1- to 3-grams that occur in at least two documents.
# NOTE(review): the vectoriser is fitted on the full corpus before the
# train/test split in the next cell, so vocabulary and IDF weights also see
# the held-out documents.
target = LabelEncoder().fit_transform(labels)
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
vectors = tfidf.fit_transform(texts)
# (n_documents, n_features)
vectors.shape

(14279, 45344)

In [10]:
# Hold out 20% of the documents for evaluation. The seed is fixed so that the
# split - and every metric reported below - is reproducible across runs.
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2, random_state = 42)

In [11]:
from sklearn import metrics

In [12]:
# Fit a multinomial Naive Bayes classifier and report how well it fits the
# data it was trained on.
multinomial = MultinomialNB()
multinomial.fit(train_X, train_Y)
train_predictions = multinomial.predict(train_X)
print(metrics.classification_report(train_Y, train_predictions, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.95      0.85      0.90      5254
   positive       0.89      0.96      0.92      6169

avg / total       0.92      0.91      0.91     11423



In [13]:
# Same report on the held-out split.
test_predictions = multinomial.predict(test_X)
print(metrics.classification_report(test_Y, test_predictions, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.77      0.60      0.67      1347
   positive       0.70      0.84      0.77      1509

avg / total       0.73      0.73      0.72      2856



In [14]:
# Spot-check on a hand-written sentence. Each output row holds one probability
# per class; presumably ordered [negative, positive] given the 0/1 labels
# built earlier - confirm via `multinomial.classes_`.
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
text_vector = tfidf.transform([text])
multinomial.predict_proba(text_vector)

array([[0.37235203, 0.62764797]])

In [15]:
# Spot-check on a second hand-written sentence (one probability per class).
text = 'saya sangat sayangkan kerajaan saya'
text_vector = tfidf.transform([text])
multinomial.predict_proba(text_vector)

array([[0.42569504, 0.57430496]])

In [16]:
# Spot-check on a third hand-written sentence (one probability per class).
text = 'bodoh lah awak ni'
text_vector = tfidf.transform([text])
multinomial.predict_proba(text_vector)

array([[0.56633906, 0.43366094]])

In [17]:
# Spot-check on a fourth hand-written sentence (one probability per class).
text = 'kerajaan sebenarnya sangat baik'
text_vector = tfidf.transform([text])
multinomial.predict_proba(text_vector)

array([[0.25775276, 0.74224724]])

In [18]:
import pickle
# Persist the trained classifier and the fitted vectoriser; both are needed
# to score new text later.
for filename, artefact in (
    ('multinomial-sentiment.pkl', multinomial),
    ('tfidf-multinomial-sentiment.pkl', tfidf),
):
    with open(filename, 'wb') as fopen:
        pickle.dump(artefact, fopen)