In [1]:
import malaya
from malaya.text_functions import deep_sentiment_textcleaning, STOPWORDS
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB



In [2]:
df = pd.read_csv('sentiment/sentiment-news-bahasa-v5.csv')
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
processed_strings = [deep_sentiment_textcleaning(i) for i in df.iloc[:,1]]

In [4]:
target = LabelEncoder().fit_transform(df.iloc[:,0])
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(processed_strings)
vectors = tfidf.transform(processed_strings)
vectors.shape

(3685, 6273)

In [5]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)

In [6]:
from sklearn import metrics

In [7]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(metrics.classification_report(train_Y, multinomial.predict(train_X), target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.97      0.55      0.71      1038
   positive       0.80      0.99      0.89      1910

avg / total       0.86      0.84      0.82      2948



In [8]:
print(metrics.classification_report(test_Y, multinomial.predict(test_X), target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.72      0.30      0.43       257
   positive       0.72      0.94      0.81       480

avg / total       0.72      0.72      0.68       737



In [9]:
multinomial = MultinomialNB().fit(vectors, target)
print(metrics.classification_report(train_Y, multinomial.predict(train_X), target_names = ['negative','positive']))
print(metrics.classification_report(test_Y, multinomial.predict(test_X), target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.95      0.56      0.70      1038
   positive       0.80      0.98      0.88      1910

avg / total       0.85      0.83      0.82      2948

             precision    recall  f1-score   support

   negative       0.96      0.61      0.75       257
   positive       0.83      0.99      0.90       480

avg / total       0.87      0.86      0.85       737



In [10]:
positive_text = 'Kerajaan negeri Kelantan mempersoalkan motif kenyataan Menteri Kewangan Lim Guan Eng yang hanya menyebut Kelantan penerima terbesar bantuan kewangan dari Kerajaan Persekutuan. Sedangkan menurut Timbalan Menteri Besarnya, Datuk Mohd Amar Nik Abdullah, negeri lain yang lebih maju dari Kelantan turut mendapat pembiayaan dan pinjaman.'
negative_text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'

In [11]:
multinomial.predict_proba(tfidf.transform([positive_text,negative_text]))

array([[0.23419215, 0.76580785],
       [0.19026158, 0.80973842]])

In [14]:
import pickle
with open('multinomial-sentiment-news.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)
with open('tfidf-news.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)