In [1]:
import re
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode



In [2]:
from nltk import sent_tokenize
from itertools import combinations
from toolz import compose
from sklearn.feature_extraction.text import CountVectorizer


class SkipGramVectorizer(CountVectorizer):
    def __init__(self, k = 1, **kwds):
        super(SkipGramVectorizer, self).__init__(**kwds)
        self.k = k

    def build_sent_analyzer(self, preprocess, stop_words, tokenize):
        return lambda sent: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(sent), stop_words
        )

    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        sent_analyze = self.build_sent_analyzer(
            preprocess, stop_words, tokenize
        )

        return lambda doc: self._sent_skip_grams(doc, sent_analyze)

    def _sent_skip_grams(self, doc, sent_analyze):
        skip_grams = []
        for sent in sent_tokenize(doc):
            skip_grams.extend(sent_analyze(sent))
        return skip_grams

    def _word_skip_grams(self, tokens, stop_words = None):
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        min_n, max_n = self.ngram_range
        k = self.k
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)
            tokens_append = tokens.append
            space_join = ' '.join

            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    head = [original_tokens[i]]
                    for skip_tail in combinations(
                        original_tokens[i + 1 : i + n + k], n - 1
                    ):
                        tokens_append(space_join(head + list(skip_tail)))
        return tokens

In [7]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

In [8]:
df = pd.read_csv('dataset/sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [9]:
with open('dataset/polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('dataset/polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [10]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [13]:
vectorizer = SkipGramVectorizer(ngram_range=(1,2), k=3, **{}).fit(texts)

In [12]:
vectorizer.transform(texts)

<14279x403410 sparse matrix of type '<class 'numpy.int64'>'
	with 951054 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from malaya.text_functions import STOPWORDS

Using TensorFlow backend.


In [18]:
target = LabelEncoder().fit_transform(labels)
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(texts)
vectors = tfidf.transform(texts)
vectors.shape

(14279, 39525)

In [19]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)

In [20]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        train_Y,
        multinomial.predict(train_X),
        target_names = ['negative', 'positive'],
    )
)

             precision    recall  f1-score   support

   negative       0.96      0.83      0.89      5222
   positive       0.87      0.97      0.92      6201

avg / total       0.91      0.91      0.90     11423



In [21]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        test_Y,
        multinomial.predict(test_X),
        target_names = ['negative', 'positive'],
    )
)

             precision    recall  f1-score   support

   negative       0.80      0.53      0.64      1379
   positive       0.67      0.88      0.76      1477

avg / total       0.73      0.71      0.70      2856



In [22]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
multinomial.predict_proba(tfidf.transform([classification_textcleaning(text)[0]]))

array([[0.42305397, 0.57694603]])

In [23]:
import pickle
with open('multinomial-sentiment.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)
with open('tfidf-multinomial-sentiment.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)