In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time

In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [6]:
def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

In [4]:
with open('subjectivity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [5]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [9]:
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(texts)
vectors = tfidf.transform(texts)
vectors.shape

(9962, 30504)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, labels, test_size = 0.2)

In [12]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        train_Y,
        multinomial.predict(train_X),
        target_names = ['negative', 'positive'],
    )
)


              precision    recall  f1-score   support

    negative       0.97      0.94      0.96      3991
    positive       0.95      0.97      0.96      3978

   micro avg       0.96      0.96      0.96      7969
   macro avg       0.96      0.96      0.96      7969
weighted avg       0.96      0.96      0.96      7969



In [13]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        test_Y,
        multinomial.predict(test_X),
        target_names = ['negative', 'positive'],
    )
)


              precision    recall  f1-score   support

    negative       0.91      0.85      0.88       999
    positive       0.86      0.92      0.89       994

   micro avg       0.89      0.89      0.89      1993
   macro avg       0.89      0.89      0.89      1993
weighted avg       0.89      0.89      0.89      1993



In [14]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
multinomial.predict_proba(tfidf.transform([classification_textcleaning(text)[0]]))

array([[0.46203836, 0.53796164]])

In [15]:
import pickle
with open('multinomial-subjective.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)
with open('tfidf-multinomial-subjective.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)