In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time

In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

In [4]:
import os
emotion_files = [f for f in os.listdir(os.getcwd()) if 'translated-' in f]
emotion_files

['translated-joy',
 'translated-love',
 'translated-fear',
 'translated-sadness',
 'translated-surprise',
 'translated-anger']

In [5]:
texts, labels = [], []
for f in emotion_files:
    with open(f) as fopen:
        dataset = list(filter(None, fopen.read().split('\n')))
        labels.extend([f.split('-')[1]] * len(dataset))
        texts.extend(dataset)

In [6]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [7]:
unique_labels = np.unique(labels).tolist()
labels = LabelEncoder().fit_transform(labels)
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [9]:
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(texts)
delattr(tfidf, 'stop_words_')
vectors = tfidf.transform(texts)
vectors.shape

(98515, 150374)

In [10]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, labels, test_size = 0.2)

In [11]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        train_Y,
        multinomial.predict(train_X),
        target_names = unique_labels,
    )
)


             precision    recall  f1-score   support

      anger       0.83      0.90      0.86     15040
       fear       0.77      0.91      0.84     15256
        joy       0.80      0.91      0.85     15663
       love       0.92      0.85      0.88     12251
    sadness       0.90      0.79      0.84     12864
   surprise       0.94      0.51      0.66      7738

avg / total       0.85      0.84      0.84     78812



In [12]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        test_Y,
        multinomial.predict(test_X),
        target_names = unique_labels,
    )
)


             precision    recall  f1-score   support

      anger       0.72      0.82      0.77      3833
       fear       0.68      0.80      0.74      3802
        joy       0.68      0.84      0.75      3924
       love       0.85      0.71      0.78      2981
    sadness       0.81      0.67      0.73      3189
   surprise       0.80      0.36      0.50      1974

avg / total       0.75      0.73      0.73     19703



In [13]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
multinomial.predict_proba(tfidf.transform([classification_textcleaning(text)[0]]))

array([[0.51380363, 0.15154323, 0.12415014, 0.08683687, 0.08987986,
        0.03378626]])

In [14]:
import pickle
with open('multinomial-emotion.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)
with open('tfidf-multinomial-emotion.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)