In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time

In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

In [4]:
import os
emotion_files = [f for f in os.listdir(os.getcwd()) if 'translated-' in f]
emotion_files

['translated-joy',
 'translated-love',
 'translated-fear',
 'translated-sadness',
 'translated-surprise',
 'translated-anger']

In [5]:
texts, labels = [], []
for f in emotion_files:
    with open(f) as fopen:
        dataset = list(filter(None, fopen.read().split('\n')))
        labels.extend([f.split('-')[1]] * len(dataset))
        texts.extend(dataset)
        
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [6]:
unique_labels = np.unique(labels).tolist()
labels = LabelEncoder().fit_transform(labels)
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

In [8]:
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(texts)
delattr(tfidf, 'stop_words_')
vectors = tfidf.transform(texts)
vectors.shape

(98515, 150374)

In [9]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, labels, test_size = 0.2)

In [10]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': len(unique_labels),
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent': True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:1.74008
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:1.57107
[10]	validation-mlogloss:1.44
[15]	validation-mlogloss:1.34743
[20]	validation-mlogloss:1.28066
[25]	validation-mlogloss:1.22507
[30]	validation-mlogloss:1.17591
[35]	validation-mlogloss:1.13362
[40]	validation-mlogloss:1.09899
[45]	validation-mlogloss:1.06596
[50]	validation-mlogloss:1.03805
[55]	validation-mlogloss:1.01202
[60]	validation-mlogloss:0.989304
[65]	validation-mlogloss:0.968563
[70]	validation-mlogloss:0.949455
[75]	validation-mlogloss:0.932211
[80]	validation-mlogloss:0.916403
[85]	validation-mlogloss:0.901435
[90]	validation-mlogloss:0.887864
[95]	validation-mlogloss:0.874737
[100]	validation-mlogloss:0.863097
[105]	validation-mlogloss:0.851857
[110]	validation-mlogloss:0.841303
[115]	validation-mlogloss:0.831604
[120]	validation-mlogloss:0.822591
[125]	validation-mlogloss:0.813961
[130]	validation-mlogloss:0.805982
[135]	validation-mloglos

[1165]	validation-mlogloss:0.575913
[1170]	validation-mlogloss:0.575922
[1175]	validation-mlogloss:0.575805
[1180]	validation-mlogloss:0.57582
[1185]	validation-mlogloss:0.575832
[1190]	validation-mlogloss:0.575805
[1195]	validation-mlogloss:0.575826
[1200]	validation-mlogloss:0.575763
[1205]	validation-mlogloss:0.575751
[1210]	validation-mlogloss:0.575701
[1215]	validation-mlogloss:0.575685
[1220]	validation-mlogloss:0.575671
[1225]	validation-mlogloss:0.575659
Stopping. Best iteration:
[1126]	validation-mlogloss:0.575583



In [13]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = unique_labels))

             precision    recall  f1-score   support

      anger       0.80      0.80      0.80      3769
       fear       0.79      0.76      0.78      3808
        joy       0.79      0.81      0.80      3913
       love       0.84      0.85      0.84      2998
    sadness       0.76      0.75      0.76      3250
   surprise       0.77      0.77      0.77      1965

avg / total       0.79      0.79      0.79     19703



In [12]:
import pickle
with open('xgboost-emotion.pkl','wb') as fopen:
    pickle.dump(model,fopen)

with open('tfidf-xgboost-emotion.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)