In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time

In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

In [4]:
with open('subjectivity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [5]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

In [7]:
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(texts)
vectors = tfidf.transform(texts)
vectors.shape

(9962, 30504)

In [8]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, labels, test_size = 0.2)

In [9]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': 2,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent': True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:0.678167
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.618297
[10]	validation-mlogloss:0.579638
[15]	validation-mlogloss:0.54779
[20]	validation-mlogloss:0.526034
[25]	validation-mlogloss:0.509992
[30]	validation-mlogloss:0.494278
[35]	validation-mlogloss:0.481547
[40]	validation-mlogloss:0.472083
[45]	validation-mlogloss:0.463034
[50]	validation-mlogloss:0.455712
[55]	validation-mlogloss:0.448212
[60]	validation-mlogloss:0.442091
[65]	validation-mlogloss:0.43584
[70]	validation-mlogloss:0.429555
[75]	validation-mlogloss:0.423943
[80]	validation-mlogloss:0.419133
[85]	validation-mlogloss:0.414704
[90]	validation-mlogloss:0.410628
[95]	validation-mlogloss:0.406343
[100]	validation-mlogloss:0.402458
[105]	validation-mlogloss:0.399247
[110]	validation-mlogloss:0.395572
[115]	validation-mlogloss:0.392212
[120]	validation-mlogloss:0.389533
[125]	validation-mlogloss:0.387173
[130]	validation-mlogloss:0.384787
[135]	valid

In [10]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = ['negative','positive']))

              precision    recall  f1-score   support

    negative       0.86      0.85      0.85      1003
    positive       0.85      0.86      0.85       990

   micro avg       0.85      0.85      0.85      1993
   macro avg       0.85      0.85      0.85      1993
weighted avg       0.85      0.85      0.85      1993



In [12]:
import pickle
with open('xgboost-subjective.pkl','wb') as fopen:
    pickle.dump(model,fopen)

with open('tfidf-xgboost-subjective.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)