In [1]:
import re
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )

In [3]:
df = pd.read_csv('dataset/sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [4]:
with open('dataset/polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('dataset/polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [5]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from malaya.text_functions import STOPWORDS

Using TensorFlow backend.


In [7]:
target = LabelEncoder().fit_transform(labels)
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(texts)
vectors = tfidf.transform(texts)
vectors.shape

(14279, 39525)

In [8]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)

In [9]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': 2,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent': True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:0.689475
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.674113
[10]	validation-mlogloss:0.664839
[15]	validation-mlogloss:0.655095
[20]	validation-mlogloss:0.648783
[25]	validation-mlogloss:0.644937
[30]	validation-mlogloss:0.639983
[35]	validation-mlogloss:0.636577
[40]	validation-mlogloss:0.63348
[45]	validation-mlogloss:0.631154
[50]	validation-mlogloss:0.628292
[55]	validation-mlogloss:0.625998
[60]	validation-mlogloss:0.623632
[65]	validation-mlogloss:0.621446
[70]	validation-mlogloss:0.619443
[75]	validation-mlogloss:0.618203
[80]	validation-mlogloss:0.616871
[85]	validation-mlogloss:0.61484
[90]	validation-mlogloss:0.613788
[95]	validation-mlogloss:0.612716
[100]	validation-mlogloss:0.611539
[105]	validation-mlogloss:0.610019
[110]	validation-mlogloss:0.609321
[115]	validation-mlogloss:0.607768
[120]	validation-mlogloss:0.606331
[125]	validation-mlogloss:0.605313
[130]	validation-mlogloss:0.604488
[135]	valid

In [10]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.69      0.61      0.65      1332
   positive       0.69      0.76      0.72      1524

avg / total       0.69      0.69      0.69      2856



In [15]:
text = (
    'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
)
model.predict(
    xgb.DMatrix(tfidf.transform([classification_textcleaning(text)[0]])),
    ntree_limit = model.best_ntree_limit,
)

array([[0.47532737, 0.5246726 ]], dtype=float32)

In [16]:
import pickle
with open('xgboost-sentiment.pkl','wb') as fopen:
    pickle.dump(model,fopen)

with open('tfidf-xgboost-sentiment.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)