In [1]:
import pandas as pd
import re
import numpy as np
from utils import *
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
import pickle



In [2]:
# Load the labelled news-sentiment sentences; the frame is expected to carry a
# `label` column (read below) alongside the sentence text.
df = pd.read_csv('sentiment-news-bahasa-v5.csv')

# Turn the string labels into integer class ids. LabelEncoder orders classes
# by sorted value, so with the 'Negative' / 'Positive' labels shown in the
# preview this gives Negative -> 0 and Positive -> 1.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [3]:
def textcleaning(string):
    """Normalise one raw sentence into lower-cased, space-separated word tokens.

    Steps, in order:
      1. drop every whitespace-delimited token that contains '#' or '@'
         (hashtags, mentions, e-mail addresses);
      2. strip URLs starting with ``http`` or ``www.``;
      3. transliterate to ASCII with ``unidecode``;
      4. replace every run of characters other than letters, quotes, hyphens
         and spaces with a single space;
      5. keep only tokens longer than one character and lower-case the result.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        Cleaned text; an empty string if nothing survives the filters.
    """
    kept_tokens = [
        token for token in string.split()
        if '#' not in token and '@' not in token
    ]
    # Raw strings keep the regex escapes valid Python. The dot after "www" is
    # escaped so that only real "www." URLs are removed, not every word that
    # happens to begin with "www".
    string = re.sub(r'http\S+|www\.\S+', '', ' '.join(kept_tokens))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub(r'[^\'"A-Za-z\- ]+', ' ', string)
    tokens = re.findall(r"[\w']+|[;:\-()&.,!?\"]", string)
    # Single-character tokens (stray letters, lone hyphens or quotes) are noise.
    return ' '.join(token for token in tokens if len(token) > 1).lower()

In [4]:
# Clean the text column (positional column 1) in a single vectorised pass.
# Assigning row by row through .iloc pays pandas' indexing overhead on every
# row; Series.map applies textcleaning to each value and yields the same result.
text_column = df.columns[1]
df[text_column] = df[text_column].map(textcleaning)

In [5]:
def _read_lines(path):
    """Return the content of the text file at `path`, split on newline characters."""
    with open(path, 'r') as handle:
        # NOTE(review): splitting on the newline character leaves a trailing
        # empty string when the file ends with a newline, which then becomes an
        # empty labelled sample -- confirm whether that is intended.
        return handle.read().split('\n')


# Negative-polarity sentences get class 0, positive-polarity sentences class 1.
texts = _read_lines('polarity-negative-translated.txt')
labels = [0] * len(texts)

positive_texts = _read_lines('polarity-positive-translated.txt')
labels += [1] * len(positive_texts)
texts += positive_texts

# Append the news sentences (column 1 of df) and their encoded labels.
texts += df.iloc[:, 1].tolist()
labels += Y.tolist()

# Every sentence must have exactly one label.
assert len(labels) == len(texts)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from malaya.text_functions import STOPWORDS

Using TensorFlow backend.


In [7]:
# Re-encode the combined label list as integer class ids.
target_encoder = LabelEncoder()
target = target_encoder.fit_transform(labels)

# TF-IDF over 1- to 3-grams; min_df=2 ignores terms seen in fewer than two
# documents. fit_transform learns the vocabulary and vectorises in one call.
# NOTE(review): the vectoriser is fitted on all texts before the train/test
# split, so its vocabulary and idf weights also reflect the held-out rows.
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
vectors = tfidf.fit_transform(texts)
vectors.shape

(14279, 45344)

In [10]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the split,
# and therefore every metric computed from it, reproducible across runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    vectors, target, test_size=0.2, random_state=42
)

In [8]:
from sklearn import metrics

In [11]:
# Wrap both splits in XGBoost's DMatrix container together with their labels.
train_d = xgb.DMatrix(train_X, label=train_Y)
test_d = xgb.DMatrix(test_X, label=test_Y)

# Booster settings: two-class soft-probability objective, scored with
# multi-class log loss.
params_xgd = dict(
    min_child_weight=10.0,
    max_depth=7,
    objective='multi:softprob',
    max_delta_step=1.8,
    num_class=2,
    colsample_bytree=0.4,
    subsample=0.8,
    learning_rate=0.1,
    gamma=0.65,
    silent=True,
    eval_metric='mlogloss',
)

# Train for at most 10000 rounds, stopping once the validation log loss has
# not improved for 100 rounds; progress is printed every 5 rounds.
# NOTE(review): the held-out split is used both for early stopping here and
# for the final report, so the reported scores may be optimistic.
model = xgb.train(
    params_xgd,
    train_d,
    num_boost_round=10000,
    evals=[(test_d, 'validation')],
    early_stopping_rounds=100,
    verbose_eval=5,
)

[0]	validation-mlogloss:0.688333
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.667841
[10]	validation-mlogloss:0.654925
[15]	validation-mlogloss:0.645515
[20]	validation-mlogloss:0.638027
[25]	validation-mlogloss:0.632308
[30]	validation-mlogloss:0.627757
[35]	validation-mlogloss:0.624292
[40]	validation-mlogloss:0.620937
[45]	validation-mlogloss:0.618054
[50]	validation-mlogloss:0.61473
[55]	validation-mlogloss:0.612535
[60]	validation-mlogloss:0.610556
[65]	validation-mlogloss:0.6082
[70]	validation-mlogloss:0.606007
[75]	validation-mlogloss:0.603937
[80]	validation-mlogloss:0.601853
[85]	validation-mlogloss:0.600333
[90]	validation-mlogloss:0.598523
[95]	validation-mlogloss:0.597057
[100]	validation-mlogloss:0.596089
[105]	validation-mlogloss:0.594734
[110]	validation-mlogloss:0.593502
[115]	validation-mlogloss:0.592232
[120]	validation-mlogloss:0.590867
[125]	validation-mlogloss:0.589868
[130]	validation-mlogloss:0.58882
[135]	validat

In [12]:
# Score the held-out rows, limiting the booster to the trees up to the best
# early-stopping round.
test_scores = model.predict(
    xgb.DMatrix(test_X), ntree_limit=model.best_ntree_limit
)
# argmax over axis 1 picks the highest-scoring class index for each row.
predicted = np.argmax(test_scores, axis=1)

report = metrics.classification_report(
    test_Y, predicted, target_names=['negative', 'positive']
)
print(report)

             precision    recall  f1-score   support

   negative       0.70      0.64      0.67      1332
   positive       0.71      0.76      0.73      1524

avg / total       0.71      0.71      0.70      2856



In [13]:
# Persist the trained booster so it can be reloaded without retraining.
with open('xgboost-sentiment.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [14]:
# Persist the fitted vectoriser: new text must be transformed with the same
# vocabulary and idf weights the booster was trained on.
with open('tfidf-xgboost-sentiment.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)