In [1]:
import pandas as pd
import numpy as np
from malaya.text_functions import deep_sentiment_textcleaning, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
import re
from unidecode import unidecode
import xgboost as xgb
import pickle



In [2]:
def deep_sentiment_textcleaning(string):
    string = re.sub('http\S+|www.\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub('[^\'\"A-Za-z\- ]+', '', string)
    return ' '.join([i for i in re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", string) if len(i)>1 and i not in STOPWORDS]).lower()

In [3]:
df = pd.read_csv('sentiment/sentiment-news-bahasa-v5.csv')
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [4]:
processed_strings = [deep_sentiment_textcleaning(i) for i in df.iloc[:,1]]

In [5]:
target = LabelEncoder().fit_transform(df.iloc[:,0])
tfidf = TfidfVectorizer(ngram_range=(1, 4),min_df=2).fit(processed_strings)
vectors = tfidf.transform(processed_strings)
vectors.shape

(3685, 6515)

In [6]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)

In [7]:
from sklearn import metrics

In [8]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': 2,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent': True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:0.685212
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.66206
[10]	validation-mlogloss:0.65334
[15]	validation-mlogloss:0.649789
[20]	validation-mlogloss:0.649466
[25]	validation-mlogloss:0.648826
[30]	validation-mlogloss:0.647562
[35]	validation-mlogloss:0.64745
[40]	validation-mlogloss:0.648345
[45]	validation-mlogloss:0.648269
[50]	validation-mlogloss:0.649185
[55]	validation-mlogloss:0.648525
[60]	validation-mlogloss:0.650573
[65]	validation-mlogloss:0.651418
[70]	validation-mlogloss:0.652932
[75]	validation-mlogloss:0.653267
[80]	validation-mlogloss:0.653429
[85]	validation-mlogloss:0.652862
[90]	validation-mlogloss:0.653178
[95]	validation-mlogloss:0.65308
[100]	validation-mlogloss:0.653788
[105]	validation-mlogloss:0.653858
[110]	validation-mlogloss:0.654537
[115]	validation-mlogloss:0.655805
[120]	validation-mlogloss:0.655073
[125]	validation-mlogloss:0.656123
[130]	validation-mlogloss:0.656703
Stopping. Bes

In [9]:
predicted = np.argmax(model.predict(xgb.DMatrix(train_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(train_Y, predicted, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.76      0.14      0.24       999
   positive       0.69      0.98      0.81      1949

avg / total       0.71      0.69      0.61      2948



In [10]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = ['negative','positive']))

             precision    recall  f1-score   support

   negative       0.72      0.09      0.16       296
   positive       0.61      0.98      0.75       441

avg / total       0.66      0.62      0.51       737



In [11]:
with open('xgboost-sentiment.pkl','wb') as fopen:
    pickle.dump(model,fopen)

In [12]:
with open('xgboost-tfidf.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)