In [1]:
import re
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from unidecode import unidecode

In [2]:
rules_normalizer = {
    'experience': 'pengalaman',
    'bagasi': 'bagasi',
    'kg': 'kampung',
    'kilo': 'kilogram',
    'g': 'gram',
    'grm': 'gram',
    'k': 'okay',
    'abgkat': 'abang dekat',
    'abis': 'habis',
    'ade': 'ada',
    'adoi': 'aduh',
    'adoii': 'aduhh',
    'aerodarat': 'kapal darat',
    'agkt': 'angkat',
    'ahh': 'ah',
    'ailior': 'air liur',
    'airasia': 'air asia x',
    'airasiax': 'penerbangan',
    'airline': 'penerbangan',
    'airlines': 'penerbangan',
    'airport': 'lapangan terbang',
    'airpot': 'lapangan terbang',
    'aje': 'sahaja',
    'ajelah': 'sahajalah',
    'ajer': 'sahaja',
    'ak': 'aku',
    'aq': 'aku',
    'all': 'semua',
    'ambik': 'ambil',
    'amek': 'ambil',
    'amer': 'amir',
    'amik': 'ambil',
    'ana': 'saya',
    'angkt': 'angkat',
    'anual': 'tahunan',
    'apapun': 'apa pun',
    'ape': 'apa',
    'arab': 'arab',
    'area': 'kawasan',
    'aritu': 'hari itu',
    'ask': 'tanya',
    'astro': 'astro',
    'at': 'pada',
    'attitude': 'sikap',
    'babi': 'khinzir',
    'back': 'belakang',
    'bag': 'beg',
    'bang': 'abang',
    'bangla': 'bangladesh',
    'banyk': 'banyak',
    'bard': 'pujangga',
    'bargasi': 'bagasi',
    'bawak': 'bawa',
    'bawanges': 'bawang',
    'be': 'jadi',
    'behave': 'berkelakuan baik',
    'belagak': 'berlagak',
    'berdisiplin': 'berdisplin',
    'berenti': 'berhenti',
    'beskal': 'basikal',
    'bff': 'rakan karib',
    'bg': 'bagi',
    'bgi': 'bagi',
    'biase': 'biasa',
    'big': 'besar',
    'bike': 'basikal',
    'bile': 'bila',
    'binawe': 'binatang',
    'bini': 'isteri',
    'bkn': 'bukan',
    'bla': 'bila',
    'blom': 'belum',
    'bnyak': 'banyak',
    'body': 'tubuh',
    'bole': 'boleh',
    'boss': 'bos',
    'bowling': 'boling',
    'bpe': 'berapa',
    'brand': 'jenama',
    'brg': 'barang',
    'briefing': 'taklimat',
    'brng': 'barang',
    'bro': 'abang',
    'bru': 'baru',
    'bruntung': 'beruntung',
    'bsikal': 'basikal',
    'btnggjwb': 'bertanggungjawab',
    'btul': 'betul',
    'buatlh': 'buatlah',
    'buh': 'letak',
    'buka': 'buka',
    'but': 'tetapi',
    'bwk': 'bawa',
    'by': 'dengan',
    'byr': 'bayar',
    'bz': 'sibuk',
    'camera': 'kamera',
    'camni': 'macam ini',
    'cane': 'macam mana',
    'cant': 'tak boleh',
    'carakerja': 'cara kerja',
    'care': 'jaga',
    'cargo': 'kargo',
    'cctv': 'kamera litar tertutup',
    'celako': 'celaka',
    'cer': 'cerita',
    'cheap': 'murah',
    'check': 'semak',
    'ciput': 'sedikit',
    'cite': 'cerita',
    'citer': 'cerita',
    'ckit': 'sikit',
    'ckp': 'cakap',
    'class': 'kelas',
    'cm': 'macam',
    'cmni': 'macam ini',
    'cmpak': 'campak',
    'committed': 'komited',
    'company': 'syarikat',
    'complain': 'aduan',
    'corn': 'jagung',
    'couldnt': 'tak boleh',
    'cr': 'cari',
    'crew': 'krew',
    'cube': 'cuba',
    'cuma': 'cuma',
    'curinyaa': 'curinya',
    'cust': 'pelanggan',
    'customer': 'pelanggan',
    'd': 'di',
    'da': 'dah',
    'dn': 'dan',
    'dahh': 'dah',
    'damaged': 'rosak',
    'dapek': 'dapat',
    'day': 'hari',
    'dazrin': 'dazrin',
    'dbalingnya': 'dibalingnya',
    'de': 'ada',
    'deep': 'dalam',
    'deliberately': 'sengaja',
    'depa': 'mereka',
    'dessa': 'desa',
    'dgn': 'dengan',
    'dh': 'dah',
    'didunia': 'di dunia',
    'diorang': 'mereka',
    'diorng': 'mereka',
    'direct': 'secara terus',
    'diving': 'junam',
    'dkt': 'dekat',
    'dlempar': 'dilempar',
    'dlm': 'dalam',
    'dlt': 'padam',
    'dlu': 'dulu',
    'done': 'siap',
    'dont': 'jangan',
    'dorg': 'mereka',
    'dpermudhkn': 'dipermudahkan',
    'dpt': 'dapat',
    'dr': 'dari',
    'dri': 'dari',
    'dsb': 'dan sebagainya',
    'dy': 'dia',
    'educate': 'mendidik',
    'ensure': 'memastikan',
    'everything': 'semua',
    'ewahh': 'wah',
    'expect': 'sangka',
    'fb': 'facebook',
    'fired': 'pecat',
    'first': 'pertama',
    'fkr': 'fikir',
    'flight': 'kapal terbang',
    'for': 'untuk',
    'free': 'percuma',
    'friend': 'kawan',
    'fyi': 'untuk pengetahuan anda',
    'gantila': 'gantilah',
    'gantirugi': 'ganti rugi',
    'gentlemen': 'lelaki budiman',
    'gerenti': 'jaminan',
    'gile': 'gila',
    'gk': 'juga',
    'gnti': 'ganti',
    'go': 'pergi',
    'gomen': 'kerajaan',
    'goment': 'kerajaan',
    'good': 'baik',
    'ground': 'tanah',
    'guarno': 'macam mana',
    'hampa': 'mereka',
    'hampeh': 'teruk',
    'hanat': 'jahanam',
    'handle': 'kawal',
    'handling': 'kawalan',
    'hanta': 'hantar',
    'haritu': 'hari itu',
    'hate': 'benci',
    'have': 'ada',
    'hawau': 'celaka',
    'henpon': 'telefon',
    'heran': 'hairan',
    'him': 'dia',
    'his': 'dia',
    'hmpa': 'mereka',
    'hntr': 'hantar',
    'hotak': 'otak',
    'hr': 'hari',
    'i': 'saya',
    'hrga': 'harga',
    'hrp': 'harap',
    'hu': 'sedih',
    'humble': 'merendah diri',
    'ibon': 'ikon',
    'ichi': 'inci',
    'idung': 'hidung',
    'if': 'jika',
    'ig': 'instagram',
    'iklas': 'ikhlas',
    'improve': 'menambah baik',
    'in': 'masuk',
    'isn t': 'tidak',
    'isyaallah': 'insyallah',
    'ja': 'sahaja',
    'japan': 'jepun',
    'jd': 'jadi',
    'je': 'saja',
    'jee': 'saja',
    'jek': 'saja',
    'jepun': 'jepun',
    'jer': 'saja',
    'jerr': 'saja',
    'jez': 'saja',
    'jg': 'juga',
    'jgk': 'juga',
    'jgn': 'jangan',
    'jgnla': 'janganlah',
    'jibake': 'celaka',
    'jjur': 'jujur',
    'job': 'kerja',
    'jobscope': 'skop kerja',
    'jogja': 'jogjakarta',
    'jpam': 'jpam',
    'jth': 'jatuh',
    'jugak': 'juga',
    'ka': 'ke',
    'kalo': 'kalau',
    'kalu': 'kalau',
    'kang': 'nanti',
    'kantoi': 'temberang',
    'kasi': 'beri',
    'kat': 'dekat',
    'kbye': 'ok bye',
    'kearah': 'ke arah',
    'kecik': 'kecil',
    'keja': 'kerja',
    'keje': 'kerja',
    'kejo': 'kerja',
    'keksongan': 'kekosongan',
    'kemana': 'ke mana',
    'kene': 'kena',
    'kenekan': 'kenakan',
    'kesah': 'kisah',
    'ketempat': 'ke tempat',
    'kije': 'kerja',
    'kijo': 'kerja',
    'kiss': 'cium',
    'kite': 'kita',
    'kito': 'kita',
    'kje': 'kerja',
    'kjr': 'kerja',
    'kk': 'okay',
    'kmi': 'kami',
    'kt': 'kat',
    'tlg': 'tolong',
    'kl': 'kuala lumpur',
    'klai': 'kalau',
    'klau': 'kalau',
    'klia': 'klia',
    'klo': 'kalau',
    'klu': 'kalau',
    'kn': 'kan',
    'knapa': 'kenapa',
    'kne': 'kena',
    'ko': 'kau',
    'kompom': 'sah',
    'korang': 'kamu semua',
    'korea': 'korea',
    'korg': 'kamu semua',
    'kot': 'mungkin',
    'krja': 'kerja',
    'ksalahan': 'kesalahan',
    'kta': 'kita',
    'kuar': 'keluar',
    'kut': 'mungkin',
    'la': 'lah',
    'laa': 'lah',
    'lahabau': 'celaka',
    'lahanat': 'celaka',
    'lainda': 'lain dah',
    'lak': 'pula',
    'last': 'akhir',
    'le': 'lah',
    'leader': 'ketua',
    'leave': 'pergi',
    'ler': 'lah',
    'less': 'kurang',
    'letter': 'surat',
    'lg': 'lagi',
    'lgi': 'lagi',
    'lngsong': 'langsung',
    'lol': 'hehe',
    'lorr': 'lah',
    'low': 'rendah',
    'lps': 'lepas',
    'luggage': 'bagasi',
    'lumbe': 'lumba',
    'lyak': 'layak',
    'maap': 'maaf',
    'maapkan': 'maafkan',
    'mahai': 'mahal',
    'mampos': 'mampus',
    'mart': 'kedai',
    'mau': 'mahu',
    'mcm': 'macam',
    'mcmtu': 'macam itu',
    'memerlukn': 'memerlukan',
    'mengembirakan': 'menggembirakan',
    'mengmbilnyer': 'mengambilnya',
    'mengtasi': 'mengatasi',
    'mg': 'memang',
    'mihak': 'memihak',
    'min': 'admin',
    'mingu': 'minggu',
    'mintak': 'minta',
    'mjtuhkn': 'menjatuhkan',
    'mkyong': 'mak yong',
    'mlibatkn': 'melibatkan',
    'mmg': 'memang',
    'mmnjang': 'memanjang',
    'mmpos': 'mampus',
    'mn': 'mana',
    'mna': 'mana',
    'mntak': 'minta',
    'mntk': 'minta',
    'mnyusun': 'menyusun',
    'mood': 'suasana',
    'most': 'paling',
    'mr': 'tuan',
    'msa': 'masa',
    'msia': 'malaysia',
    'mst': 'mesti',
    'mu': 'awak',
    'much': 'banyak',
    'muko': 'muka',
    'mum': 'emak',
    'n': 'dan',
    'nah': 'nah',
    'nanny': 'nenek',
    'napo': 'kenapa',
    'nati': 'nanti',
    'ngan': 'dengan',
    'ngn': 'dengan',
    'ni': 'ini',
    'nie': 'ini',
    'nii': 'ini',
    'nk': 'nak',
    'nmpk': 'nampak',
    'nye': 'nya',
    'ofis': 'pejabat',
    'ohh': 'oh',
    'oii': 'hoi',
    'one': 'satu',
    'online': 'dalam talian',
    'or': 'atau',
    'org': 'orang',
    'orng': 'orang',
    'otek': 'otak',
    'p': 'pergi',
    'paid': 'dah bayar',
    'palabana': 'kepala otak',
    'pasni': 'lepas ini',
    'passengers': 'penumpang',
    'passengger': 'penumpang',
    'pastu': 'lepas itu',
    'pd': 'pada',
    'pegi': 'pergi',
    'pekerje': 'pekerja',
    'pekrja': 'pekerja',
    'perabih': 'perabis',
    'perkerja': 'pekerja',
    'pg': 'pergi',
    'phuii': 'puih',
    'pikir': 'fikir',
    'pilot': 'juruterbang',
    'pk': 'fikir',
    'pkerja': 'pekerja',
    'pkerjaan': 'pekerjaan',
    'pki': 'pakai',
    'please': 'tolong',
    'pls': 'tolong',
    'pn': 'pun',
    'pnh': 'pernah',
    'pnt': 'penat',
    'pnya': 'punya',
    'pon': 'pun',
    'priority': 'keutamaan',
    'properties': 'harta benda',
    'ptugas': 'petugas',
    'pub': 'kelab malam',
    'pulak': 'pula',
    'puye': 'punya',
    'pwrcuma': 'percuma',
    'pyahnya': 'payahnya',
    'quality': 'kualiti',
    'quit': 'keluar',
    'ramly': 'ramly',
    'rege': 'harga',
    'reger': 'harga',
    'report': 'laporan',
    'resigned': 'meletakkan jawatan',
    'respect': 'hormat',
    'rizal': 'rizal',
    'rosak': 'rosak',
    'rosok': 'rosak',
    'rse': 'rasa',
    'sacked': 'buang',
    'sado': 'tegap',
    'salute': 'sanjung',
    'sam': 'sama',
    'same': 'sama',
    'samp': 'sampah',
    'sbb': 'sebab',
    'sbgai': 'sebagai',
    'sblm': 'sebelum',
    'sblum': 'sebelum',
    'sbnarnya': 'sebenarnya',
    'sbum': 'sebelum',
    'sdg': 'sedang',
    'sebb': 'sebab',
    'sebijik': 'sebiji',
    'see': 'lihat',
    'seen': 'dilihat',
    'selangor': 'selangor',
    'selfie': 'swafoto',
    'sempoi': 'cantik',
    'senaraihitam': 'senarai hitam',
    'seorg': 'seorang',
    'service': 'perkhidmatan',
    'sgt': 'sangat',
    'shared': 'kongsi',
    'shirt': 'kemeja',
    'shut': 'tutup',
    'sib': 'nasib',
    'skali': 'sekali',
    'sket': 'sikit',
    'sma': 'sama',
    'smoga': 'semoga',
    'smpoi': 'cantik',
    'sndiri': 'sendiri',
    'sndr': 'sendiri',
    'sndri': 'sendiri',
    'sne': 'sana',
    'so': 'jadi',
    'sop': 'tatacara pengendalian piawai',
    'sorang': 'seorang',
    'spoting': 'pembintikan',
    'sronok': 'seronok',
    'ssh': 'susah',
    'staff': 'staf',
    'standing': 'berdiri',
    'start': 'mula',
    'steady': 'mantap',
    'stiap': 'setiap',
    'stress': 'stres',
    'student': 'pelajar',
    'study': 'belajar',
    'studycase': 'kajian kes',
    'sure': 'pasti',
    'sykt': 'syarikat',
    'tah': 'entah',
    'taik': 'tahi',
    'takan': 'tak akan',
    'takat': 'setakat',
    'takde': 'tak ada',
    'takkan': 'tak akan',
    'taknak': 'tak nak',
    'tang': 'tentang',
    'tanggungjawab': 'bertanggungjawab',
    'taraa': 'sementara',
    'tau': 'tahu',
    'tbabit': 'terbabit',
    'team': 'pasukan',
    'terbaekk': 'terbaik',
    'teruknye': 'teruknya',
    'tgk': 'tengok',
    'that': 'itu',
    'thinking': 'fikir',
    'those': 'itu',
    'time': 'masa',
    'tk': 'tak',
    'tnggongjwb': 'tanggungjawab',
    'tngok': 'tengok',
    'tngu': 'tunggu',
    'to': 'kepada',
    'tosak': 'rosak',
    'tp': 'tapi',
    'tpi': 'tapi',
    'tpon': 'telefon',
    'transfer': 'pindah',
    'trgelak': 'tergelak',
    'ts': 'tan sri',
    'tstony': 'tan sri tony',
    'tu': 'itu',
    'tuh': 'itu',
    'tula': 'itulah',
    'umeno': 'umno',
    'unfortunately': 'malangnya',
    'unhappy': 'tidak gembira',
    'up': 'naik',
    'upkan': 'naikkan',
    'ur': 'awak',
    'utk': 'untuk',
    'very': 'sangat',
    'viral': 'tular',
    'vote': 'undi',
    'warning': 'amaran',
    'warranty': 'waranti',
    'wassap': 'whatsapp',
    'wat': 'apa',
    'weii': 'wei',
    'well': 'maklumlah',
    'win': 'menang',
    'with': 'dengan',
    'wt': 'buat',
    'x': 'tak',
    'tw': 'tahu',
    'ye': 'ya',
    'yee': 'ya',
    'yg': 'yang',
    'yng': 'yang',
    'you': 'awak',
    'your': 'awak',
    'sakai': 'selekeh',
    'rmb': 'billion ringgit',
    'rmj': 'juta ringgit',
    'rmk': 'ribu ringgit',
    'rm': 'ringgit',
}

In [3]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = [e for e in hujung if word.endswith(e)]
    if len(hujung_result):
        hujung_result = max(hujung_result, key = len)
        if len(hujung_result):
            word = word[: -len(hujung_result)]
    permulaan_result = [e for e in permulaan if word.startswith(e)]
    if len(permulaan_result):
        permulaan_result = max(permulaan_result, key = len)
        if len(permulaan_result):
            word = word[len(permulaan_result) :]
    return word

def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string.lower()).strip()
    string = [rules_normalizer.get(w, w) for w in string.split()]
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X

In [4]:
classification_textcleaning('kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya')

'raja benar sangat benci rakyat minyak naik gala'

In [5]:
df = pd.read_csv('sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
import json
with open('bm-amazon.json') as fopen:
    amazon = json.load(fopen)
    
with open('bm-imdb.json') as fopen:
    imdb = json.load(fopen)
    
with open('bm-yelp.json') as fopen:
    yelp = json.load(fopen)
    
texts += amazon['negative']
labels += [0] * len(amazon['negative'])
texts += amazon['positive']
labels += [1] * len(amazon['positive'])

texts += imdb['negative']
labels += [0] * len(imdb['negative'])
texts += imdb['positive']
labels += [1] * len(imdb['positive'])

texts += yelp['negative']
labels += [0] * len(yelp['negative'])
texts += yelp['positive']
labels += [1] * len(yelp['positive'])

In [7]:
import os
for i in [i for i in os.listdir('negative') if 'Store' not in i]:
    with open('negative/'+i) as fopen:
        a = json.load(fopen)
        texts += a
        labels += [0] * len(a)

In [8]:
import os
for i in [i for i in os.listdir('positive') if 'Store' not in i]:
    with open('positive/'+i) as fopen:
        a = json.load(fopen)
        texts += a
        labels += [1] * len(a)

In [9]:
x, y = [], []
for i in range(len(texts)):
    s = classification_textcleaning(texts[i])
    if len(s) > 2:
        x.append(s)
        y.append(labels[i])

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

In [11]:
tfidf = TfidfVectorizer(ngram_range=(1, 3),min_df=2).fit(x)
vectors = tfidf.transform(x)
vectors.shape

(674245, 862729)

In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, y, test_size = 0.2)

In [13]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': 2,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent': True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:0.682564
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.646801
[10]	validation-mlogloss:0.62514
[15]	validation-mlogloss:0.609519
[20]	validation-mlogloss:0.598222
[25]	validation-mlogloss:0.589751
[30]	validation-mlogloss:0.582024
[35]	validation-mlogloss:0.576187
[40]	validation-mlogloss:0.571209
[45]	validation-mlogloss:0.566791
[50]	validation-mlogloss:0.562978
[55]	validation-mlogloss:0.559323
[60]	validation-mlogloss:0.556244
[65]	validation-mlogloss:0.553185
[70]	validation-mlogloss:0.550547
[75]	validation-mlogloss:0.548156
[80]	validation-mlogloss:0.545777
[85]	validation-mlogloss:0.54353
[90]	validation-mlogloss:0.541328
[95]	validation-mlogloss:0.539422
[100]	validation-mlogloss:0.537604
[105]	validation-mlogloss:0.535899
[110]	validation-mlogloss:0.534286
[115]	validation-mlogloss:0.532623
[120]	validation-mlogloss:0.53116
[125]	validation-mlogloss:0.529637
[130]	validation-mlogloss:0.528223
[135]	valida

[1165]	validation-mlogloss:0.461095
[1170]	validation-mlogloss:0.460988
[1175]	validation-mlogloss:0.46089
[1180]	validation-mlogloss:0.460788
[1185]	validation-mlogloss:0.460721
[1190]	validation-mlogloss:0.460643
[1195]	validation-mlogloss:0.460542
[1200]	validation-mlogloss:0.460447
[1205]	validation-mlogloss:0.460357
[1210]	validation-mlogloss:0.460269
[1215]	validation-mlogloss:0.460183
[1220]	validation-mlogloss:0.460095
[1225]	validation-mlogloss:0.460009
[1230]	validation-mlogloss:0.459913
[1235]	validation-mlogloss:0.459824
[1240]	validation-mlogloss:0.459748
[1245]	validation-mlogloss:0.45964
[1250]	validation-mlogloss:0.459568
[1255]	validation-mlogloss:0.459487
[1260]	validation-mlogloss:0.459401
[1265]	validation-mlogloss:0.459307
[1270]	validation-mlogloss:0.459216
[1275]	validation-mlogloss:0.45913
[1280]	validation-mlogloss:0.459049
[1285]	validation-mlogloss:0.458965
[1290]	validation-mlogloss:0.458886
[1295]	validation-mlogloss:0.458798
[1300]	validation-mlogloss:0.45

[2310]	validation-mlogloss:0.447823
[2315]	validation-mlogloss:0.44779
[2320]	validation-mlogloss:0.447746
[2325]	validation-mlogloss:0.447705
[2330]	validation-mlogloss:0.447668
[2335]	validation-mlogloss:0.447644
[2340]	validation-mlogloss:0.447608
[2345]	validation-mlogloss:0.447566
[2350]	validation-mlogloss:0.447536
[2355]	validation-mlogloss:0.447511
[2360]	validation-mlogloss:0.447484
[2365]	validation-mlogloss:0.447456
[2370]	validation-mlogloss:0.447428
[2375]	validation-mlogloss:0.447394
[2380]	validation-mlogloss:0.447351
[2385]	validation-mlogloss:0.447316
[2390]	validation-mlogloss:0.447287
[2395]	validation-mlogloss:0.447243
[2400]	validation-mlogloss:0.44721
[2405]	validation-mlogloss:0.447192
[2410]	validation-mlogloss:0.447132
[2415]	validation-mlogloss:0.447105
[2420]	validation-mlogloss:0.447056
[2425]	validation-mlogloss:0.447012
[2430]	validation-mlogloss:0.446983
[2435]	validation-mlogloss:0.446945
[2440]	validation-mlogloss:0.446913
[2445]	validation-mlogloss:0.4

[3455]	validation-mlogloss:0.441832
[3460]	validation-mlogloss:0.441805
[3465]	validation-mlogloss:0.441783
[3470]	validation-mlogloss:0.441763
[3475]	validation-mlogloss:0.441731
[3480]	validation-mlogloss:0.441723
[3485]	validation-mlogloss:0.441711
[3490]	validation-mlogloss:0.441693
[3495]	validation-mlogloss:0.441678
[3500]	validation-mlogloss:0.441667
[3505]	validation-mlogloss:0.44164
[3510]	validation-mlogloss:0.441623
[3515]	validation-mlogloss:0.441618
[3520]	validation-mlogloss:0.441587
[3525]	validation-mlogloss:0.441567
[3530]	validation-mlogloss:0.441552
[3535]	validation-mlogloss:0.441539
[3540]	validation-mlogloss:0.44153
[3545]	validation-mlogloss:0.44151
[3550]	validation-mlogloss:0.441498
[3555]	validation-mlogloss:0.441477
[3560]	validation-mlogloss:0.441451
[3565]	validation-mlogloss:0.441435
[3570]	validation-mlogloss:0.441432
[3575]	validation-mlogloss:0.441414
[3580]	validation-mlogloss:0.4414
[3585]	validation-mlogloss:0.441374
[3590]	validation-mlogloss:0.4413

[4600]	validation-mlogloss:0.438788
[4605]	validation-mlogloss:0.438787
[4610]	validation-mlogloss:0.438778
[4615]	validation-mlogloss:0.438764
[4620]	validation-mlogloss:0.438764
[4625]	validation-mlogloss:0.438743
[4630]	validation-mlogloss:0.438724
[4635]	validation-mlogloss:0.438713
[4640]	validation-mlogloss:0.438706
[4645]	validation-mlogloss:0.438704
[4650]	validation-mlogloss:0.438703
[4655]	validation-mlogloss:0.438684
[4660]	validation-mlogloss:0.438661
[4665]	validation-mlogloss:0.438651
[4670]	validation-mlogloss:0.438642
[4675]	validation-mlogloss:0.438638
[4680]	validation-mlogloss:0.438629
[4685]	validation-mlogloss:0.438618
[4690]	validation-mlogloss:0.438614
[4695]	validation-mlogloss:0.438602
[4700]	validation-mlogloss:0.438595
[4705]	validation-mlogloss:0.438595
[4710]	validation-mlogloss:0.438589
[4715]	validation-mlogloss:0.438578
[4720]	validation-mlogloss:0.438565
[4725]	validation-mlogloss:0.438545
[4730]	validation-mlogloss:0.438548
[4735]	validation-mlogloss:0

[5745]	validation-mlogloss:0.437008
[5750]	validation-mlogloss:0.437012
[5755]	validation-mlogloss:0.437009
[5760]	validation-mlogloss:0.437015
[5765]	validation-mlogloss:0.437011
[5770]	validation-mlogloss:0.43701
[5775]	validation-mlogloss:0.437013
[5780]	validation-mlogloss:0.43701
[5785]	validation-mlogloss:0.437004
[5790]	validation-mlogloss:0.436991
[5795]	validation-mlogloss:0.436987
[5800]	validation-mlogloss:0.436987
[5805]	validation-mlogloss:0.436988
[5810]	validation-mlogloss:0.436983
[5815]	validation-mlogloss:0.436976
[5820]	validation-mlogloss:0.436969
[5825]	validation-mlogloss:0.436966
[5830]	validation-mlogloss:0.436963
[5835]	validation-mlogloss:0.436951
[5840]	validation-mlogloss:0.43696
[5845]	validation-mlogloss:0.43696
[5850]	validation-mlogloss:0.436956
[5855]	validation-mlogloss:0.436944
[5860]	validation-mlogloss:0.436938
[5865]	validation-mlogloss:0.436925
[5870]	validation-mlogloss:0.436916
[5875]	validation-mlogloss:0.436903
[5880]	validation-mlogloss:0.436

In [14]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = ['negative','positive']))

              precision    recall  f1-score   support

    negative       0.81      0.80      0.81     70356
    positive       0.79      0.80      0.79     64493

   micro avg       0.80      0.80      0.80    134849
   macro avg       0.80      0.80      0.80    134849
weighted avg       0.80      0.80      0.80    134849



In [15]:
text = (
    'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
)
model.predict(
    xgb.DMatrix(tfidf.transform([classification_textcleaning(text)])),
    ntree_limit = model.best_ntree_limit,
)

array([[0.844284  , 0.15571605]], dtype=float32)

In [16]:
delattr(tfidf, 'stop_words_')

In [17]:
import pickle
with open('xgboost-sentiment.pkl','wb') as fopen:
    pickle.dump(model,fopen)

with open('tfidf-xgboost-sentiment.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)