In [1]:
import warnings
# Install a blanket 'ignore' filter: every warning raised after this point is
# suppressed, including deprecation notices from the libraries imported below.
warnings.filterwarnings('ignore')

# NOTE(review): `os` is not referenced in any of the visible cells — confirm it
# is still needed before removing it.
import os
import joblib

from xgboost import XGBClassifier
import pandas as pd
# Do not truncate wide text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

from pandarallel import pandarallel
# Set up pandarallel's workers (the recorded run reported 4 workers).
# NOTE(review): nothing in the visible cells calls pandarallel directly;
# presumably the emogest transformers rely on it — confirm.
pandarallel.initialize()

from gensim.models import Word2Vec

from emogest.pipeline import ThaiPreprocessor, ThaiTokenizer, MeanEmbeddingVectorizer
from sklearn.pipeline import make_pipeline

from emogest.datasets import Dataset

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Compression level passed to joblib.dump when the fitted pipeline is saved.
COMPRESS_LEVEL = 3
# Seed supplied to the XGBoost classifier's random_state.
SEED = 76

In [3]:
# Previously trained Word2Vec embedding, later handed to MeanEmbeddingVectorizer.
# NOTE(review): the file name suggests 256-dimensional vectors — confirm.
w2v_model = Word2Vec.load("models/tweet_embedding_256.model")

# Prepared tweet table; it is expected to provide the "emoji" label column
# used when the Dataset is built.
tweets = pd.read_csv(filepath_or_buffer="datasets/prepared_data.csv")

In [4]:
# Wrap the tweet table in the project Dataset helper, using "emoji" as the
# target column and test_size=0.05.
# NOTE(review): SEED is not passed here; if Dataset shuffles while splitting,
# the train/test partition may differ between runs — confirm against
# emogest.datasets.Dataset.
dataset = Dataset(
    tweets,
    y_column="emoji",
    test_size=0.05,
)

In [8]:
# Ordered stages of the emoji classifier, from raw tweet text to prediction:
# preprocessing -> tokenization -> mean word-embedding features -> XGBoost.
pipeline_steps = [
    ThaiPreprocessor(),
    ThaiTokenizer(remove_entities=True),
    MeanEmbeddingVectorizer(w2v_model=w2v_model),
    # "multi:softprob" makes XGBoost emit a probability for every class.
    XGBClassifier(random_state=SEED, objective="multi:softprob"),
]

# make_pipeline derives each step name from the lower-cased class name.
pipeline = make_pipeline(*pipeline_steps)

In [10]:
%time pipeline.fit(dataset.training_set.tweet, dataset.training_set.emoji)

CPU times: user 19h 4min 6s, sys: 7min 26s, total: 19h 11min 32s
Wall time: 6h 37min 58s


Pipeline(steps=[('thaipreprocessor', ThaiPreprocessor()),
                ('thaitokenizer', ThaiTokenizer(remove_entities=True)),
                ('meanembeddingvectorizer',
                 <emogest.pipeline.MeanEmbeddingVectorizer object at 0x1383eef70>),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_...n',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1,
                               objective='multi:softprob', random_state=76,
                         

In [13]:
# Persist the fitted pipeline; joblib.dump returns the list of files it wrote,
# which is displayed as the cell output.
joblib.dump(
    value=pipeline,
    filename="models/emoji_xgb.pipeline",
    compress=COMPRESS_LEVEL,
)

['emoji_xgb.pipeline']