In [28]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import emoji
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
%matplotlib inline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack, csr_matrix, vstack
import pickle as pickle
# Shared VADER analyzer instance; sentiment_emojis() calls its polarity_scores() on each emoji
analyzer_emoji = SentimentIntensityAnalyzer()

### Load data

In [3]:
train_df = pd.read_csv("./data/train.csv")
# Shuffle with a fixed seed so the row order is identical on every run
train_df = shuffle(train_df, random_state=42)
# The test set has no label column
test_df = pd.read_csv("./data/test.csv")

In [4]:
# Preview the first rows of the shuffled training frame
train_df.head()

Unnamed: 0,id,comment,label
14261,train_014261,Chất lượng sản phẩm tuyệt vời Chất lượng sản ...,0
29626,foody_13539,"Gà rán của Texas thì khỏi bàn r , gà miếng bự ...",0
22567,foody_6480,"Nhân viên ở đây siêu dễ thươngg , đang có km m...",0
31975,foody_15888,"Sáng 8h , vào quán , 2 nhân viên ko 1 nụ cười ...",1
10483,train_010483,Ko kẹp đc. Rụng cả mi thật,1


In [5]:
# Preview the first rows of the test frame
test_df.head()

Unnamed: 0,id,comment
0,test_000000,Chưa dùng thử nên chưa biết
1,test_000001,Không đáng tiềnVì ngay đợt sale nên mới mua n...
2,test_000002,Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3,test_000003,Vải đẹp.phom oki luôn.quá ưng
4,test_000004,Chuẩn hàng đóng gói đẹp


### Utilities for extracting emoji sentiment

In [2]:
def extract_emojis(str):
    """Return every emoji character found in the text, in order of appearance.

    The text is scanned one character at a time, so only emoji made of a
    single code point can match; an emoji that occurs several times is
    returned once per occurrence.

    Args:
        str: Text to scan. (The name shadows the builtin; it is kept so that
            existing keyword callers keep working.)

    Returns:
        list: The matching single-character strings, possibly empty.
    """
    # The lookup table moved between releases of the `emoji` package:
    # newer releases expose EMOJI_DATA, older ones expose UNICODE_EMOJI.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        # Some releases nest UNICODE_EMOJI by language code ({'en': {...}, ...});
        # without unwrapping it, no single character would ever match.
        if 'en' in emoji_table:
            emoji_table = emoji_table['en']
    return [c for c in str if c in emoji_table]
def sentiment_emojis(sentence):
    """Average the VADER polarity scores of the emoji found in a sentence.

    Args:
        sentence: Text that may contain emoji.

    Returns:
        list: Four numbers ordered as [neg, neu, pos, compound], each the
        mean of that score over all emoji in the sentence. When the sentence
        contains no emoji the result is [0, 0, 0, 0].
    """
    found = extract_emojis(sentence)
    if not found:
        return [0, 0, 0, 0]

    score_keys = ('neg', 'neu', 'pos', 'compound')
    totals = [0, 0, 0, 0]
    for symbol in found:
        scores = analyzer_emoji.polarity_scores(symbol)
        # Accumulate each score in the fixed neg/neu/pos/compound order
        for idx, key in enumerate(score_keys):
            totals[idx] = totals[idx] + scores[key]

    count = len(found)
    return [total / count for total in totals]
def sentiment_emojis_row(row):
    """Attach the emoji sentiment scores of row['comment'] to the row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'comment' entry.

    Returns:
        The same row object, with 'emoji_neg', 'emoji_neu', 'emoji_pos' and
        'emoji_compound' set from sentiment_emojis().
    """
    scores = sentiment_emojis(row['comment'])
    target_columns = ('emoji_neg', 'emoji_neu', 'emoji_pos', 'emoji_compound')
    # sentiment_emojis() always yields four values, in this same order
    for column, value in zip(target_columns, scores):
        row[column] = value
    return row

## Create features

In [9]:
# Stack train and test rows into one frame so the statistical features are
# computed once for both sets; test rows end up with NaN in 'label'
# because test.csv has no such column.
df = pd.concat([train_df, test_df], sort=True)

### Statistical features

In [13]:
# Fill missing comments BEFORE casting to str: astype(str) turns NaN into the
# literal text 'nan', after which fillna() has nothing left to fill.
df['comment'] = df['comment'].fillna(' ').astype(str)
# Lower-case the comment text
df['comment'] = df['comment'].str.lower()
# Number of whitespace-separated tokens in each comment
df['num_words'] = df['comment'].apply(lambda s: len(s.split()))
# Number of distinct tokens in each comment
df['num_unique_words'] = df['comment'].apply(lambda s: len(set(s.split())))
# Percentage of tokens that are distinct. A blank comment has zero tokens,
# which makes the ratio 0/0 = NaN, so map that case to 0.
df['words_vs_unique'] = (df['num_unique_words'] / df['num_words'] * 100).fillna(0)
# Add the emoji sentiment columns, one row at a time
df = df.apply(sentiment_emojis_row, axis=1)

In [14]:
# Split the combined frame back apart: rows without a label came from the test set
is_unlabeled = df['label'].isnull()
train_df = df[~is_unlabeled]
test_df = df[is_unlabeled]
y_train = train_df['label'].values

In [19]:
# Keep only the engineered numeric columns as model input.
# (EXCLUED_COLS is spelled this way on purpose: the name is kept unchanged
# in case other cells reference it.)
EXCLUED_COLS = ['id', 'comment', 'label']
static_cols = [col for col in train_df.columns if col not in EXCLUED_COLS]
print(static_cols)
# Same column order for both sets, so the feature positions line up
X_train_static = train_df[static_cols].values
X_test_static = test_df[static_cols].values
print(X_train_static.shape, X_test_static.shape)

['num_words', 'num_unique_words', 'words_vs_unique', 'emoji_neg', 'emoji_neu', 'emoji_pos', 'emoji_compound']
(46087, 7) (10981, 7)


### Sent2Vec using tf-idf

In [21]:
# tf-idf vectorizer configuration (fitted in the next cell)
tfidf = TfidfVectorizer(
    sublinear_tf=True,    # sublinear term-frequency scaling
    max_features=10000,   # cap on vocabulary size
    min_df=5,             # lower document-frequency cutoff (absolute count)
    max_df=0.8,           # upper document-frequency cutoff (proportion)
)

In [22]:
# Fit the tf-idf vectorizer on the training comments only ...
train_comments = train_df['comment'].values
X_train_tfidf = tfidf.fit_transform(train_comments)
# ... then reuse the fitted vectorizer to transform the test comments
test_comments = test_df['comment'].values
X_test_tfidf = tfidf.transform(test_comments)

In [29]:
# Persist the fitted tf-idf vectorizer to disk
with open('vectorizer.pk', mode='wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [32]:
# To reload the saved vectorizer later (only unpickle files you trust):
# with open("./vectorizer.pk", "rb") as fin:
#     modeltf = pickle.load(fin)

### Combine statistical and sent2vec features

In [54]:
# Convert the dense statistical features to sparse, then append them as extra
# columns to the right of the tf-idf matrix (same layout for train and test)
train_static_sparse = csr_matrix(X_train_static)
test_static_sparse = csr_matrix(X_test_static)
X_train = hstack([X_train_tfidf, train_static_sparse]).tocsr()
X_test = hstack([X_test_tfidf, test_static_sparse]).tocsr()

In [58]:
# The test set has no labels, so hold out 10% of the training data to evaluate
# the model. A fixed seed keeps the split (and the reported accuracy) reproducible.
X_train_split, X_valid, y_train_split, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

## Train model

In [69]:
# LightGBM training parameters
param = {
    'num_leaves': 100,
    'num_trees': 300,        # LightGBM alias for the number of boosting iterations
    'objective': 'binary',
    "max_bin": 255,
    "learning_rate": 0.1,
    "device": "gpu",         # NOTE(review): presumably needs a GPU-enabled LightGBM build — confirm
    "is_unbalance": True,
}

In [70]:
# Wrap the train / validation splits in LightGBM Dataset objects
train_data = lgb.Dataset(data=X_train_split, label=y_train_split)
valid_data = lgb.Dataset(data=X_valid, label=y_valid)

In [71]:
# `param` sets 'num_trees' (an alias of num_iterations) to 300, and LightGBM uses
# that value instead of the num_boost_round argument. Pass the same number here
# so the call states the number of rounds that is actually trained.
bst = lgb.train(param, train_data, num_boost_round=300, valid_sets=[valid_data])
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels
valid_pred = (bst.predict(X_valid) > 0.5).astype(int)
print("accuracy: {}".format(accuracy_score(y_valid, valid_pred)))



[1]	valid_0's binary_logloss: 0.654694
[2]	valid_0's binary_logloss: 0.622024
[3]	valid_0's binary_logloss: 0.594908
[4]	valid_0's binary_logloss: 0.572158
[5]	valid_0's binary_logloss: 0.551032
[6]	valid_0's binary_logloss: 0.53302
[7]	valid_0's binary_logloss: 0.517216
[8]	valid_0's binary_logloss: 0.502591
[9]	valid_0's binary_logloss: 0.490065
[10]	valid_0's binary_logloss: 0.477383
[11]	valid_0's binary_logloss: 0.466669
[12]	valid_0's binary_logloss: 0.456832
[13]	valid_0's binary_logloss: 0.446664
[14]	valid_0's binary_logloss: 0.438123
[15]	valid_0's binary_logloss: 0.430772
[16]	valid_0's binary_logloss: 0.423855
[17]	valid_0's binary_logloss: 0.416563
[18]	valid_0's binary_logloss: 0.410406
[19]	valid_0's binary_logloss: 0.404737
[20]	valid_0's binary_logloss: 0.399116
[21]	valid_0's binary_logloss: 0.39366
[22]	valid_0's binary_logloss: 0.388729
[23]	valid_0's binary_logloss: 0.383663
[24]	valid_0's binary_logloss: 0.379357
[25]	valid_0's binary_logloss: 0.375744
[26]	valid_

## Save model

In [73]:
print('Saving model...')
# Write the trained booster to a text file; save_model() returns the Booster,
# which the notebook echoes as this cell's output
bst.save_model(filename='model_gbm_sentiment.txt')

Saving model...


<lightgbm.basic.Booster at 0x7efebd0282b0>