## Import lib

In [1]:
import os
path = '/content/drive/MyDrive/ProjectBigData/Data'
os.chdir(path)

In [2]:
!pip install pyvi

import numpy as np
from tqdm import tqdm
np.random.seed(42)
import pandas as pd
from pyvi import ViTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tensorflow.keras.layers import RepeatVector
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import BatchNormalization, MaxPooling1D, Conv1D#, Merge
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.models import model_from_json, load_model
from keras.utils.data_utils import pad_sequences

import warnings
warnings.filterwarnings('ignore')


Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.9 pyvi-0.1.1 sklearn-crfsuite-0.3.6


In [3]:
EMBEDDING_FILE= '/content/drive/MyDrive/ProjectBigData/Data/cc.vi.300.vec'
max_features = 2500
maxlen = 500
embed_size = 300
batch_size = 32
epochs = 5
l2_reg = .00001
filter_sizes = [3,4,5]
num_filters = 32

## Read dataset

In [4]:
df = pd.read_csv('/content/drive/MyDrive/ProjectBigData/Data/comb_extraSNS_ReINTEL.csv')
df['post_message']=df['post_message'].fillna('none')

In [5]:
train, test = train_test_split(df, test_size=0.15, random_state=123)
print(train.shape, test.shape)
X_train = train["post_message"].fillna("none").values
y_train = train[['label']].values
X_test = test["post_message"].fillna("none").values
y_test = test[['label']].values

(6467, 2) (1142, 2)


## Tokenizers

In [6]:
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [7]:
## Create Vector
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [8]:
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Metrics

In [9]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Build BiLSTM

In [10]:
inp_lstm = Input(shape=(maxlen,))

lstm = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp_lstm)
lstm = SpatialDropout1D(0.35)(lstm)
lstm = Bidirectional(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(lstm)

x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(lstm)

avg_pool = GlobalAveragePooling1D()(lstm)
max_pool = GlobalMaxPooling1D()(lstm)
lstm = concatenate([avg_pool, max_pool])

out_lstm = Dense(1, activation='sigmoid')(lstm)

modelBiLSTM = Model(inp_lstm, out_lstm)

## Build CNN

In [11]:
inp_cnn = Input(shape=(maxlen,))
cnn = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp_cnn)
cnn = SpatialDropout1D(0.3)(cnn)
cnn = Reshape((maxlen, embed_size, 1))(cnn)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
                activation='elu')(cnn)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
                activation='elu')(cnn)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
                activation='elu')(cnn)

maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)

z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
z = Flatten()(z)
z = Dropout(0.35)(z)
outp_cnn = Dense(1, activation="sigmoid")(z)



model_cnn = Model(inputs=inp_cnn, outputs=outp_cnn)

## Combine BiLSTM and CNN

In [12]:
combined_layer = concatenate([modelBiLSTM.output, model_cnn.output])
dense_layer = Dense(64, activation='relu')(combined_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer)
comb_model = Model(inputs=[modelBiLSTM.input, model_cnn.input], outputs=output_layer)

In [18]:
comb_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [14]:
comb_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 500, 300)     750000      ['input_2[0][0]']                
                                                                                                  
 spatial_dropout1d_1 (SpatialDr  (None, 500, 300)    0           ['embedding_1[0][0]']            
 opout1D)                                                                                         
                                                                                                  
 input_1 (InputLayer)           [(None, 500)]        0           []                         

In [29]:
comb_model.fit([X_train, X_train], y_train,
               epochs=7,
               batch_size=batch_size)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x78cfdaa69900>

In [30]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
predict = comb_model.predict([X_test, X_test])



In [31]:
rounded_predict = np.round(predict)
rounded_predict
accuracy = accuracy_score(y_test, rounded_predict)
f1_score = f1_score(y_test, rounded_predict, average='macro')
roc_auc = roc_auc_score(y_test, rounded_predict)

# Print the evaluation metrics
print('Accuracy Score:', accuracy)
print('F1 Score:', f1_score)
print('ROC AUC Score:', roc_auc)

Accuracy Score: 0.8984238178633975
F1 Score: 0.8715958436724566
ROC AUC Score: 0.8556733063307995


## Another comb

In [None]:
from tensorflow.keras.layers import Input, Average
models = [modelBiLSTM, model_cnn]
asemble_inp = Input(shape=(maxlen,))
model_outs = [model(asemble_inp) for model in models]
ensemble_output = Average()(model_outs)
ensemble_model = Model(inputs=asemble_inp, outputs=ensemble_output, name='ensemble')

ensemble_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',f1_m, precision_m, recall_m])

In [None]:
history = ensemble_model.fit(X_train,  y_train,
                            validation_data = (X_test, y_test),
                            batch_size = batch_size,
                            epochs = epochs,
                            verbose = 1)