In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from keras.layers import Dropout, Embedding
from keras.preprocessing import text, sequence
from keras.utils.data_utils import pad_sequences
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.models import model_from_json
from keras.models import load_model

In [None]:
# Location of the pre-trained 300-d word vectors (text ".vec" file) on the
# mounted Drive; presumably the fastText Vietnamese vectors - confirm.
EMBEDDING_FILE = '/content/drive/MyDrive/ProjectBigData/Data/cc.vi.300.vec'

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
max_features = 2500  # vocabulary cap handed to the Tokenizer as num_words
maxlen = 500         # every sequence is padded/truncated to this many tokens
embed_size = 300     # width of each embedding vector (columns of the matrix)

In [None]:
# Load the combined dataset and replace missing post texts with the
# placeholder string 'none' so later text processing never sees NaN.
df = pd.read_csv(
    '/content/drive/MyDrive/ProjectBigData/Data/comb_extraSNS_ReINTEL.csv'
).assign(post_message=lambda frame: frame['post_message'].fillna('none'))

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(
    df,
    test_size=0.15,
    random_state=123,
)
print(train.shape, test.shape)

(6467, 2) (1142, 2)


In [None]:
# Texts become 1-D arrays of strings; labels are taken with a double-bracket
# selection so they stay 2-D, shape (n_samples, 1).
X_train = train["post_message"].fillna("none").to_numpy()
X_test = test["post_message"].fillna("none").to_numpy()
y_train = train[['label']].to_numpy()
y_test = test[['label']].to_numpy()

In [None]:
# Sanity check: texts are 1-D, labels are column vectors.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)))

(6467,) (6467, 1) (1142,) (1142, 1)


In [None]:
# Build the vocabulary from the training texts only, so no test-set words
# leak into the word index. Only the `max_features` most frequent words are
# kept when texts are later converted to sequences.
tokenizer = text.Tokenizer(
    num_words=max_features,
    lower=True,
)
tokenizer.fit_on_texts(X_train.tolist())

In [None]:
# Replace each text by its list of word indices.
# NOTE: this rebinds X_train / X_test, so the cell is meant to run once per
# kernel session (a second run would receive sequences instead of texts).
X_train, X_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [None]:
## Create Vector
# Bring every index sequence to the same length (`maxlen`) so the batches
# form a rectangular array matching the model's Input(shape=(maxlen,)).
X_train, X_test = (
    pad_sequences(X_train, maxlen=maxlen),
    pad_sequences(X_test, maxlen=maxlen),
)

In [None]:
# Parse the pre-trained vectors into {word: float32 vector of length embed_size}.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right at most `embed_size` times: the last
        # `embed_size` fields are the coefficients and everything before them
        # is the token, even if the token itself contains a space.
        values = line.rstrip().rsplit(' ', embed_size)
        # Skip anything that does not carry exactly `embed_size` numbers.
        # This drops the "<vocab_size> <dim>" header line of .vec files, which
        # would otherwise be stored as a 1-element vector and silently
        # broadcast over a whole matrix row if that token is in the vocabulary.
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i of the matrix holds the vector of the word whose tokenizer index is i;
# index 0 is never assigned by the Tokenizer, hence the "+ 1".
word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
# Words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((num_words, embed_size))

In [None]:
# Copy the pre-trained vector of every in-vocabulary word into its row.
# Indices at or beyond `max_features` are outside the matrix and are skipped;
# words missing from the pre-trained file keep their zero row.
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
def recall_m(y_true, y_pred):
    """Recall of the rounded predictions: true positives / actual positives.

    Both counts are obtained by clipping to [0, 1], rounding and summing over
    the whole tensor. `K.epsilon()` in the denominator avoids a division by
    zero when there are no positive labels.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions: true positives / predicted positives.

    Both counts are obtained by clipping to [0, 1], rounding and summing over
    the whole tensor. `K.epsilon()` in the denominator avoids a division by
    zero when nothing is predicted positive.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m`.

    `K.epsilon()` keeps the result finite when precision and recall are both
    zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
# RepeatVector is only referenced by the disabled experiment further down.
from tensorflow.keras.layers import RepeatVector

# One padded sequence of `maxlen` word indices per sample.
inp = Input(shape=(maxlen,))

# The embedding table must have exactly as many rows as `embedding_matrix`,
# i.e. `num_words` = min(max_features, vocabulary size + 1). Using
# `max_features` here breaks weight loading whenever the fitted vocabulary is
# smaller than the cap; the two values coincide otherwise.
x = Embedding(num_words, embed_size, weights=[embedding_matrix], trainable=True)(inp)
x = SpatialDropout1D(0.35)(x)
# return_sequences=True keeps one output per time step for the Conv1D below.
x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)

# Disabled experiment (needs a `content_input` that is not defined here):
# content_expanded = RepeatVector(maxlen)(content_input)
# concat = concatenate([x, content_expanded])
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)

# Summarise the sequence with both its average and its maximum activation.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

# Single sigmoid unit: the output is the probability of the positive label.
out = Dense(1, activation='sigmoid')(x)

model = Model(inp, out)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m, precision_m, recall_m])



In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 800)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 800, 300)     750000      ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 800, 300)    0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 800, 256)     439296      ['spatial_dropout1d[0][0]']  

In [None]:
# Training configuration.
batch_size = 32
epochs = 5

# NOTE: the held-out test split doubles as validation data here, so the
# per-epoch validation metrics are measured on the same rows that are used
# for the final evaluation.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# evaluate() returns the loss followed by the compiled metrics in order:
# 'acc', f1_m, precision_m, recall_m.
(loss, accuracy, f1_score, precision, recall) = model.evaluate(
    X_test, y_test, verbose=0
)

In [None]:
# Keras-side evaluation results: loss, accuracy, F1, precision, recall.
print(loss, accuracy, f1_score, precision, recall)

0.32000046968460083 0.8984237909317017 0.8117678165435791 0.8488360047340393 0.7980484366416931


In [None]:
# NOTE: importing `f1_score` rebinds the name that previously held the float
# returned by model.evaluate().
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
)

# Sigmoid outputs of the model for every test sample.
predict = model.predict(X_test)



In [None]:
# Hard 0/1 decisions for the threshold-based metrics.
rounded_predict = np.round(predict)

accuracy = accuracy_score(y_test, rounded_predict)
# Store the result under its own name: assigning it to `f1_score` would
# overwrite the imported sklearn function and make this cell fail on re-run.
macro_f1 = f1_score(y_test, rounded_predict, average='macro')
# ROC AUC ranks samples by score, so it must receive the predicted
# probabilities; feeding rounded labels collapses the curve to one point.
roc_auc = roc_auc_score(y_test, predict)

# Print the evaluation metrics
print('Accuracy Score:', accuracy)
print('F1 Score:', macro_f1)
print('ROC AUC Score:', roc_auc)

Accuracy Score: 0.8984238178633975
F1 Score: 0.8752735147349591
ROC AUC Score: 0.8687676857348944
