# Import datasets and libraries

In [None]:
# Download two files by Google Drive file ID using the gdown CLI.
# NOTE(review): presumably these are the t15/t16 CSVs read in the preprocessing
# cell — confirm. The .npy embedding files loaded later in the notebook are not
# fetched by this cell.
!gdown 1SaSq8kwvNmxq2HoQBenhXC3ejM8BU70d
!gdown 1uGv2afj67P9BGEMwFPyv_IopjMzaqMuG

In [None]:
import os
import random
import re

import keras
import numpy as np
import pandas as pd
from keras.layers import Input, Dense, GRU, Embedding, Dropout, LSTM, Concatenate, SimpleRNN, Bidirectional
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Preprocess

In [None]:
def str_to_npa(s):
  """Parse a whitespace-separated string of numbers into a 1-D float array.

  Splitting on *any* whitespace (instead of single spaces followed by manual
  removal of empty tokens) also copes with tabs and line breaks between the
  numbers, and yields an empty float array for an empty/blank string.

  Args:
    s: text such as ``"0.1  -2.5e-1 3"`` (brackets already stripped by the caller).

  Returns:
    ``np.ndarray`` of dtype float with one entry per number in ``s``.

  Raises:
    ValueError: if a token cannot be converted by ``float``.
  """
  return np.array([float(token) for token in s.split()], dtype=float)

def remove_punctuations(data):
    """Return ``data`` with every character that is neither a word character
    (``\\w``) nor whitespace (``\\s``) deleted."""
    return re.sub(r'[^\w\s]', r'', data)

def remove_url(data):
    """Delete every occurrence of the literal, case-sensitive text ``URL``.

    The match is a plain substring match, so ``URL`` is also removed when it
    appears inside a longer token.
    """
    return re.sub(r'URL', r'', data)

def remove_double_spaces(data):
    """Collapse each run of one or more space characters into a single space.

    Only the space character is affected; tabs and newlines are left alone and
    leading/trailing spaces are reduced to one, not stripped.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', data)

def get_tokens(data):
    """Split ``data`` on single space characters and return the pieces as a list.

    Because the separator is exactly one space, consecutive spaces produce empty
    strings in the result and an empty input yields ``['']``.
    """
    tokens = data.split(' ')
    return tokens

def remove_stopwords(data, stopwords=None):
    """Drop stop words from ``data`` and re-join the remaining words with single spaces.

    Args:
        data: input text; it is split on any whitespace.
        stopwords: optional container of words to drop (membership is tested
            with ``in``, case-sensitively). When omitted, the notebook-level
            ``stop_words`` global is used, as before.

    Returns:
        The surviving words joined by one space each.

    Raises:
        NameError: if ``stopwords`` is omitted and no global ``stop_words`` exists.
    """
    if stopwords is None:
        # NOTE(review): `stop_words` is not defined in any cell visible in this
        # notebook; it must be created before the preprocessing cell runs.
        stopwords = stop_words
    kept = [word for word in data.split() if word not in stopwords]
    return ' '.join(kept)

In [None]:
# Load the Twitter15 (d1) and Twitter16 (d2) tables; this cell uses their
# 'n2v' and 'text' columns.
d1 = pd.read_csv('t15_text_n2v.csv', encoding='utf-8')
d2 = pd.read_csv('t16_text_n2v.csv', encoding='utf-8')

# 'n2v' is stored as text: strip the square brackets, then parse each entry
# into a float array.
for frame in (d1, d2):
    frame['n2v'] = frame['n2v'].apply(
        lambda raw: str_to_npa(raw.replace('[', '').replace(']', ''))
    )

# Text cleaning steps, applied in this order to every post; the last step turns
# each post into a list of tokens.
text_pipeline = (
    remove_punctuations,
    remove_url,
    remove_double_spaces,
    remove_stopwords,
    get_tokens,
)

content1 = d1['text']
content2 = d2['text']
for step in text_pipeline:
    content1 = content1.apply(step)
    content2 = content2.apply(step)

# Encoding for testing
model_enc = dict(enumerate(('RNN', 'BiRNN', 'GRU', 'BiGRU', 'LSTM', 'BiLSTM'), start=1))



# Twitter15

In [None]:
# Length (in tokens) of the longest Twitter15 post; sequences are padded to it.
maxlen = max((len(tokens) for tokens in content1), default=0)

# Embedding matrix used to initialise the frozen Embedding layers below.
e1 = np.load('t15_w2v_emb_matrix.npy')

# Number of rows of the embedding matrix, i.e. the Embedding layer's input_dim.
# The previous `e1[0].shape` was the shape tuple of a single row (the embedding
# width), which is not a valid `num_words` for the Tokenizer.
vocab1_size = e1.shape[0]

## Without Node Embeddings

In [None]:
def _t15_text_model(recurrent_layer):
  """Build the Twitter15 text-only binary classifier around one recurrent layer.

  The six public builders below differ only in their recurrent encoder, so the
  shared wiring lives here. ``maxlen`` and ``e1`` are read from the notebook
  globals when the builder is called.

  Args:
    recurrent_layer: a fresh Keras layer instance (e.g. ``GRU(64)`` or
      ``Bidirectional(LSTM(64))``) applied to the embedded token sequence.

  Returns:
    An uncompiled ``Model`` from ``text_input`` (shape ``(maxlen,)``) to a
    single sigmoid unit.
  """
  embedding_matrix = np.asarray(e1)
  text_input = Input(shape=(maxlen,), name='text_input')
  # Frozen embeddings initialised from e1; both dimensions are taken from the
  # matrix instead of hard-coding the width as 100.
  embedded = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                       weights=[e1], trainable=False)(text_input)
  encoded = Dropout(0.2)(recurrent_layer(embedded))
  hidden = Dense(32, activation='relu')(encoded)
  output = Dense(1, activation='sigmoid')(hidden)
  return Model(inputs=text_input, outputs=output)

def RNN_model():
  """Text-only model with a ``SimpleRNN(64)`` encoder."""
  return _t15_text_model(SimpleRNN(64))

def BiRNN_model():
  """Text-only model with a bidirectional ``SimpleRNN(64)`` encoder."""
  return _t15_text_model(Bidirectional(SimpleRNN(64)))

def GRU_model():
  """Text-only model with a ``GRU(64)`` encoder."""
  return _t15_text_model(GRU(64))

def BiGRU_model():
  """Text-only model with a bidirectional ``GRU(64)`` encoder."""
  return _t15_text_model(Bidirectional(GRU(64)))

def LSTM_model():
  """Text-only model with an ``LSTM(64)`` encoder."""
  return _t15_text_model(LSTM(64))

def BiLSTM_model():
  """Text-only model with a bidirectional ``LSTM(64)`` encoder."""
  return _t15_text_model(Bidirectional(LSTM(64)))

In [None]:
# Builders for each architecture name in `model_enc` (the definitions that are
# current when this cell runs).
builders = {
  'RNN': RNN_model,
  'BiRNN': BiRNN_model,
  'GRU': GRU_model,
  'BiGRU': BiGRU_model,
  'LSTM': LSTM_model,
  'BiLSTM': BiLSTM_model,
}

# Ten independent runs, each on its own random 80/20 split of Twitter15.
for test in range(1, 11):
  # randomize train-test-split
  random_state = random.randint(1, 100)
  train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
      content1, d1['label'].values, test_size=0.2, random_state=random_state
  )

  # Tokenize with a tokenizer fitted on the training posts only. `num_words` is
  # the row count of e1 (an int) so token ids stay inside the Embedding table.
  # NOTE(review): ids are assigned by this freshly fitted tokenizer while e1's
  # rows are fixed — confirm e1 was built with a matching word-to-id mapping.
  tokenizer1 = Tokenizer(num_words=np.asarray(e1).shape[0])
  tokenizer1.fit_on_texts(train_embeddings)
  sequences_train1 = tokenizer1.texts_to_sequences(train_embeddings)
  train_embeddings = pad_sequences(sequences_train1, maxlen=maxlen, padding='post')
  sequences_test1 = tokenizer1.texts_to_sequences(test_embeddings)
  test_embeddings = pad_sequences(sequences_test1, maxlen=maxlen, padding='post')

  # One result file per run, numbered by this loop's own counter (the original
  # used `j`, which this loop never defines).
  filename = "results/t15/w2v/without/output" + str(test) + ".txt"
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # `with` closes the file even if training raises part-way through.
  with open(filename, "a") as file:
    for i in model_enc.keys():
      model = builders.get(model_enc[i], BiLSTM_model)()
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      print(f"Training Model {model_enc[i]}")
      model.fit(train_embeddings, train_labels, epochs=8, batch_size=64, validation_split=0.2)
      score = model.evaluate(test_embeddings, test_labels, verbose=0)
      file.write(f"Model {model_enc[i]} Test Accuracy: {score[1]}" + '\n')
      # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
      y_pred = (model.predict(test_embeddings).ravel() > 0.5).astype(int)
      report = classification_report(test_labels, y_pred)
      file.write(report + '\n')

## With Node2Vec Node Embeddings

In [None]:
# Reference split of Twitter15 with node2vec vectors; the builders below read
# `train_n2v.shape[1]` to size the graph input. (`train_test_split` is already
# imported in the notebook's import cell.)
train_embeddings, test_embeddings, train_n2v, test_n2v, train_labels, test_labels = train_test_split(
    content1, np.array(d1['n2v'].tolist()), d1['label'].values, test_size=0.2, random_state=42
)

def _t15_n2v_model(recurrent_layer):
  """Build the Twitter15 text + node2vec binary classifier around one recurrent layer.

  The six public builders below differ only in their recurrent encoder, so the
  shared wiring lives here. ``maxlen``, ``e1`` and ``train_n2v`` are read from
  the notebook globals when the builder is called.

  Args:
    recurrent_layer: a fresh Keras layer instance (e.g. ``GRU(64)`` or
      ``Bidirectional(LSTM(64))``) applied to the embedded token sequence.

  Returns:
    An uncompiled ``Model`` taking ``[text_input, graph_input]`` and producing
    a single sigmoid unit.
  """
  embedding_matrix = np.asarray(e1)
  text_input = Input(shape=(maxlen,), name='text_input')
  graph_input = Input(shape=(train_n2v.shape[1],), name='graph_input')
  # Frozen embeddings initialised from e1; both dimensions are taken from the
  # matrix instead of hard-coding the width as 100.
  embedded = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                       weights=[e1], trainable=False)(text_input)
  encoded = Dropout(0.2)(recurrent_layer(embedded))
  # The node vector gets its own dense projection before being joined with the
  # text encoding.
  graph_features = Dense(32, activation='relu')(graph_input)
  merged = Concatenate()([encoded, graph_features])
  hidden = Dense(32, activation='relu')(merged)
  output = Dense(1, activation='sigmoid')(hidden)
  return Model(inputs=[text_input, graph_input], outputs=output)

def RNN_model():
  """Text + node2vec model with a ``SimpleRNN(64)`` encoder."""
  return _t15_n2v_model(SimpleRNN(64))

def BiRNN_model():
  """Text + node2vec model with a bidirectional ``SimpleRNN(64)`` encoder."""
  return _t15_n2v_model(Bidirectional(SimpleRNN(64)))

def GRU_model():
  """Text + node2vec model with a ``GRU(64)`` encoder."""
  return _t15_n2v_model(GRU(64))

def BiGRU_model():
  """Text + node2vec model with a bidirectional ``GRU(64)`` encoder."""
  return _t15_n2v_model(Bidirectional(GRU(64)))

def LSTM_model():
  """Text + node2vec model with an ``LSTM(64)`` encoder."""
  return _t15_n2v_model(LSTM(64))

def BiLSTM_model():
  """Text + node2vec model with a bidirectional ``LSTM(64)`` encoder."""
  return _t15_n2v_model(Bidirectional(LSTM(64)))

In [None]:
# Builders for each architecture name in `model_enc` (the definitions that are
# current when this cell runs).
builders = {
  'RNN': RNN_model,
  'BiRNN': BiRNN_model,
  'GRU': GRU_model,
  'BiGRU': BiGRU_model,
  'LSTM': LSTM_model,
  'BiLSTM': BiLSTM_model,
}

# Ten independent runs, each on its own random 80/20 split of Twitter15 text,
# node2vec vectors and labels.
for test in range(1, 11):
  # randomize train-test-split
  random_state = random.randint(1, 100)
  train_embeddings, test_embeddings, train_n2v, test_n2v, train_labels, test_labels = train_test_split(
      content1, np.array(d1['n2v'].tolist()), d1['label'].values, test_size=0.2, random_state=random_state
  )

  # Tokenize with a tokenizer fitted on the training posts only. `num_words` is
  # the row count of e1 (an int) so token ids stay inside the Embedding table.
  # NOTE(review): ids are assigned by this freshly fitted tokenizer while e1's
  # rows are fixed — confirm e1 was built with a matching word-to-id mapping.
  tokenizer1 = Tokenizer(num_words=np.asarray(e1).shape[0])
  tokenizer1.fit_on_texts(train_embeddings)
  sequences_train1 = tokenizer1.texts_to_sequences(train_embeddings)
  train_embeddings = pad_sequences(sequences_train1, maxlen=maxlen, padding='post')
  sequences_test1 = tokenizer1.texts_to_sequences(test_embeddings)
  test_embeddings = pad_sequences(sequences_test1, maxlen=maxlen, padding='post')

  # One result file per run, numbered by this loop's own counter (the original
  # used `j`, which this loop never defines).
  filename = "results/t15/w2v/n2v/output" + str(test) + ".txt"
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # `with` closes the file even if training raises part-way through.
  with open(filename, "a") as file:
    for i in model_enc.keys():
      model = builders.get(model_enc[i], BiLSTM_model)()
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      print(f"Training Model {model_enc[i]}")
      model.fit([train_embeddings, train_n2v], train_labels, epochs=8, batch_size=64, validation_split=0.2)
      score = model.evaluate([test_embeddings, test_n2v], test_labels, verbose=0)
      file.write(f"Model {model_enc[i]} Test Accuracy: {score[1]}" + '\n')
      # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
      y_pred = (model.predict([test_embeddings, test_n2v]).ravel() > 0.5).astype(int)
      report = classification_report(test_labels, y_pred)
      file.write(report + '\n')

## With DeepWalk Node Embeddings

In [None]:
# Node embeddings for Twitter15, stored row-wise as a new 'dw' column of d1.
# NOTE(review): assumes the rows of this file are in the same order as d1 — confirm.
node_emb = np.load('100d/t15_dw_emb.npy')
d1['dw'] = list(node_emb)

def _t15_dw_model(recurrent_layer):
  """Build the Twitter15 text + DeepWalk binary classifier around one recurrent layer.

  The six public builders below differ only in their recurrent encoder, so the
  shared wiring lives here. ``maxlen``, ``e1`` and ``node_emb`` are read from
  the notebook globals when the builder is called.

  Args:
    recurrent_layer: a fresh Keras layer instance (e.g. ``GRU(64)`` or
      ``Bidirectional(LSTM(64))``) applied to the embedded token sequence.

  Returns:
    An uncompiled ``Model`` taking ``[text_input, graph_input]`` and producing
    a single sigmoid unit.
  """
  embedding_matrix = np.asarray(e1)
  text_input = Input(shape=(maxlen,), name='text_input')
  graph_input = Input(shape=(node_emb.shape[1],), name='graph_input')
  # Frozen embeddings initialised from e1; both dimensions are taken from the
  # matrix instead of hard-coding the width as 100.
  embedded = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                       weights=[e1], trainable=False)(text_input)
  encoded = Dropout(0.2)(recurrent_layer(embedded))
  # The node vector gets its own dense projection before being joined with the
  # text encoding.
  graph_features = Dense(32, activation='relu')(graph_input)
  merged = Concatenate()([encoded, graph_features])
  hidden = Dense(32, activation='relu')(merged)
  output = Dense(1, activation='sigmoid')(hidden)
  return Model(inputs=[text_input, graph_input], outputs=output)

def RNN_model():
  """Text + DeepWalk model with a ``SimpleRNN(64)`` encoder."""
  return _t15_dw_model(SimpleRNN(64))

def BiRNN_model():
  """Text + DeepWalk model with a bidirectional ``SimpleRNN(64)`` encoder."""
  return _t15_dw_model(Bidirectional(SimpleRNN(64)))

def GRU_model():
  """Text + DeepWalk model with a ``GRU(64)`` encoder."""
  return _t15_dw_model(GRU(64))

def BiGRU_model():
  """Text + DeepWalk model with a bidirectional ``GRU(64)`` encoder."""
  return _t15_dw_model(Bidirectional(GRU(64)))

def LSTM_model():
  """Text + DeepWalk model with an ``LSTM(64)`` encoder."""
  return _t15_dw_model(LSTM(64))

def BiLSTM_model():
  """Text + DeepWalk model with a bidirectional ``LSTM(64)`` encoder."""
  return _t15_dw_model(Bidirectional(LSTM(64)))

In [None]:
# Builders for each architecture name in `model_enc` (the definitions that are
# current when this cell runs).
builders = {
  'RNN': RNN_model,
  'BiRNN': BiRNN_model,
  'GRU': GRU_model,
  'BiGRU': BiGRU_model,
  'LSTM': LSTM_model,
  'BiLSTM': BiLSTM_model,
}

# Ten independent runs, each on its own random 80/20 split of Twitter15 text,
# DeepWalk vectors and labels.
for j in range(1, 11):
  # randomize
  random_state = random.randint(1, 100)
  # This is the only split per run: the original repeated it immediately
  # afterwards with a fixed random_state=42, which made all ten runs use the
  # same partition.
  train_embeddings, test_embeddings, train_n2v, test_n2v, train_labels, test_labels = train_test_split(
      content1, np.array(d1['dw'].tolist()), d1['label'].values, test_size=0.2, random_state=random_state
  )

  # Tokenize with a tokenizer fitted on the training posts only. `num_words` is
  # the row count of e1 (an int) so token ids stay inside the Embedding table.
  # NOTE(review): ids are assigned by this freshly fitted tokenizer while e1's
  # rows are fixed — confirm e1 was built with a matching word-to-id mapping.
  tokenizer1 = Tokenizer(num_words=np.asarray(e1).shape[0])
  tokenizer1.fit_on_texts(train_embeddings)
  sequences_train1 = tokenizer1.texts_to_sequences(train_embeddings)
  train_embeddings = pad_sequences(sequences_train1, maxlen=maxlen, padding='post')
  sequences_test1 = tokenizer1.texts_to_sequences(test_embeddings)
  test_embeddings = pad_sequences(sequences_test1, maxlen=maxlen, padding='post')

  filename = "results/t15/w2v/dw/output" + str(j) + ".txt"
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # `with` closes the file even if training raises part-way through.
  with open(filename, "a") as file:
    for i in model_enc.keys():
      model = builders.get(model_enc[i], BiLSTM_model)()
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      print(f"Training Model {model_enc[i]}")
      model.fit([train_embeddings, train_n2v], train_labels, epochs=8, batch_size=64, validation_split=0.2)
      score = model.evaluate([test_embeddings, test_n2v], test_labels, verbose=0)
      file.write(f"Model {model_enc[i]} Test Accuracy: {score[1]}" + '\n')
      # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
      y_pred = (model.predict([test_embeddings, test_n2v]).ravel() > 0.5).astype(int)
      report = classification_report(test_labels, y_pred)
      file.write(report + '\n')

# Twitter16

In [None]:
# Length (in tokens) of the longest Twitter16 post; sequences are padded to it.
# This rebinds the global `maxlen` that the model builders read.
maxlen = max((len(tokens) for tokens in content2), default=0)

# Embedding matrix used to initialise the frozen Embedding layers below.
e2 = np.load('t16_w2v_emb_matrix.npy')

# Number of rows of the embedding matrix, i.e. the Embedding layer's input_dim.
# The previous `e2[0].shape` was the shape tuple of a single row (the embedding
# width), which is not a valid `num_words` for the Tokenizer.
vocab2_size = e2.shape[0]

## Without Node Embeddings

In [None]:
def _t16_text_model(recurrent_layer):
  """Build the Twitter16 text-only binary classifier around one recurrent layer.

  The six public builders below differ only in their recurrent encoder, so the
  shared wiring lives here. ``maxlen`` and ``e2`` are read from the notebook
  globals when the builder is called.

  Args:
    recurrent_layer: a fresh Keras layer instance (e.g. ``GRU(64)`` or
      ``Bidirectional(LSTM(64))``) applied to the embedded token sequence.

  Returns:
    An uncompiled ``Model`` from ``text_input`` (shape ``(maxlen,)``) to a
    single sigmoid unit.
  """
  embedding_matrix = np.asarray(e2)
  text_input = Input(shape=(maxlen,), name='text_input')
  # Frozen embeddings initialised from e2; both dimensions are taken from the
  # matrix instead of hard-coding the width as 100.
  embedded = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                       weights=[e2], trainable=False)(text_input)
  encoded = Dropout(0.2)(recurrent_layer(embedded))
  hidden = Dense(32, activation='relu')(encoded)
  output = Dense(1, activation='sigmoid')(hidden)
  return Model(inputs=text_input, outputs=output)

def RNN_model():
  """Text-only model with a ``SimpleRNN(64)`` encoder."""
  return _t16_text_model(SimpleRNN(64))

def BiRNN_model():
  """Text-only model with a bidirectional ``SimpleRNN(64)`` encoder."""
  return _t16_text_model(Bidirectional(SimpleRNN(64)))

def GRU_model():
  """Text-only model with a ``GRU(64)`` encoder."""
  return _t16_text_model(GRU(64))

def BiGRU_model():
  """Text-only model with a bidirectional ``GRU(64)`` encoder."""
  return _t16_text_model(Bidirectional(GRU(64)))

def LSTM_model():
  """Text-only model with an ``LSTM(64)`` encoder."""
  return _t16_text_model(LSTM(64))

def BiLSTM_model():
  """Text-only model with a bidirectional ``LSTM(64)`` encoder."""
  return _t16_text_model(Bidirectional(LSTM(64)))

In [None]:
# Builders for each architecture name in `model_enc` (the definitions that are
# current when this cell runs).
builders = {
  'RNN': RNN_model,
  'BiRNN': BiRNN_model,
  'GRU': GRU_model,
  'BiGRU': BiGRU_model,
  'LSTM': LSTM_model,
  'BiLSTM': BiLSTM_model,
}

# Ten independent runs, each on its own random 80/20 split of Twitter16.
for test in range(1, 11):
  # randomize train-test-split
  random_state = random.randint(1, 100)
  train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
      content2, d2['label'].values, test_size=0.2, random_state=random_state
  )

  # Tokenize with a tokenizer fitted on the training posts only. `num_words` is
  # the row count of e2 (an int) so token ids stay inside the Embedding table.
  # NOTE(review): ids are assigned by this freshly fitted tokenizer while e2's
  # rows are fixed — confirm e2 was built with a matching word-to-id mapping.
  tokenizer2 = Tokenizer(num_words=np.asarray(e2).shape[0])
  tokenizer2.fit_on_texts(train_embeddings)
  sequences_train2 = tokenizer2.texts_to_sequences(train_embeddings)
  train_embeddings = pad_sequences(sequences_train2, maxlen=maxlen, padding='post')
  sequences_test2 = tokenizer2.texts_to_sequences(test_embeddings)
  test_embeddings = pad_sequences(sequences_test2, maxlen=maxlen, padding='post')

  # One result file per run, numbered by this loop's own counter. The original
  # used `j`, left over from an earlier cell, so every run went to one file.
  filename = "results/t16/w2v/without/output" + str(test) + ".txt"
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # `with` closes the file even if training raises part-way through.
  with open(filename, "a") as file:
    for i in model_enc.keys():
      model = builders.get(model_enc[i], BiLSTM_model)()
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      print(f"Training Model {model_enc[i]}")
      model.fit(train_embeddings, train_labels, epochs=8, batch_size=64, validation_split=0.2)
      score = model.evaluate(test_embeddings, test_labels, verbose=0)
      file.write(f"Model {model_enc[i]} Test Accuracy: {score[1]}" + '\n')
      # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
      y_pred = (model.predict(test_embeddings).ravel() > 0.5).astype(int)
      report = classification_report(test_labels, y_pred)
      file.write(report + '\n')

## With Node2Vec Node Embeddings

In [None]:
# Reference split of Twitter16 with node2vec vectors; the builders below read
# `train_n2v.shape[1]` to size the graph input. (`train_test_split` is already
# imported in the notebook's import cell.)
train_embeddings, test_embeddings, train_n2v, test_n2v, train_labels, test_labels = train_test_split(
    content2, np.array(d2['n2v'].tolist()), d2['label'].values, test_size=0.2, random_state=42
)

def _t16_n2v_model(recurrent_layer):
  """Build the Twitter16 text + node2vec binary classifier around one recurrent layer.

  The six public builders below differ only in their recurrent encoder, so the
  shared wiring lives here. ``maxlen``, ``e2`` and ``train_n2v`` are read from
  the notebook globals when the builder is called.

  Args:
    recurrent_layer: a fresh Keras layer instance (e.g. ``GRU(64)`` or
      ``Bidirectional(LSTM(64))``) applied to the embedded token sequence.

  Returns:
    An uncompiled ``Model`` taking ``[text_input, graph_input]`` and producing
    a single sigmoid unit.
  """
  embedding_matrix = np.asarray(e2)
  text_input = Input(shape=(maxlen,), name='text_input')
  graph_input = Input(shape=(train_n2v.shape[1],), name='graph_input')
  # Frozen embeddings initialised from e2; both dimensions are taken from the
  # matrix instead of hard-coding the width as 100.
  embedded = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                       weights=[e2], trainable=False)(text_input)
  encoded = Dropout(0.2)(recurrent_layer(embedded))
  # The node vector gets its own dense projection before being joined with the
  # text encoding.
  graph_features = Dense(32, activation='relu')(graph_input)
  merged = Concatenate()([encoded, graph_features])
  hidden = Dense(32, activation='relu')(merged)
  output = Dense(1, activation='sigmoid')(hidden)
  return Model(inputs=[text_input, graph_input], outputs=output)

def RNN_model():
  """Text + node2vec model with a ``SimpleRNN(64)`` encoder."""
  return _t16_n2v_model(SimpleRNN(64))

def BiRNN_model():
  """Text + node2vec model with a bidirectional ``SimpleRNN(64)`` encoder."""
  return _t16_n2v_model(Bidirectional(SimpleRNN(64)))

def GRU_model():
  """Text + node2vec model with a ``GRU(64)`` encoder."""
  return _t16_n2v_model(GRU(64))

def BiGRU_model():
  """Text + node2vec model with a bidirectional ``GRU(64)`` encoder."""
  return _t16_n2v_model(Bidirectional(GRU(64)))

def LSTM_model():
  """Text + node2vec model with an ``LSTM(64)`` encoder."""
  return _t16_n2v_model(LSTM(64))

def BiLSTM_model():
  """Text + node2vec model with a bidirectional ``LSTM(64)`` encoder."""
  return _t16_n2v_model(Bidirectional(LSTM(64)))

In [None]:
# Builders for each architecture name in `model_enc` (the definitions that are
# current when this cell runs).
builders = {
  'RNN': RNN_model,
  'BiRNN': BiRNN_model,
  'GRU': GRU_model,
  'BiGRU': BiGRU_model,
  'LSTM': LSTM_model,
  'BiLSTM': BiLSTM_model,
}

# Ten independent runs, each on its own random 80/20 split of Twitter16 text,
# node2vec vectors and labels.
for test in range(1, 11):
  # randomize train-test-split
  random_state = random.randint(1, 100)
  # All three arrays come from Twitter16 (content2 / d2). The original paired
  # content2 with d1's node vectors and labels, i.e. with the other dataset.
  train_embeddings, test_embeddings, train_n2v, test_n2v, train_labels, test_labels = train_test_split(
      content2, np.array(d2['n2v'].tolist()), d2['label'].values, test_size=0.2, random_state=random_state
  )

  # Tokenize with a tokenizer fitted on the training posts only. `num_words` is
  # the row count of e2 (an int) so token ids stay inside the Embedding table.
  # NOTE(review): ids are assigned by this freshly fitted tokenizer while e2's
  # rows are fixed — confirm e2 was built with a matching word-to-id mapping.
  tokenizer2 = Tokenizer(num_words=np.asarray(e2).shape[0])
  tokenizer2.fit_on_texts(train_embeddings)
  sequences_train2 = tokenizer2.texts_to_sequences(train_embeddings)
  train_embeddings = pad_sequences(sequences_train2, maxlen=maxlen, padding='post')
  # The test posts go through the same tokenizer as the training posts (the
  # original used the Twitter15 `tokenizer1` here).
  sequences_test2 = tokenizer2.texts_to_sequences(test_embeddings)
  test_embeddings = pad_sequences(sequences_test2, maxlen=maxlen, padding='post')

  # One result file per run, numbered by this loop's own counter. The original
  # used `j`, left over from an earlier cell, so every run went to one file.
  filename = "results/t16/w2v/n2v/output" + str(test) + ".txt"
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # `with` closes the file even if training raises part-way through.
  with open(filename, "a") as file:
    for i in model_enc.keys():
      model = builders.get(model_enc[i], BiLSTM_model)()
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      print(f"Training Model {model_enc[i]}")
      model.fit([train_embeddings, train_n2v], train_labels, epochs=8, batch_size=64, validation_split=0.2)
      score = model.evaluate([test_embeddings, test_n2v], test_labels, verbose=0)
      file.write(f"Model {model_enc[i]} Test Accuracy: {score[1]}" + '\n')
      # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
      y_pred = (model.predict([test_embeddings, test_n2v]).ravel() > 0.5).astype(int)
      report = classification_report(test_labels, y_pred)
      file.write(report + '\n')

## With DeepWalk Node Embeddings

In [None]:
# Node embeddings for Twitter16, stored row-wise as a new 'dw' column of d2.
# NOTE(review): assumes the rows of this file are in the same order as d2 — confirm.
node_emb = np.load('100d/t16_dw_emb.npy')
d2['dw'] = list(node_emb)

def _t16_dw_model(recurrent_layer):
  """Build the Twitter16 text + DeepWalk binary classifier around one recurrent layer.

  The six public builders below differ only in their recurrent encoder, so the
  shared wiring lives here. ``maxlen``, ``e2`` and ``node_emb`` are read from
  the notebook globals when the builder is called.

  Args:
    recurrent_layer: a fresh Keras layer instance (e.g. ``GRU(64)`` or
      ``Bidirectional(LSTM(64))``) applied to the embedded token sequence.

  Returns:
    An uncompiled ``Model`` taking ``[text_input, graph_input]`` and producing
    a single sigmoid unit.
  """
  embedding_matrix = np.asarray(e2)
  text_input = Input(shape=(maxlen,), name='text_input')
  graph_input = Input(shape=(node_emb.shape[1],), name='graph_input')
  # Frozen embeddings initialised from e2; both dimensions are taken from the
  # matrix instead of hard-coding the width as 100.
  embedded = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                       weights=[e2], trainable=False)(text_input)
  encoded = Dropout(0.2)(recurrent_layer(embedded))
  # The node vector gets its own dense projection before being joined with the
  # text encoding.
  graph_features = Dense(32, activation='relu')(graph_input)
  merged = Concatenate()([encoded, graph_features])
  hidden = Dense(32, activation='relu')(merged)
  output = Dense(1, activation='sigmoid')(hidden)
  return Model(inputs=[text_input, graph_input], outputs=output)

def RNN_model():
  """Text + DeepWalk model with a ``SimpleRNN(64)`` encoder."""
  return _t16_dw_model(SimpleRNN(64))

def BiRNN_model():
  """Text + DeepWalk model with a bidirectional ``SimpleRNN(64)`` encoder."""
  return _t16_dw_model(Bidirectional(SimpleRNN(64)))

def GRU_model():
  """Text + DeepWalk model with a ``GRU(64)`` encoder."""
  return _t16_dw_model(GRU(64))

def BiGRU_model():
  """Text + DeepWalk model with a bidirectional ``GRU(64)`` encoder."""
  return _t16_dw_model(Bidirectional(GRU(64)))

def LSTM_model():
  """Text + DeepWalk model with an ``LSTM(64)`` encoder."""
  return _t16_dw_model(LSTM(64))

def BiLSTM_model():
  """Text + DeepWalk model with a bidirectional ``LSTM(64)`` encoder."""
  return _t16_dw_model(Bidirectional(LSTM(64)))

In [None]:
# Builders for each architecture name in `model_enc` (the definitions that are
# current when this cell runs).
builders = {
  'RNN': RNN_model,
  'BiRNN': BiRNN_model,
  'GRU': GRU_model,
  'BiGRU': BiGRU_model,
  'LSTM': LSTM_model,
  'BiLSTM': BiLSTM_model,
}

# Ten independent runs, each on its own random 80/20 split of Twitter16 text,
# DeepWalk vectors and labels.
for j in range(1, 11):
  # randomize
  random_state = random.randint(1, 100)
  # All three arrays come from Twitter16 (content2 / d2). The original split
  # `content1` (the Twitter15 posts) against d2's node vectors and labels.
  train_embeddings, test_embeddings, train_n2v, test_n2v, train_labels, test_labels = train_test_split(
      content2, np.array(d2['dw'].tolist()), d2['label'].values, test_size=0.2, random_state=random_state
  )

  # Tokenize with a tokenizer fitted on the training posts only. `num_words` is
  # the row count of e2 (an int) so token ids stay inside the Embedding table.
  # NOTE(review): ids are assigned by this freshly fitted tokenizer while e2's
  # rows are fixed — confirm e2 was built with a matching word-to-id mapping.
  tokenizer2 = Tokenizer(num_words=np.asarray(e2).shape[0])
  tokenizer2.fit_on_texts(train_embeddings)
  sequences_train2 = tokenizer2.texts_to_sequences(train_embeddings)
  train_embeddings = pad_sequences(sequences_train2, maxlen=maxlen, padding='post')
  sequences_test2 = tokenizer2.texts_to_sequences(test_embeddings)
  test_embeddings = pad_sequences(sequences_test2, maxlen=maxlen, padding='post')

  filename = "results/t16/w2v/dw/output" + str(j) + ".txt"
  os.makedirs(os.path.dirname(filename), exist_ok=True)
  # `with` closes the file even if training raises part-way through.
  with open(filename, "a") as file:
    for i in model_enc.keys():
      model = builders.get(model_enc[i], BiLSTM_model)()
      model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
      print(f"Training Model {model_enc[i]}")
      model.fit([train_embeddings, train_n2v], train_labels, epochs=8, batch_size=64, validation_split=0.2)
      score = model.evaluate([test_embeddings, test_n2v], test_labels, verbose=0)
      file.write(f"Model {model_enc[i]} Test Accuracy: {score[1]}" + '\n')
      # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
      y_pred = (model.predict([test_embeddings, test_n2v]).ravel() > 0.5).astype(int)
      report = classification_report(test_labels, y_pred)
      file.write(report + '\n')