In [None]:
# Install the third-party packages used by this notebook.
!pip install sentence_transformers
!pip install jsonlines
# NOTE(review): optuna is not referenced in the visible cells - confirm it is still needed.
!pip install optuna

In [None]:
from sentence_transformers import SentenceTransformer
import re
import numpy as np
import jsonlines
import json
from helper_function import *
from Tweet_Info_Obj import *

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so the project files are reachable.
drive.mount('/content/gdrive')

# Directory the data/label files are read from and the test predictions are written to.
folder='/content/gdrive/My Drive/project-data'

In [None]:
# Sentence-embedding model; convert_sentence_to_embedding reads it through the global `model`.
# NOTE(review): `model` is rebound to the Keras classifier in a later cell, so the
# embedding cells cannot be re-run after that point without re-running this cell first.
model = SentenceTransformer('stsb-mpnet-base-v2')

In [None]:
# Read each split: tweet texts, tweet ids and per-tweet metadata objects.
train_tweets_corpus, train_tweet_id, train_tweet_info = extract_data(folder + '/train.data.jsonl')
dev_tweets_corpus, dev_tweet_id, dev_tweet_info = extract_data(folder + '/dev.data.jsonl')
test_tweets_corpus, test_tweet_id, test_tweet_info = extract_data(folder + '/test.data.jsonl')

# Gold labels exist for the train and dev splits only.
train_data_label = get_labels(folder + '/train.label.json', train_tweet_id)
dev_data_label = get_labels(folder + '/dev.label.json', dev_tweet_id)

# Run the shared text preprocessing over every split.
preprocess_train_tweet_corpous = preprocees_tweets(train_tweets_corpus)
preprocess_test_tweet_corpous = preprocees_tweets(test_tweets_corpus)
preprocess_dev_tweet_corpous = preprocees_tweets(dev_tweets_corpus)

In [None]:
# Binary targets: 'non-rumour' -> 0, every other label -> 1.
TrainY = [int(label != 'non-rumour') for label in train_data_label]
DevY = [int(label != 'non-rumour') for label in dev_data_label]

In [None]:
def convert_sentence_to_embedding(corpus,tweet_info,encoder=None):
  """Embed each tweet thread as one retweet-weighted combination of its sentence vectors.

  Every text in ``corpus[i]`` is encoded and the resulting vectors are summed
  with weight ``(retweet_count + 1) / (total_retweets + n_texts)``, where
  ``total_retweets`` is the sum of ``retweet_count`` over all of
  ``tweet_info[i]`` and ``n_texts`` is ``len(corpus[i])``. The ``+ 1`` keeps
  tweets with zero retweets from being dropped from the combination.

  Args:
    corpus: sequence of threads; each thread is a list of texts.
    tweet_info: per-thread sequences, parallel to ``corpus``, whose items
      expose a ``retweet_count`` attribute.
    encoder: object providing ``start_multi_process_pool``,
      ``encode_multi_process`` and ``stop_multi_process_pool``. Defaults to
      the notebook-level ``model``, looked up when the function is called.

  Returns:
    A list with one 1-D float64 numpy vector per thread, in ``corpus`` order.
  """
  if encoder is None:
    # Resolved at call time so existing two-argument calls keep working.
    encoder = model
  embedd_corp=[]
  pool = encoder.start_multi_process_pool()
  try:
    for i in range(len(corpus)):
      texts = corpus[i]
      infos = tweet_info[i]
      emd = np.asarray(encoder.encode_multi_process(texts, pool))
      # Denominator uses every metadata entry of the thread plus the number of texts.
      denom = sum(t.retweet_count for t in infos) + len(texts)
      weights = np.array(
          [infos[j].retweet_count + 1 for j in range(len(texts))],
          dtype=np.float64,
      )
      # (n_texts,) @ (n_texts, dim) -> (dim,)
      embedd_corp.append((weights / denom) @ emd)
  finally:
    # Always shut the worker processes down, even if encoding fails midway.
    encoder.stop_multi_process_pool(pool)
  return embedd_corp

# One weighted embedding vector per thread, for each split.
train_tweet_embedd_corp = convert_sentence_to_embedding(
    preprocess_train_tweet_corpous, train_tweet_info)
dev_tweet_embedd_corp = convert_sentence_to_embedding(
    preprocess_dev_tweet_corpous, dev_tweet_info)
test_tweet_embedd_corp = convert_sentence_to_embedding(
    preprocess_test_tweet_corpous, test_tweet_info)

In [None]:
import tensorflow as tf
from sklearn.metrics import recall_score,precision_score,f1_score,precision_recall_fscore_support

def print_scores(y_true,y_pred):
    """Print F1, precision and recall (in that order, space separated) for the predictions."""
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f1, precision, recall)

In [None]:
# Sanity check: shape of the first training embedding (the later reshape expects 768 values per vector).
train_tweet_embedd_corp[0].shape

In [None]:
# Classifier: three stacked bidirectional LSTMs followed by a small dense head.
# The last Dense layer has no activation, so the network outputs one raw logit per sample.
# NOTE: this rebinds `model`, which previously referred to the SentenceTransformer encoder.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(500, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(200, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [None]:
# Configure training: binary cross-entropy on raw logits (the final Dense layer has no
# activation), Adam with learning rate 1e-4, plus binary accuracy and the custom
# F1 / precision / recall metric functions.
# NOTE(review): f1_m, precision_m and recall_m are defined in a LATER cell of this notebook;
# on a fresh top-to-bottom run this call raises NameError unless that cell is moved above this one.
# NOTE(review): those metric functions clip y_pred to [0, 1] and round it, while predictions
# elsewhere in this notebook are thresholded at logit 0.0 - confirm the reported metrics are meaningful.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['binary_accuracy',f1_m,precision_m,recall_m])


In [None]:
# Shape the training data for the LSTM stack: one time step of 768 features per sample.
trainX=np.array(train_tweet_embedd_corp).reshape(-1, 1, 768)
# Infer the sample count instead of hard-coding it, so a different dataset size still works.
trainY=np.array(TrainY).reshape(-1,1,1)
# Guard against features and labels getting out of sync.
assert trainX.shape[0]==trainY.shape[0], "trainX and trainY must have the same number of samples"

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives over rounded actual positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are clipped
    to [0, 1] and rounded before being summed; ``K.epsilon()`` is added to the
    denominator to avoid division by zero.

    NOTE(review): the clip-and-round step treats ``y_pred`` as a probability;
    this notebook compiles the model with ``from_logits=True`` - confirm that
    is intended.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives over rounded predicted positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are clipped
    to [0, 1] and rounded before being summed; ``K.epsilon()`` is added to the
    denominator to avoid division by zero.

    NOTE(review): the clip-and-round step treats ``y_pred`` as a probability;
    this notebook compiles the model with ``from_logits=True`` - confirm that
    is intended.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch F1: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is 0 rather than
    a division error when both precision and recall are 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)


In [None]:
# Run tf.function-wrapped code eagerly from here on (global TensorFlow setting).
# NOTE(review): the reason for eager mode is not evident from this notebook - confirm it is still required.
tf.config.run_functions_eagerly(True)
# Train the classifier on the training split for 10 epochs.
model.fit(trainX,trainY,epochs=10)

In [None]:
# Shape the dev data the same way as the training data: one time step of 768 features per sample.
devX=np.array(dev_tweet_embedd_corp).reshape(-1, 1, 768)
# Infer the sample count instead of hard-coding it, so a different dataset size still works.
devY=np.array(DevY).reshape(-1,1,1)
# Guard against features and labels getting out of sync.
assert devX.shape[0]==devY.shape[0], "devX and devY must have the same number of samples"

In [None]:
# Raw logits for the dev split, one row per sample.
pred = model.predict(devX)
# Negative logit -> class 0, everything else -> class 1.
predict = np.where(pred[:, 0] < 0.0, 0, 1).tolist()

In [None]:
# Report F1, precision and recall (in that order) on the dev split.
print_scores(DevY,predict)

In [None]:
from collections import Counter



In [None]:
# Merge train and dev so the final model can be fitted on all labelled data.
new_trainX = np.array(train_tweet_embedd_corp + dev_tweet_embedd_corp).reshape(-1, 1, 768)
new_trainY = np.array(TrainY + DevY).reshape(-1, 1, 1)

In [None]:
# Continue training the same, already-fitted model on train+dev for another 10 epochs
# (the model is not re-initialised between the two fit calls).
model.fit(new_trainX,new_trainY,epochs=10)

In [None]:
# Predict the test split with the final model.
testX = np.array(test_tweet_embedd_corp).reshape(-1, 1, 768)
pred = model.predict(testX)
# Negative logit -> class 0, everything else -> class 1.
y_pred = np.where(pred[:, 0] < 0.0, 0, 1).tolist()

# Map each test tweet id to its predicted label name.
output_dict = {}
for i, tweet_id in enumerate(test_tweet_id):
    output_dict[tweet_id] = 'non-rumour' if y_pred[i] == 0 else 'rumour'

# Write the predictions next to the input data.
with open(folder + '/test-output.json', 'w') as f:
    json.dump(output_dict, f)