In [None]:
# Runtime setup: install the third-party packages imported further down
# (sentence_transformers, jsonlines, optuna).
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases; consider pinning (pkg==x.y.z) and using `%pip` so the install
# targets the running kernel's environment.
!pip install sentence_transformers
!pip install jsonlines
!pip install optuna


In [None]:
from sentence_transformers import SentenceTransformer
import re
import numpy as np
import jsonlines
import json
from helper_function import *
from Tweet_Info_Obj import *

In [None]:
# Sentence encoder bound to the global `model`, which
# convert_sentence_to_embedding reads to embed tweets.
# Alternative checkpoint (disabled): 'paraphrase-distilroberta-base-v1'.
# NOTE: `model` is rebound to a LightGBM classifier further down the notebook,
# so this cell has to be re-run before embedding anything again.
model = SentenceTransformer('stsb-mpnet-base-v2')

In [None]:
from google.colab import drive

# Mount Google Drive into the runtime's filesystem (works only inside Colab).
drive.mount('/content/gdrive')

# Project directory on the mounted drive; later cells read the data files from
# it and write the saved model and the test predictions back into it.
folder='/content/gdrive/My Drive/project-data'

In [None]:
# Read the three data splits. extract_data comes from the star-imported helper
# modules; its three return values are used below as (tweet texts, ids, per-tweet info).
train_tweets_corpus, train_tweet_id, train_tweet_info = extract_data(
    folder + '/train.data.jsonl'
)
dev_tweets_corpus, dev_tweet_id, dev_tweet_info = extract_data(
    folder + '/dev.data.jsonl'
)
test_tweets_corpus, test_tweet_id, test_tweet_info = extract_data(
    folder + '/test.data.jsonl'
)

# Gold labels are only loaded for the train and dev splits.
train_data_label = get_labels(folder + '/train.label.json', train_tweet_id)
dev_data_label = get_labels(folder + '/dev.label.json', dev_tweet_id)

# Text clean-up before embedding (helper defined outside this notebook).
preprocess_train_tweet_corpous = preprocees_tweets(train_tweets_corpus)
preprocess_test_tweet_corpous = preprocees_tweets(test_tweets_corpus)
preprocess_dev_tweet_corpous = preprocees_tweets(dev_tweets_corpus)

In [None]:
def convert_sentence_to_embedding(corpus,tweet_info):
  """Embed each corpus entry as a retweet-weighted average of its tweets' embeddings.

  Every entry ``corpus[i]`` is encoded as one batch with the global sentence
  encoder ``model``. Tweet ``j`` of entry ``i`` gets the weight
  ``(retweet_count + 1) / (sum of retweet counts in the entry + number of tweets)``,
  so tweets with zero retweets still contribute.

  Args:
    corpus: sequence of entries, each a list of tweet strings.
    tweet_info: sequence parallel to ``corpus``; ``tweet_info[i][j]`` must
      expose a numeric ``retweet_count`` for ``corpus[i][j]``.

  Returns:
    A plain Python list holding one numpy vector per corpus entry. It stays a
    list on purpose: later cells concatenate two of these results with ``+``.

  Note:
    An entry with no tweets is not handled; presumably the encoder returns an
    empty result and indexing ``emd[0]`` fails — TODO confirm / guard upstream.
  """
  embedd_corp=[]
  pool = model.start_multi_process_pool()
  try:
    for i, tweets in enumerate(corpus):
      infos = tweet_info[i]
      emd = model.encode_multi_process(tweets, pool)
      vec = np.zeros(emd[0].shape)
      # The normaliser is the same for every tweet of the entry: compute it once.
      denom = sum(t.retweet_count for t in infos) + len(tweets)
      for j in range(len(tweets)):
        vec += emd[j] * (infos[j].retweet_count + 1) / denom
      embedd_corp.append(vec)
  finally:
    # Always shut the worker processes down; previously every call left its
    # pool running, also when encoding failed part-way through.
    model.stop_multi_process_pool(pool)
  return embedd_corp

# Embed every split: one vector per corpus entry, weighted by retweet counts.
train_tweet_embedd_corp = convert_sentence_to_embedding(
    corpus=preprocess_train_tweet_corpous,
    tweet_info=train_tweet_info,
)
dev_tweet_embedd_corp = convert_sentence_to_embedding(
    corpus=preprocess_dev_tweet_corpous,
    tweet_info=dev_tweet_info,
)
test_tweet_embedd_corp = convert_sentence_to_embedding(
    corpus=preprocess_test_tweet_corpous,
    tweet_info=test_tweet_info,
)

In [None]:
# Binary targets: 0 for the 'non-rumour' label, 1 for every other label value.
TrainY = [int(label != 'non-rumour') for label in train_data_label]
DevY = [int(label != 'non-rumour') for label in dev_data_label]

In [None]:
from sklearn.metrics import recall_score,precision_score,f1_score,precision_recall_fscore_support
import lightgbm as lgb
import optuna

In [None]:
# Drop the preprocessed test text; its embeddings were already computed into
# test_tweet_embedd_corp. Re-running this cell on its own raises NameError
# because the name no longer exists.
del preprocess_test_tweet_corpous

In [None]:
def lgb_f1_score(y_hat, data):
    """Evaluation callback returning the F1 score of thresholded predictions.

    Args:
        y_hat: array of raw scores; values below 0.5 map to class 0, all
            others to class 1.
        data: object whose ``get_label()`` returns the true labels.

    Returns:
        Tuple ``('f1', score, True)``; the trailing ``True`` presumably marks
        the metric as higher-is-better for the caller — confirm against the
        LightGBM custom-metric contract.
    """
    true_labels = data.get_label()
    below_cutoff = y_hat < 0.5
    predicted_labels = np.where(below_cutoff, 0, 1)
    return 'f1', f1_score(true_labels, predicted_labels), True

def objective(trial):
    """Optuna objective: fit LightGBM on the train embeddings, score F1 on dev.

    Samples one hyperparameter configuration from ``trial``, trains an
    ``LGBMClassifier`` on the global ``train_tweet_embedd_corp`` / ``TrainY``
    and returns the F1 score of its predictions on the global
    ``dev_tweet_embedd_corp`` / ``DevY``.

    Each LightGBM setting is tuned under one name only. LightGBM documents
    ``min_child_samples`` as an alias of ``min_data_in_leaf`` and
    ``n_estimators`` as an alias of ``num_iterations``; the duplicate alias
    entries (the latter with a fixed 160000..160000 range) were removed so the
    search space has no redundant dimensions.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        The dev-set F1 score (to be maximised by the study).
    """
    param = {
        "objective": "binary",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # NOTE(review): this is a plain string; nothing here links it to the
        # lgb_f1_score function and fit() receives no eval_set — confirm
        # whether a metric entry is needed at all.
        "metric":"lgb_f1_score",
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 0.5),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 500),
        'num_iterations': trial.suggest_int('num_iterations', 400, 800),
        'min_data_per_group':trial.suggest_int('min_data_per_group', 100,500),
        # NOTE(review): no "device" entry accompanies this id — confirm that
        # GPU training is actually intended and enabled.
        'gpu_device_id':0
    }

    # Local name chosen so it does not shadow the notebook-level `model`
    # (the sentence encoder).
    clf = lgb.LGBMClassifier(**param)
    clf.fit(train_tweet_embedd_corp, TrainY)
    dev_pred = clf.predict(dev_tweet_embedd_corp)

    return f1_score(DevY, dev_pred)

# Hyperparameter search: 100 trials maximising the dev-set F1 returned by objective().
# The sampler is seeded so the sequence of suggested configurations is repeatable
# across runs; TPESampler is the sampler Optuna uses by default for a
# single-objective study, so only the seeding is new.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Final configuration: the best trial's sampled values plus the fixed settings
# that objective() hard-codes (and that are therefore not stored as trial params).
param = {
    **study.best_trial.params,
    'metric': "lgb_f1_score",
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "gpu_device_id": 0,
}
param

In [None]:
def print_scores(y_true,y_pred):
    """Print F1, precision and recall, in that order, on one space-separated line.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
    """
    metrics = (f1_score, precision_score, recall_score)
    values = [metric(y_true, y_pred) for metric in metrics]
    print(*values)

In [None]:
# Refit LightGBM on the training split with the selected configuration and
# print F1 / precision / recall on the dev split.
# NOTE: from here on the global `model` is the classifier, no longer the
# sentence encoder created near the top of the notebook.
model = lgb.LGBMClassifier().set_params(**param).fit(train_tweet_embedd_corp, TrainY)
pred = model.predict(dev_tweet_embedd_corp)
print_scores(DevY, pred)

In [None]:
import joblib

# Persist whatever classifier is currently bound to `model` into the project folder.
model_path = folder + '/st_adv_retweetcnt.pkl'
joblib.dump(model, model_path)

In [None]:
# Final model: refit on train + dev together, then label the test split.
# Both operands of each `+` are Python lists, so this is concatenation.
full_features = train_tweet_embedd_corp + dev_tweet_embedd_corp
full_targets = TrainY + DevY
model = lgb.LGBMClassifier().set_params(**param).fit(full_features, full_targets)
y_pred = model.predict(test_tweet_embedd_corp)

In [None]:
# Map each test id to its label string: class 0 -> 'non-rumour', anything else -> 'rumour'.
output_dict = {
    tweet_id: ('non-rumour' if y_pred[idx] == 0 else 'rumour')
    for idx, tweet_id in enumerate(test_tweet_id)
}

In [None]:
# Write the {tweet id: label} predictions as JSON into the project folder.
output_path = folder + '/test-output.json'
with open(output_path, 'w') as out_file:
    json.dump(output_dict, out_file)