In [None]:
import numpy as np
import lightgbm as lgb
import optuna
from helper_function import *
from Tweet_Info_Obj import *
from sklearn.metrics import recall_score,precision_score,f1_score,precision_recall_fscore_support

In [None]:
# Read each split; extract_data yields three values per file, bound here as
# the tweet corpus, the tweet ids and the per-tweet info objects.
train_tweets_corpus, train_tweet_id, train_tweet_info = extract_data(
    'project-data/train.data.jsonl'
)
dev_tweets_corpus, dev_tweet_id, dev_tweet_info = extract_data(
    'project-data/dev.data.jsonl'
)
test_tweets_corpus, test_tweet_id, test_tweet_info = extract_data(
    'project-data/test.data.jsonl'
)

# Labels are only loaded for the train and dev splits.
train_data_label = get_labels('project-data/train.label.json', train_tweet_id)
dev_data_label = get_labels('project-data/dev.label.json', dev_tweet_id)

# Run the shared text preprocessing over every split.
preprocess_train_tweet_corpous = preprocees_tweets(train_tweets_corpus)
preprocess_test_tweet_corpous = preprocees_tweets(test_tweets_corpus)
preprocess_dev_tweet_corpous = preprocees_tweets(dev_tweets_corpus)

In [None]:
# Length of one word-embedding vector; it has to agree with the GloVe file
# that is loaded into `embeddings_dict`.
vector_dimension = 200

In [None]:
# Load the pre-trained GloVe Twitter vectors into a {token: float32 vector} dict.
# The file name is built from `vector_dimension` so the constant and the vectors
# that are actually loaded cannot drift apart.
embeddings_dict = {}
glove_path = "glove/glove.twitter.27B.{}d.txt".format(vector_dimension)
with open(glove_path, 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # A well-formed row is one token followed by `vector_dimension` numbers.
        # Skip blank or malformed rows (e.g. a token that is itself whitespace)
        # instead of crashing or storing a vector of the wrong length.
        if len(values) != vector_dimension + 1:
            continue
        embeddings_dict[values[0]] = np.asarray(values[1:], "float32")

In [None]:
def split_tweet(tweet):
    """Tokenise a tweet on whitespace.

    The characters ``. , ; ?`` are padded with spaces so they come out as
    separate tokens, and apostrophes are removed before splitting.
    """
    table = str.maketrans({
        ".": " . ",
        ",": " , ",
        ";": " ; ",
        "?": " ? ",
        "'": "",
    })
    return tweet.translate(table).split()

def tweet2vec(tweet,embeddings_dict):
    """Embed a tweet as the sum of its token vectors.

    Args:
        tweet: text of one tweet; tokenised with ``split_tweet``.
        embeddings_dict: mapping from token to its embedding vector.

    Returns:
        A vector of length ``vector_dimension``. Tokens missing from
        ``embeddings_dict`` contribute zeros. A tweet with no tokens yields an
        all-zero vector (previously ``sum()`` over nothing returned the int
        ``0``, which has no ``.shape`` for downstream code to read).
    """
    tokens = split_tweet(tweet)
    # Built once and shared: sum() combines with `+`, which never mutates it.
    unknown = np.zeros(vector_dimension)
    if not tokens:
        return unknown
    return sum(embeddings_dict.get(w, unknown) for w in tokens)

def tweet_corpous_to_vector(corpous,embeddings_dict):
    """Embed every tweet of every group in the corpus.

    Returns a list with one entry per tweet group, each entry being the list
    of ``tweet2vec`` results for the tweets in that group (order preserved).
    """
    return [
        [tweet2vec(tweet, embeddings_dict) for tweet in tweet_group]
        for tweet_group in corpous
    ]

In [None]:
# Turn each preprocessed split into per-group lists of tweet vectors
# (processed in the order train, test, dev).
train_tweet_grp_vectors, test_tweet_grp_vectors, dev_tweet_grp_vectors = (
    tweet_corpous_to_vector(split_corpus, embeddings_dict)
    for split_corpus in (
        preprocess_train_tweet_corpous,
        preprocess_test_tweet_corpous,
        preprocess_dev_tweet_corpous,
    )
)

In [None]:
def group2single(group,tweet_info):
    """Collapse one group of tweet vectors into a single weighted vector.

    Tweet ``i`` is weighted by ``(retweet_count_i + 1)`` divided by the sum of
    all retweet counts in ``tweet_info`` plus ``len(group)``; the ``+1`` keeps
    tweets with zero retweets from being dropped entirely.

    Args:
        group: sequence of tweet vectors (``group[0].shape`` sets the output shape).
        tweet_info: sequence of objects exposing ``retweet_count``, aligned with ``group``.
    """
    pooled = np.zeros(group[0].shape)
    total_retweets = sum([info.retweet_count for info in tweet_info])
    for idx, vec in enumerate(group):
        weight = (tweet_info[idx].retweet_count + 1) / (total_retweets + len(group))
        pooled += vec * weight
    return pooled

def convert2single(dataset,tweet_info_corp):
    """Apply ``group2single`` to every group, pairing it with its tweet info.

    ``dataset[i]`` is combined with ``tweet_info_corp[i]``; the result is a
    list with one pooled vector per group.
    """
    pooled_vectors = []
    for idx, group in enumerate(dataset):
        pooled_vectors.append(group2single(group, tweet_info_corp[idx]))
    return pooled_vectors

In [None]:
# One pooled vector per tweet group, for each split (train, test, dev order).
TrainX, TestX, DevX = (
    convert2single(group_vectors, group_info)
    for group_vectors, group_info in (
        (train_tweet_grp_vectors, train_tweet_info),
        (test_tweet_grp_vectors, test_tweet_info),
        (dev_tweet_grp_vectors, dev_tweet_info),
    )
)

In [None]:
# Binary targets: 'non-rumour' -> 0, every other label -> 1.
TrainY = [
    0 if label == 'non-rumour' else 1
    for label in train_data_label
]
DevY = [
    0 if label == 'non-rumour' else 1
    for label in dev_data_label
]

In [None]:
def print_scores(y_true,y_pred):
    """Print F1, precision and recall (in that order) on a single line."""
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f1, precision, recall)

In [None]:
def lgb_f1_score(y_hat, data):
    """F1 metric in the ``(name, value, flag)`` tuple form.

    Scores in ``y_hat`` below 0.5 become class 0, everything else class 1,
    and the result is compared against ``data.get_label()``.

    NOTE(review): the tuple layout looks like LightGBM's custom-eval
    convention (third item = higher is better), but nothing visible in this
    notebook passes this function to LightGBM - confirm it is still needed.
    """
    labels = data.get_label()
    hard_preds = np.where(y_hat < 0.5, 0, 1)
    return 'f1', f1_score(labels, hard_preds), True

def objective(trial):
    """Optuna objective: fit a LightGBM classifier and return its dev-set F1.

    The model is trained on the notebook-level ``TrainX``/``TrainY`` and scored
    on ``DevX``/``DevY``.

    ``min_child_samples`` and ``n_estimators`` are no longer suggested:
    LightGBM documents them as aliases of ``min_data_in_leaf`` and
    ``num_iterations``, which are already tuned below. Suggesting both names
    for one setting left the effective value to alias resolution and stored a
    misleading constant (``n_estimators=160000``) in every trial.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        F1 score of the fitted model's predictions on the dev split.
    """
    param = {
        "objective": "binary",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # NOTE(review): "lgb_f1_score" is not a built-in LightGBM metric name
        # and fit() receives no eval_set, so this entry presumably has no
        # effect on training - confirm, then drop it or pass the callable
        # through `eval_metric` together with an eval_set.
        "metric":"lgb_f1_score",
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 0.5),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 500),
        'num_iterations': trial.suggest_int('num_iterations', 400, 800),
        # NOTE(review): LightGBM documents this as a categorical-feature
        # setting; it may be inert for these dense inputs - confirm.
        'min_data_per_group':trial.suggest_int('min_data_per_group', 100,500)
    }

    model = lgb.LGBMClassifier().set_params(**param)
    model.fit(TrainX, TrainY)
    return f1_score(DevY, model.predict(DevX))

# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyper-parameters repeatable across "Restart & Run All".
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Start from the best trial's sampled values and add back the fixed settings
# that `objective` uses but Optuna does not record.
param = study.best_trial.params
param.update({
    'metric': "lgb_f1_score",
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
})

In [None]:
# Refit with the selected hyper-parameters and report F1 / precision / recall
# on the dev split.
model = lgb.LGBMClassifier().set_params(**param).fit(TrainX, TrainY)
pred = model.predict(DevX)
print_scores(DevY, pred)

In [None]:
import joblib

# Persist the classifier trained on retweet-count-weighted features.
joblib.dump(
    value=model,
    filename='lgb_200glove_retweetcountWeight.pkl',
)

In [None]:
def group2single(group,tweet_info):
    """Collapse one group of tweet vectors, weighting by author follower count.

    This definition replaces the earlier retweet-count version for the cells
    that follow. Tweet ``i`` is weighted by ``(user_follower_count_i + 1)``
    divided by the sum of all follower counts in ``tweet_info`` plus
    ``len(group)``.

    Args:
        group: sequence of tweet vectors (``group[0].shape`` sets the output shape).
        tweet_info: sequence of objects exposing ``user_follower_count``, aligned with ``group``.
    """
    pooled = np.zeros(group[0].shape)
    total_followers = sum([info.user_follower_count for info in tweet_info])
    for idx, vec in enumerate(group):
        weight = (tweet_info[idx].user_follower_count + 1) / (total_followers + len(group))
        pooled += vec * weight
    return pooled

def convert2single(dataset,tweet_info_corp):
    """Apply ``group2single`` to every group, pairing it with its tweet info.

    ``dataset[i]`` is combined with ``tweet_info_corp[i]``; the result is a
    list with one pooled vector per group.
    """
    pooled_vectors = []
    for idx, group in enumerate(dataset):
        pooled_vectors.append(group2single(group, tweet_info_corp[idx]))
    return pooled_vectors

In [None]:
# Rebuild the pooled vectors with the follower-count weighting defined above
# (train, test, dev order); this overwrites the earlier TrainX/TestX/DevX.
TrainX, TestX, DevX = (
    convert2single(group_vectors, group_info)
    for group_vectors, group_info in (
        (train_tweet_grp_vectors, train_tweet_info),
        (test_tweet_grp_vectors, test_tweet_info),
        (dev_tweet_grp_vectors, dev_tweet_info),
    )
)

In [None]:
def lgb_f1_score(y_hat, data):
    """F1 metric in the ``(name, value, flag)`` tuple form.

    Scores in ``y_hat`` below 0.5 become class 0, everything else class 1,
    and the result is compared against ``data.get_label()``.

    NOTE(review): the tuple layout looks like LightGBM's custom-eval
    convention (third item = higher is better), but nothing visible in this
    notebook passes this function to LightGBM - confirm it is still needed.
    """
    labels = data.get_label()
    hard_preds = np.where(y_hat < 0.5, 0, 1)
    return 'f1', f1_score(labels, hard_preds), True

def objective(trial):
    """Optuna objective: fit a LightGBM classifier and return its dev-set F1.

    The model is trained on the notebook-level ``TrainX``/``TrainY`` and scored
    on ``DevX``/``DevY``.

    ``min_child_samples`` and ``n_estimators`` are no longer suggested:
    LightGBM documents them as aliases of ``min_data_in_leaf`` and
    ``num_iterations``, which are already tuned below. Suggesting both names
    for one setting left the effective value to alias resolution and stored a
    misleading constant (``n_estimators=160000``) in every trial.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        F1 score of the fitted model's predictions on the dev split.
    """
    param = {
        "objective": "binary",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # NOTE(review): "lgb_f1_score" is not a built-in LightGBM metric name
        # and fit() receives no eval_set, so this entry presumably has no
        # effect on training - confirm, then drop it or pass the callable
        # through `eval_metric` together with an eval_set.
        "metric":"lgb_f1_score",
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 0.5),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 500),
        'num_iterations': trial.suggest_int('num_iterations', 400, 800),
        # NOTE(review): LightGBM documents this as a categorical-feature
        # setting; it may be inert for these dense inputs - confirm.
        'min_data_per_group':trial.suggest_int('min_data_per_group', 100,500)
    }

    model = lgb.LGBMClassifier().set_params(**param)
    model.fit(TrainX, TrainY)
    return f1_score(DevY, model.predict(DevX))

# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyper-parameters repeatable across "Restart & Run All".
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Start from the best trial's sampled values and add back the fixed settings
# that `objective` uses but Optuna does not record.
param = study.best_trial.params
param.update({
    'metric': "lgb_f1_score",
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
})

In [None]:
# Refit with the selected hyper-parameters and report F1 / precision / recall
# on the dev split.
model = lgb.LGBMClassifier().set_params(**param).fit(TrainX, TrainY)
pred = model.predict(DevX)
print_scores(DevY, pred)

In [None]:
# Persist the classifier trained on follower-count-weighted features.
# Relies on `joblib` having been imported by an earlier cell.
joblib.dump(
    value=model,
    filename='lgb_200glove_usrfollcnt.pkl',
)

In [None]:
from collections import Counter
# Class balance of the train and dev targets, one Counter per line.
print(Counter(TrainY), Counter(DevY), sep='\n')

In [None]:
# Number of labelled examples in train and dev, one count per line.
print(len(TrainY), len(DevY), sep='\n')

In [None]:
# Number of pooled feature vectors built for the test split.
print(len(TestX))