In [None]:
import jsonlines
import json
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score,precision_score,f1_score,precision_recall_fscore_support
import lightgbm as lgb
import optuna
from helper_function import *
from Tweet_Info_Obj import *

In [None]:
!pip install lightgbm
!pip nstall optuna

Train, development, and test data are extracted from their respective files and preprocessed.  

In [None]:
# Read each split: extract_data returns the tweet groups together with their ids.
train_tweets_corpus, train_tweet_id = extract_data('project-data/train.data.jsonl')
dev_tweets_corpus, dev_tweet_id = extract_data('project-data/dev.data.jsonl')
test_tweets_corpus, test_tweet_id = extract_data('project-data/test.data.jsonl')

# Labels are loaded for the train and dev splits only.
train_data_label = get_labels('project-data/train.label.json', train_tweet_id)
dev_data_label = get_labels('project-data/dev.label.json', dev_tweet_id)

# Run the text preprocessing helper over every split.
preprocess_train_tweet_corpous = preprocees_tweets(train_tweets_corpus)
preprocess_test_tweet_corpous = preprocees_tweets(test_tweets_corpus)
preprocess_dev_tweet_corpous = preprocees_tweets(dev_tweets_corpus)

We use 200-dimensional GloVe embeddings to obtain a single vector representation of each tweet-and-reply group.  

In [None]:
# Length of one word vector (the notebook loads the 200-d GloVe file);
# tweet2vec uses it to size the zero vector for out-of-vocabulary tokens.
vector_dimension=200

In [None]:
# Build a {token: float32 vector} lookup from the pre-trained GloVe text file.
embeddings_dict = {}
with open("glove/glove.twitter.27B.200d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if len(values) != vector_dimension + 1:
            # A bare split() drops a token that is itself a whitespace-like
            # character and shifts every column left. Peel exactly
            # `vector_dimension` numbers off the right-hand side instead.
            values = line.rstrip().rsplit(" ", vector_dimension)
            if len(values) != vector_dimension + 1:
                # Not "token + full vector": skip rather than store a
                # wrong-length vector under a bogus key.
                continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [None]:
def split_tweet(tweet):
    """Tokenise a tweet on whitespace after isolating basic punctuation.

    Each of ``.``, ``,``, ``;`` and ``?`` is padded with spaces so that it
    becomes a token of its own, apostrophes are removed, and the result is
    split on whitespace.
    """
    spaced = tweet
    for mark in (".", ",", ";", "?"):
        spaced = spaced.replace(mark, " " + mark + " ")
    without_apostrophes = spaced.replace("'", "")
    return without_apostrophes.split()

In [None]:
def tweet2vec(tweet,embeddings_dict):
    """Embed one tweet as the sum of its tokens' word vectors.

    Args:
        tweet: tweet text; it is tokenised with ``split_tweet``.
        embeddings_dict: mapping from token to its embedding vector.

    Returns:
        The element-wise sum of the token vectors, where tokens missing from
        ``embeddings_dict`` contribute zeros. A tweet that yields no tokens
        returns an all-zero vector of length ``vector_dimension``.
    """
    oov_vector = np.zeros(vector_dimension)
    token_vectors = [embeddings_dict.get(w, oov_vector) for w in split_tweet(tweet)]
    if not token_vectors:
        # sum() of an empty sequence is the int 0, not a vector; returning
        # that would give this tweet a different shape from every other one.
        return np.zeros(vector_dimension)
    return sum(token_vectors)

def tweet_corpous_to_vector(corpous,embeddings_dict):
    """Vectorise every tweet of every group in a corpus.

    Args:
        corpous: iterable of tweet groups, each an iterable of tweet texts.
        embeddings_dict: token-to-vector mapping handed to ``tweet2vec``.

    Returns:
        A list with one inner list per group, holding the ``tweet2vec``
        result for each tweet in the original order.
    """
    return [
        [tweet2vec(text, embeddings_dict) for text in thread]
        for thread in corpous
    ]

In [None]:
# Turn each preprocessed split into per-tweet vectors (same order as before: train, test, dev).
train_tweet_grp_vectors, test_tweet_grp_vectors, dev_tweet_grp_vectors = (
    tweet_corpous_to_vector(split_corpus, embeddings_dict)
    for split_corpus in (
        preprocess_train_tweet_corpous,
        preprocess_test_tweet_corpous,
        preprocess_dev_tweet_corpous,
    )
)

In [None]:
def group2single(group):
    """Return the mean of the items in ``group`` (their sum divided by the count)."""
    total = sum(group)
    return total / len(group)

def convert2single(dataset):
    """Reduce every group in ``dataset`` to one vector via ``group2single``.

    Returns a list with one averaged entry per group, in input order.
    """
    averaged = []
    for group in dataset:
        averaged.append(group2single(group))
    return averaged

In [None]:
# One averaged feature vector per tweet group, for each split (train, test, dev).
TrainX, TestX, DevX = (
    convert2single(split_vectors)
    for split_vectors in (
        train_tweet_grp_vectors,
        test_tweet_grp_vectors,
        dev_tweet_grp_vectors,
    )
)

In [None]:
# Binary targets: 'non-rumour' -> 0, any other label -> 1.
TrainY = [int(label != 'non-rumour') for label in train_data_label]
DevY = [int(label != 'non-rumour') for label in dev_data_label]

In [None]:
def print_scores(y_true,y_pred):
    """Print F1, precision and recall, in that order, on a single line."""
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    print(f1, precision, recall)

Creating a first classifier with a basic Random Forest model.  

In [None]:
# Fit a random forest on the training features and report its scores on the dev split.
clf = RandomForestClassifier(max_depth=50, random_state=0).fit(TrainX, TrainY)
y_pred = clf.predict(DevX)
print_scores(DevY, y_pred)

Tuning the hyperparameters of a LightGBM model using Optuna.  

In [None]:
def lgb_f1_score(y_hat, data):
    """Compute F1 from predicted scores, returned as a ``(name, value, flag)`` triple.

    Args:
        y_hat: predicted scores; values below 0.5 become class 0, all others class 1.
        data: object exposing ``get_label()``, which supplies the true labels.

    Returns:
        The tuple ``('f1', <F1 score>, True)``.

    NOTE(review): nothing visible in this notebook passes this function to a
    model; only the string "lgb_f1_score" is used as a ``metric`` value.
    Confirm whether it was meant to be wired in as an evaluation function.
    """
    y_true = data.get_label()
    # Threshold the scores into hard 0/1 labels before scoring.
    y_pred = np.where(y_hat < 0.5, 0, 1)
    return 'f1', f1_score(y_true, y_pred), True

def objective(trial):
    """Optuna objective: train LightGBM with sampled settings, return dev-set F1.

    Args:
        trial: the Optuna trial used to sample each hyperparameter.

    Returns:
        F1 score of the fitted classifier's predictions on ``DevX`` against ``DevY``.
    """
    param = {
        "objective": "binary",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # NOTE(review): "lgb_f1_score" is the name of the helper function in this
        # notebook, not a metric that is passed to fit(); confirm the intent.
        "metric":"lgb_f1_score",
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 0.5),
        # LightGBM documents `min_child_samples` as an alias of `min_data_in_leaf`
        # and `n_estimators` as an alias of `num_iterations`. Sampling both names
        # gave the optimiser two conflicting knobs for one setting, so only one
        # name per setting is sampled here.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 500),
        'num_iterations': trial.suggest_int('num_iterations', 400, 800),
        'min_data_per_group':trial.suggest_int('min_data_per_group', 100,500)
    }

    # Train on the training split only; the dev split is held out for scoring.
    model = lgb.LGBMClassifier().set_params(**param).fit(TrainX, TrainY)
    pred = model.predict(DevX)
    return f1_score(DevY, pred)

# Maximise dev-set F1 over 100 trials. The sampler is seeded so that re-running
# the notebook proposes the same hyperparameter sequence; TPE is also the
# sampler create_study falls back to when none is given.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=100)

In [None]:
# Start from the best trial's sampled values, then add the four fixed settings
# that objective() also sets.
param = dict(study.best_trial.params)
param.update({
    'metric': "lgb_f1_score",
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
})

In [None]:
# Refit with the selected parameters on the training split and report dev scores.
model = lgb.LGBMClassifier().set_params(**param).fit(TrainX, TrainY)
pred = model.predict(DevX)
print_scores(DevY, pred)

In [None]:
import joblib

# Persist the classifier currently bound to `model` (at this point, the one
# fitted on TrainX/TrainY in the preceding cell) to disk.
joblib.dump(model, 'lgb_200glove.pkl')

In [None]:
# Train a fresh classifier on train + dev combined (plain list concatenation),
# then predict the test split.
model = lgb.LGBMClassifier().set_params(**param).fit(TrainX + DevX, TrainY + DevY)
y_pred = model.predict(TestX)

In [None]:
# Map each test tweet id to its predicted class name:
# prediction 0 -> 'non-rumour', anything else -> 'rumour'.
output_dict = {
    tweet_id: ('non-rumour' if y_pred[position] == 0 else 'rumour')
    for position, tweet_id in enumerate(test_tweet_id)
}

In [None]:
# Save the test predictions as a JSON object of {tweet id: label}.
with open('test-output.json', 'w') as out_file:
    json.dump(output_dict, out_file)

In [None]:
# Load the dev baseline file (presumably the reference system's dev predictions,
# keyed by tweet id). The context manager closes the handle, which the bare
# open() call never did.
with open('project-data/dev.baseline.json') as baseline_file:
    baseline = json.load(baseline_file)

In [None]:
def convert_label(label):
    """Translate a class name into its integer code.

    Returns 1 for "rumour" and 0 for "non-rumour"; any other value raises
    ``Exception``.
    """
    if label == "non-rumour":
        return 0
    if label == "rumour":
        return 1
    raise Exception("label classes must be 'rumour' or 'non-rumour'")

In [None]:
# Score the dev baseline against the gold dev labels.
#
# The earlier version looked each baseline (dev) tweet id up in `output_dict`,
# which is keyed by *test* tweet ids, and used the baseline's own labels as the
# ground truth. No dev id is expected to be found there, so every prediction
# was replaced by the inverse of the "truth" and the scores meant nothing.
#
# NOTE(review): assumes dev_data_label is ordered like dev_tweet_id (the same
# alignment DevX/DevY already rely on) and that these ids have the same type
# as the keys of the baseline JSON - confirm against helper_function.
dev_gold = dict(zip(dev_tweet_id, dev_data_label))

y_true, baseline_pred = [], []
for tweet_id, gold_label in dev_gold.items():
    truth = convert_label(gold_label)
    y_true.append(truth)
    if tweet_id in baseline:
        baseline_pred.append(convert_label(baseline[tweet_id]))
    else:
        # A tweet the baseline did not label counts as a wrong prediction.
        baseline_pred.append(1 - truth)

# Errors (e.g. an unknown label name) now propagate with their traceback
# instead of being printed and turned into SystemExit.
p, r, f, _ = precision_recall_fscore_support(y_true, baseline_pred, pos_label=1, average="binary")

print("Performance on the rumour class:")
print("Precision =", p)
print("Recall    =", r)
print("F1        =", f)