In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset files read
# below (paths under drive/MyDrive) are available to this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt

import nltk
import numpy as np

def _read_jsonl(path):
    """Parse a JSON-Lines file into a list holding one decoded object per line."""
    with open(path) as file:
        return [json.loads(line) for line in file]


def _read_json(path):
    """Parse a whole JSON file; the handle is closed as soon as parsing finishes."""
    with open(path) as file:
        return json.load(file)


# Each *.data.jsonl line is one record: a list of tweets whose first element is
# treated as the source tweet by the rest of the notebook. The *.label.json
# files map the source tweet's id_str to its label string.
train_data = _read_jsonl("drive/MyDrive/train.data.jsonl")
train_label = _read_json("drive/MyDrive/train.label.json")

dev_data = _read_jsonl("drive/MyDrive/dev.data.jsonl")
dev_label = _read_json("drive/MyDrive/dev.label.json")

# The test split has no label file.
test_data = _read_jsonl("drive/MyDrive/test.data.jsonl")

# # 
# def preprocess_texts(text):
#     text = text.lower()
#     tt = TweetTokenizer()
#     words = tt.tokenize(text)
#     new_text = ""
#     for word in words:
#         if word[0] == '@':
#             continue
#         else:
#             new_text += word
#             new_text += " "
#     return new_text[0:-1]

import re

# regular expression matching a single ASCII letter (A-Z or a-z)
my_re = re.compile(r'[A-Za-z]')

# TODO: 
def preprocess_texts(text):
    """Normalise a tweet into a space-separated string of plain words.

    The text is lower-cased and split with NLTK's TweetTokenizer. A token is
    kept only if every character is an ASCII letter (per ``my_re``) and the
    token does not start with "http". Mentions, hashtags, URLs, numbers,
    emoji and punctuation all contain a non-letter character, so the
    letters-only rule already removes them; no separate '@' check is needed.

    :param text: raw tweet text
    :return: the kept tokens joined by single spaces ("" if nothing is kept)
    """
    tokens = TweetTokenizer().tokenize(text.lower())
    kept = [
        token for token in tokens
        if all(my_re.match(char) for char in token)
        and not token.startswith("http")
    ]
    return " ".join(kept)


# 
def _texts_and_rows(data):
    """Preprocess every tweet in ``data``; return (texts, tweet id_str -> index into texts)."""
    texts = []
    tweet_to_row = dict()
    for record in data:
        for tweet in record:
            texts.append(preprocess_texts(tweet['text']))
            # A repeated id_str keeps the row of its last occurrence.
            tweet_to_row[tweet['id_str']] = len(texts) - 1
    return texts, tweet_to_row


def build_input_text():
    """Flatten the train/dev/test records into per-split lists of cleaned tweet texts.

    Every tweet of every record (source and replies alike) becomes one entry,
    in file order. Alongside each list a dict maps the tweet's ``id_str`` to
    its position in that list, which is later used as the row index into the
    split's TF-IDF matrix.

    :return: (train_texts, dev_texts, test_texts,
              train_tweet_to_row, dev_tweet_to_row, test_tweet_to_row)
    """
    train_texts, train_tweet_to_row = _texts_and_rows(train_data)
    dev_texts, dev_tweet_to_row = _texts_and_rows(dev_data)
    test_texts, test_tweet_to_row = _texts_and_rows(test_data)
    return train_texts, dev_texts, test_texts, train_tweet_to_row, dev_tweet_to_row, test_tweet_to_row


# Cleaned texts per split plus the tweet-id -> row lookups used to index the TF-IDF matrices.
train_texts, dev_texts, test_texts, train_tweet_to_row, dev_tweet_to_row, test_tweet_to_row = build_input_text()

In [None]:
# def merge_hashtags(record, data_type):
#   tweet_to_row = None
#   texts = None
#   if data_type == "train":
#     tweet_to_row = train_tweet_to_row
#     texts = train_texts
#   elif data_type == "dev":
#     tweet_to_row = dev_tweet_to_row
#     texts = dev_texts
#   else:
#     tweet_to_row = test_tweet_to_row
#     texts = test_texts

#   for tweet in record:
#     if len(tweet['entities']['hashtags']) > 0:
#       texts[tweet_to_row[tweet['id_str']]] += " "
#       texts[tweet_to_row[tweet['id_str']]] += tweet['entities']['hashtags'][0]['text'].lower()


# for record in train_data:
#   merge_hashtags(record, "train")
# for record in dev_data:
#   merge_hashtags(record, "dev")
# for record in test_data:
#   merge_hashtags(record, "test")

In [None]:
# 
def build_tf_idf_text(max_features=6000):
    """Fit a TF-IDF vectorizer on the training texts and transform all three splits.

    The vocabulary is learned from ``train_texts`` only; dev and test are
    projected onto it, so all three matrices share the same column count.

    :param max_features: vocabulary size cap. It defaults to 6000 because the
        rest of the notebook (the zero reply vector in ``get_features`` and
        the 6037-wide ``TreeNN`` input) and the recorded cell outputs all
        assume 6000 TF-IDF columns; the previous value of 5000 did not match.
    :return: (train, dev, test) sparse TF-IDF matrices, one row per tweet in
        the order of the corresponding ``*_texts`` list
    """
    vectorizer = TfidfVectorizer(tokenizer=TweetTokenizer().tokenize,
                                 max_df=0.8, min_df=0.00004, max_features=max_features,
                                 stop_words='english')
    train_tf_tdf = vectorizer.fit_transform(train_texts)
    dev_tf_tdf = vectorizer.transform(dev_texts)
    test_tf_idf = vectorizer.transform(test_texts)
    return train_tf_tdf, dev_tf_tdf, test_tf_idf


# Sparse TF-IDF matrices for the three splits (note the mixed "tdf"/"idf" spelling in the names).
train_tf_tdf, dev_tf_tdf, test_tf_idf = build_tf_idf_text()

# Sanity check: a single train row and a single dev row must have the same width,
# since both come from the vectorizer fitted on the training texts.
print(train_tf_tdf[0].shape)
print(dev_tf_tdf[0].shape)

(1, 6000)
(1, 6000)


In [None]:
# plot the distribution of given values, based on train set
def show_value_distribution(param):
    """Draw a histogram of one user attribute taken from every training source tweet.

    :param param: key inside ``record[0]['user']`` (e.g. a count field)
    """
    plt.hist([record[0]['user'][param] for record in train_data])
    plt.show()


# 
def cut_within_scope(input, scope):
    """One-hot encode ``input`` into the buckets delimited by ``scope``.

    ``scope`` lists bucket boundaries in ascending order, giving
    ``len(scope) - 1`` buckets. The first bucket whose upper boundary is
    strictly greater than ``input`` is set to 1. Values below ``scope[0]``
    therefore land in the first bucket, and values at or above the last
    boundary produce an all-zero vector.

    :param input: the number to bucket
    :param scope: sequence of boundaries
    :return: float numpy array of length ``len(scope) - 1``
    """
    bucket_count = len(scope) - 1
    result = np.zeros(bucket_count)
    for bucket in range(bucket_count):
        if input < scope[bucket + 1]:
            result[bucket] = 1
            break
    return result


In [None]:
# 
def get_features(record, data_set_type):
    """Build the flat feature groups of one record for the Naive Bayes baseline.

    :param record: list of tweets; ``record[0]`` is the source tweet and the
        remaining entries are its replies
    :param data_set_type: ``'train'`` selects the train TF-IDF matrix and id
        lookup; any other value selects the dev ones
    :return: tuple of 8 groups, in order: source TF-IDF vector, mean TF-IDF
        vector of the replies, verified flag, and the bucketed friends,
        statuses, followers, listed and favourites counts of the source user
    """
    if data_set_type == 'train':
        tf_idf, tweet_to_row = train_tf_tdf, train_tweet_to_row
    else:
        tf_idf, tweet_to_row = dev_tf_tdf, dev_tweet_to_row

    source = record[0]
    # toarray() on a single sparse row yields shape (1, vocab); squeeze to 1-D.
    source_text_vector = np.squeeze(tf_idf[tweet_to_row[source['id_str']]].toarray())

    reply_rows = [tf_idf[tweet_to_row[reply['id_str']]].toarray() for reply in record[1:]]
    if reply_rows:
        reply_text_vectors = np.squeeze(np.stack(reply_rows).mean(axis=0))
    else:
        # No replies: use a zero vector as wide as the vocabulary. Taking the
        # width from the matrix (rather than a literal 6000) keeps every
        # record's feature vector the same length for any vocabulary size.
        reply_text_vectors = np.zeros(tf_idf.shape[1])

    user = source['user']
    # verified
    verified_vec = [int(user['verified'])]
    # friends
    friends_vec = cut_within_scope(user['friends_count'],
                                   [0, 100, 1000, 5000, 10000, 20000, 50000])
    # statuses
    statuses_vec = cut_within_scope(user['statuses_count'],
                                    [0, 100, 1000, 5000, 10000, 50000, 100000, 200000])
    # followers_count
    followers_vec = cut_within_scope(user['followers_count'],
                                     [0, 1000, 10000, 50000, 100000, 200000, 500000, 1000000, 1500000])
    # listed_count
    listed_vec = cut_within_scope(user['listed_count'],
                                  [0, 10000, 20000, 40000, 100000, 120000, 160000])
    # favourites_count
    favourites_vec = cut_within_scope(user['favourites_count'],
                                      [0, 10000, 20000, 40000, 50000])

    return source_text_vector, reply_text_vectors, verified_vec, friends_vec, \
           statuses_vec, followers_vec, listed_vec, favourites_vec


In [None]:
# Smoke test on the first training record: get_features returns a tuple of
# feature groups, so the printed length is the number of groups, not the
# total feature width.
feature_vector = get_features(train_data[0], data_set_type="train")
print(len(feature_vector))

8


In [None]:
# 
def get_nb_input_matrix(data_set_type):
    """Assemble the dense design matrix and label list for the baseline classifier.

    :param data_set_type: ``"train"`` for the training split, anything else for dev
    :return: (2-D array with one concatenated feature row per record,
              list of label strings looked up by the source tweet's id_str)
    """
    if data_set_type == "train":
        records, label_lookup = train_data, train_label
    else:
        records, label_lookup = dev_data, dev_label

    rows = []
    labels = []
    for record in records:
        # get_features returns the groups in exactly the order they are concatenated.
        rows.append(np.concatenate(get_features(record, data_set_type)))
        labels.append(label_lookup[record[0]['id_str']])

    return np.stack(rows), labels


# Dense feature matrices and label lists for the baseline; despite the name,
# the *_label_matrix values are plain Python lists of label strings.
train_data_matrix, train_label_matrix = get_nb_input_matrix("train")
dev_data_matrix, dev_label_matrix = get_nb_input_matrix("dev")

In [None]:
def build_tree_vectors(record, data_set_type):
    """Build one feature row per tweet of a record for the tree model.

    Each tweet's row is its TF-IDF vector followed by user features: the
    verified flag, five bucketed counts (friends, statuses, followers,
    listed, favourites) and five boolean profile flags.

    :param record: list of tweets belonging to one conversation
    :param data_set_type: ``"train"``, ``"dev"``, or anything else for test
    :return: dict mapping the tweet's TF-IDF row index (from the split's
        ``*_tweet_to_row`` lookup) to an array of shape (1, feature_count)
    """
    if data_set_type == "train":
        tf_idf, tweet_to_row = train_tf_tdf, train_tweet_to_row
    elif data_set_type == "dev":
        tf_idf, tweet_to_row = dev_tf_tdf, dev_tweet_to_row
    else:
        tf_idf, tweet_to_row = test_tf_idf, test_tweet_to_row

    # (user field, bucket boundaries) in the order they appear in the feature row
    count_buckets = (
        ('friends_count', [0, 100, 1000, 5000, 10000, 20000, 50000]),
        ('statuses_count', [0, 100, 1000, 5000, 10000, 50000, 100000, 200000]),
        ('followers_count', [0, 1000, 10000, 50000, 100000, 200000, 500000, 1000000, 1500000]),
        ('listed_count', [0, 10000, 20000, 40000, 100000, 120000, 160000]),
        ('favourites_count', [0, 10000, 20000, 40000, 50000]),
    )
    # boolean user fields appended after the bucketed counts, in this order
    flag_fields = ('protected', 'default_profile_image', 'profile_use_background_image',
                   'geo_enabled', 'default_profile')

    tree_vectors = dict()
    for tweet in record:
        row = tweet_to_row[tweet["id_str"]]
        user = tweet['user']

        text_part = np.squeeze(tf_idf[row].toarray())
        verified_part = [int(user['verified'])]
        bucket_parts = [cut_within_scope(user[field], bounds) for field, bounds in count_buckets]
        flag_parts = [[int(user[field])] for field in flag_fields]

        feature_vec = np.concatenate([text_part, verified_part, *bucket_parts, *flag_parts])
        # stored as a single-row 2-D array, keyed by the TF-IDF row index
        tree_vectors[row] = feature_vec.reshape(1, len(feature_vec))

    return tree_vectors

# Sanity check of the per-tweet feature width. The dict is keyed by TF-IDF row,
# and row 0 is the one build_input_text assigned to the first tweet of the first
# training record, so key 0 only exists while train_data is in its loaded order.
print(build_tree_vectors(train_data[0], data_set_type="train")[0].shape)

(1, 6037)


In [None]:
### 

In [None]:
def traditional_model():
    """Fit a Complement Naive Bayes baseline and print its classification reports.

    Trains on ``train_data_matrix`` / ``train_label_matrix`` and prints the
    report for the dev split first, then for the training split.

    :return: None
    """
    from sklearn.metrics import classification_report
    from sklearn.naive_bayes import ComplementNB

    classifier = ComplementNB()
    classifier.fit(train_data_matrix, np.array(train_label_matrix))

    evaluation_sets = (
        (dev_label_matrix, dev_data_matrix),
        (train_label_matrix, train_data_matrix),
    )
    for gold_labels, features in evaluation_sets:
        print(classification_report(gold_labels, classifier.predict(features)))
    
# Run the Naive Bayes baseline; it prints the dev report followed by the train report.
traditional_model()


              precision    recall  f1-score   support

  non-rumour       0.85      0.91      0.88       393
      rumour       0.78      0.66      0.72       187

    accuracy                           0.83       580
   macro avg       0.82      0.79      0.80       580
weighted avg       0.83      0.83      0.83       580

              precision    recall  f1-score   support

  non-rumour       0.87      0.91      0.89      3058
      rumour       0.81      0.75      0.78      1583

    accuracy                           0.85      4641
   macro avg       0.84      0.83      0.83      4641
weighted avg       0.85      0.85      0.85      4641



In [None]:
"""
pytorch 
"""
import torch
from torch import nn

print(torch.__version__)


def topological_order(parent_to_children):
    """Return the reply tree as bottom-up processing groups.

    The tree is walked breadth-first from node 0. Every node that has
    children contributes one group ``[child, ..., parent]`` (parent last).
    The groups are returned in reverse visiting order, so a node's own group
    always comes before the group of its parent.

    Example::

        {0: [1, 8], 1: [2], 8: [9, 10], 2: [12]}
        -> [[12, 2], [9, 10, 8], [2, 1], [1, 8, 0]]

    :param parent_to_children: dict mapping a node index to an iterable of
        its child indexes; nodes not reachable from 0 are ignored
    :return: list of groups; ``[[0, 0]]`` when node 0 has no children, so
        that callers still get one group to process for a lone source tweet
    """
    from collections import deque

    result = []
    queue = deque([0])  # breadth-first frontier, starting at the source tweet
    seen = {0}          # guards against self-replies / cycles in malformed data
    while queue:
        cur_parent = queue.popleft()
        if cur_parent in parent_to_children:
            children = parent_to_children[cur_parent]
            result.append(list(children) + [cur_parent])
            for child in children:
                if child not in seen:
                    seen.add(child)
                    queue.append(child)

    if not result:
        # A record with no replies: treat the source as its own only child.
        result = [[0, 0]]
    return list(reversed(result))


1.8.1+cu101


In [None]:
class TreeNN_layer(nn.Module):
    """GRU-style cell that merges a parent tweet with the summed states of its children.

    Raw feature rows (width ``input_dim``) are first projected to
    ``memory_dim`` by ``E``. Children that are still raw are turned into a
    hidden state by :meth:`process`; children that already are hidden states
    (width ``memory_dim``) are used as they are.
    """

    def __init__(self, input_dim, memory_dim):
        """
        :param input_dim: width of a raw per-tweet feature row
        :param memory_dim: width of the hidden state
        """
        super(TreeNN_layer, self).__init__()
        self.input_dim = input_dim
        self.memory_dim = memory_dim

        # E embeds raw features; W* act on the embedded parent, U* on the child state.
        # Attribute names and creation order are kept so saved state dicts still load.
        self.E = nn.Linear(input_dim, memory_dim)
        self.Wr = nn.Linear(memory_dim, memory_dim)
        self.Wz = nn.Linear(memory_dim, memory_dim)
        self.Wh = nn.Linear(memory_dim, memory_dim)
        self.Ur = nn.Linear(memory_dim, memory_dim)
        self.Uz = nn.Linear(memory_dim, memory_dim)
        self.Uh = nn.Linear(memory_dim, memory_dim)

    def process(self, vector):
        """Turn a raw leaf row of shape (1, input_dim) into a hidden state of shape (1, memory_dim).

        A leaf has no child state, so only the update gate and the candidate
        are computed and multiplied together.
        """
        vector = self.E(vector)
        z_j = torch.sigmoid(self.Wz(vector))
        h_j = torch.tanh(self.Wh(vector))
        h_j = z_j * h_j
        return h_j

    def forward(self, parent_vector, children_vectors):
        """Compute the parent's hidden state from its raw row and its children.

        :param parent_vector: raw parent row, shape (1, input_dim)
        :param children_vectors: list of 2-D tensors with one row each, either
            raw rows (width ``input_dim``) or hidden states (width
            ``memory_dim``); entries of any other width are ignored
        :return: hidden state of shape (1, memory_dim)
        :raises ValueError: if no child has a usable width
        """
        parent_memory = self.E(parent_vector)  # (1, memory_dim)

        child_states = []
        for vec in children_vectors:
            width = vec.size()[1]
            # Width tells raw rows from already-computed hidden states.
            # NOTE: ambiguous if input_dim == memory_dim (raw branch wins).
            if width == self.input_dim:
                child_states.append(self.process(vec))
            elif width == self.memory_dim:
                child_states.append(vec)

        if not child_states:
            raise ValueError("TreeNN_layer.forward needs at least one child of width "
                             "input_dim or memory_dim")

        # Sum the children along the row axis and keep that axis, so h_S is
        # (1, memory_dim) for one child and for many. A bare squeeze() here
        # would also drop the feature axis when memory_dim == 1.
        h_S = torch.cat(child_states, dim=0).sum(dim=0, keepdim=True)

        r_j = torch.sigmoid(self.Wr(parent_memory) + self.Ur(h_S))
        z_j = torch.sigmoid(self.Wz(parent_memory) + self.Uz(h_S))
        h_j = torch.tanh(self.Wh(parent_memory) + self.Uh(h_S * r_j))
        # Update gate interpolates between the child summary and the candidate.
        h_j = (1 - z_j) * h_S + z_j * h_j
        return h_j

In [None]:
class TreeNN(nn.Module):
    """Bottom-up tree network: folds a reply tree into the root state and classifies it."""

    def __init__(self, input_dim, memory_dim):
        """
        :param input_dim: width of a raw per-tweet feature row
        :param memory_dim: width of the hidden state
        """
        super().__init__()
        self.tree_layer = TreeNN_layer(input_dim, memory_dim)
        self.output = nn.Linear(memory_dim, 2)  # two output logits

    def forward(self, orders, index_to_vec):
        """Process the groups in order and return the logits for node 0.

        :param orders: list of groups ``[child, ..., parent]`` arranged so
            that a node's own group precedes its parent's group
        :param index_to_vec: dict from node index to its raw feature tensor
        :return: output of the final linear layer applied to node 0's state
        """
        hidden = dict()
        for group in orders:
            *child_ids, parent_id = group
            # Prefer a child's computed state; fall back to its raw row for leaves.
            child_inputs = [
                hidden[child] if child in hidden else index_to_vec[child]
                for child in child_ids
            ]
            hidden[parent_id] = self.tree_layer(index_to_vec[parent_id], child_inputs)

        return self.output(hidden[0])


In [None]:
"""

"""


def show_tree(nodes, edges):
    """Draw a directed graph built from the given nodes and edges with networkx.

    :param nodes: iterable of node identifiers
    :param edges: iterable of (from, to) pairs
    """
    import networkx as nx

    graph = nx.DiGraph()
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    nx.draw_networkx(graph)
    plt.show()


# 
def make_tree(record):
    """Recover the reply tree rooted at the source tweet of one record.

    Tweets are identified by their position in ``record``; position 0 is
    treated as the root. A reply link is only used when the tweet's
    ``in_reply_to_status_id_str`` refers to another tweet in the same record.

    :param record: list of tweet dicts with ``id_str`` and
        ``in_reply_to_status_id_str`` keys
    :return: tuple ``(nodes, edges, parent_to_children, index_to_tweet,
        tweet_to_index)`` where ``nodes`` is the set of positions that ended
        up in the root's tree, ``edges`` holds ``[child, parent]`` pairs for
        those nodes, ``parent_to_children`` maps a position to the set of
        positions replying to it (for ALL in-record links, not only the
        root's tree), and the last two dicts translate between position and
        ``id_str``
    """
    # Accumulators for the result.
    nodes = set()
    edges = []
    tweet_to_index = dict()
    index_to_tweet = dict()
    index_to_tree = {0: 0}  # position -> tree id; tree 0 is the one containing the source tweet

    # First pass: build the id <-> position lookups so parents can be resolved
    # regardless of the order in which tweets appear.
    for index, tweet in enumerate(record):
        tweet_to_index[tweet['id_str']] = index
        index_to_tweet[index] = tweet['id_str']

    parent_to_children = dict()
    child_to_parent = dict()


    # Second pass: record reply links and keep tree ids up to date.
    for index, tweet in enumerate(record):

        if tweet['in_reply_to_status_id_str'] is not None:
            parent_id = tweet['in_reply_to_status_id_str']
            # -1 marks a parent that is not part of this record; such links are skipped.
            parent_index = tweet_to_index.get(parent_id, -1)
            if parent_index != -1:
                child_to_parent[index] = parent_index

                if parent_index not in parent_to_children:
                    parent_to_children[parent_index] = set()
                parent_to_children[parent_index].add(index)

                if parent_index not in index_to_tree:
                    # Parent not seen in any tree yet: open a new provisional tree
                    # for this pair. len(index_to_tree) only grows, so the id is fresh.
                    tree = len(index_to_tree)
                    index_to_tree[index] = tree
                    index_to_tree[parent_index] = tree
                else:
                    # Parent already belongs to a tree: push its tree id down to this
                    # child and to everything below it (merging provisional trees).
                    add_tree(parent_index, index_to_tree, parent_to_children)


    # Keep only the positions that ended up in the root's tree.
    for index, tree in index_to_tree.items():
        if tree == 0:
            nodes.add(index)
            if index in child_to_parent:
                edges.append([index, child_to_parent[index]])

    return nodes, edges, parent_to_children, index_to_tweet, tweet_to_index


def add_tree(parent_index, index_to_tree, parent_to_children):
    """Propagate the parent's tree id to its descendants, updating ``index_to_tree`` in place.

    A child that already carries the parent's tree id is left alone and its
    own descendants are not visited.

    :param parent_index: node whose tree id is pushed downwards
    :param index_to_tree: dict node -> tree id (mutated)
    :param parent_to_children: dict node -> iterable of child nodes
    """
    target_tree = index_to_tree[parent_index]

    for kid in parent_to_children.get(parent_index, ()):
        already_assigned = kid in index_to_tree and index_to_tree[kid] == target_tree
        if already_assigned:
            continue
        index_to_tree[kid] = target_tree
        add_tree(kid, index_to_tree, parent_to_children)


In [None]:
### 

In [None]:
import random
import os

# Seed the RNGs that drive weight initialisation and the data shuffle so a
# fresh "Restart & Run All" produces the same model and record order.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Restrict the runtime to the first GPU (set before CUDA availability is queried).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Single source of truth for tensor placement; falls back to CPU without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Take the input width from an actual feature row instead of hard-coding it,
# so the model always matches the TF-IDF vocabulary size in use.
input_dim = next(iter(build_tree_vectors(train_data[0], "train").values())).shape[1]
model = TreeNN(input_dim, 64)  # 64-dimensional hidden state
model.to(device)

# Shuffle the training records in place (lookups are by tweet id, so they stay valid).
random.shuffle(train_data)

# Class weights [1, 2]: errors on class 1 ('rumour') cost twice as much.
# The weight tensor must live on the same device as the model outputs.
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1, 2], dtype=torch.float32, device=device))

# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.00005)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.001, alpha=0.9)


In [None]:
# 
def one_hot_label():
    """Convert the train and dev label strings into integer class ids.

    ``'rumour'`` becomes 1; every other label becomes 0. Despite the name,
    the result is a class index per tweet id, not a one-hot vector.

    :return: (train id -> class id dict, dev id -> class id dict)
    """
    def encode(labels):
        return {tweet_id: (1 if tag == 'rumour' else 0) for tweet_id, tag in labels.items()}

    return encode(train_label), encode(dev_label)

In [None]:
def predict(data_type):
    """Run the global ``model`` over a labelled split and collect predictions.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` afterwards.

    :param data_type: ``"train"`` for the training split, anything else for dev
    :return: (list of predicted class ids, list of true class ids), one entry
        per record, in the current order of the split
    """
    model.eval()
    # Place inputs wherever the model lives instead of assuming a GPU.
    device = next(model.parameters()).device

    if data_type == "train":
        data_set, tweet_to_row, split = train_data, train_tweet_to_row, "train"
        label_hot = one_hot_label()[0]
    else:
        data_set, tweet_to_row, split = dev_data, dev_tweet_to_row, "dev"
        label_hot = one_hot_label()[1]

    prediction_label = []
    real_label = []

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for record in data_set:
            nodes, edges, parent_to_children, index_to_tweet, tweet_to_index = make_tree(record)
            tree_vecs = build_tree_vectors(record, split)

            # Only tweets connected to the source (the tree's nodes) get a tensor.
            index_to_vec = dict()
            for node in nodes:
                row = tweet_to_row[index_to_tweet[node]]
                index_to_vec[node] = torch.tensor(tree_vecs[row], dtype=torch.float32, device=device)

            order = topological_order(parent_to_children)
            output = model(order, index_to_vec)
            prediction_label.append(output.cpu().argmax().item())
            real_label.append(label_hot[record[0]['id_str']])

    return prediction_label, real_label

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score




# 
def train(epochs, steps, model):
    """Train the tree model with gradient accumulation and dev-F1 early stopping.

    Uses the module-level ``optimizer``, ``criterion`` and ``train_data``.
    After every epoch the dev split is scored with :func:`predict` (which
    evaluates the module-level ``model``); whenever the rumour F1 improves,
    the weights are saved to ``drive/MyDrive/best.pth``.

    :param epochs: maximum number of passes over ``train_data``
    :param steps: number of records whose gradients are accumulated before
        each optimizer step
    :param model: the network to train
    :return: None
    """
    device = next(model.parameters()).device
    # The label lookup never changes during training; build it once.
    train_label_hot = one_hot_label()[0]

    epochs_no_improve = 0
    max_f1 = 0

    for epoch in range(epochs):
        optimizer.zero_grad()
        count = 0          # records processed in this epoch
        total_loss = 0.0   # sum of per-record losses, for the epoch average
        for record in train_data:
            nodes, edges, parent_to_children, index_to_tweet, tweet_to_index = make_tree(record)
            tree_vecs = build_tree_vectors(record, "train")

            index_to_vec = dict()
            for node in nodes:
                row = train_tweet_to_row[index_to_tweet[node]]
                index_to_vec[node] = torch.tensor(tree_vecs[row], dtype=torch.float32, device=device)

            order = topological_order(parent_to_children)
            output = model(order, index_to_vec)

            label = train_label_hot[record[0]['id_str']]
            label_tensor = torch.tensor([label], dtype=torch.long, device=device)

            loss = criterion(output, label_tensor)
            loss.backward()
            total_loss += loss.item()
            count += 1

            # Apply the accumulated gradients every `steps` records.
            if count % steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Records left over after the last full group still carry gradients;
        # apply them instead of discarding them at the next epoch's zero_grad().
        if count % steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        # One dev pass yields both predictions and gold labels.
        predicted_label, real_label = predict("dev")
        dev_f1 = f1_score(real_label, predicted_label)
        print("----the current rumour f1 of dev_data----")
        print(dev_f1)

        print("epoch " + str(epoch + 1) + " finished, the average loss is " + str(total_loss / max(count, 1)))
        print("------------------------------")

        model.train()  # predict() left the model in eval mode

        # Checkpoint on improvement, otherwise count towards early stopping.
        if dev_f1 > max_f1:
            torch.save(model.state_dict(), "drive/MyDrive/best.pth")
            max_f1 = dev_f1
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        # Stop once at least 11 epochs have run and the last 6 brought no improvement.
        if epoch >= 10 and epochs_no_improve >= 6:
            print("no improve, stop training!!")
            break
        

    # return best_model_state


model.train()  # make sure the model is in training mode before the first epoch
train(100, 64, model)
# Restore the best checkpoint written during training. map_location places the
# saved tensors on whatever device the model currently uses, so a checkpoint
# saved on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load('drive/MyDrive/best.pth',
                                 map_location=next(model.parameters()).device))
# torch.save(model.state_dict(), "drive/MyDrive/model.pth")

----the current rumour f1 of dev_data----
0.7142857142857142
epoch 1 finished, the average loss is tensor(9.8215e-05, device='cuda:0', grad_fn=<DivBackward0>)
------------------------------
----the current rumour f1 of dev_data----
0.7893333333333333
epoch 2 finished, the average loss is tensor(7.5936e-05, device='cuda:0', grad_fn=<DivBackward0>)
------------------------------
----the current rumour f1 of dev_data----
0.7722772277227723
epoch 3 finished, the average loss is tensor(8.2210e-05, device='cuda:0', grad_fn=<DivBackward0>)
------------------------------
----the current rumour f1 of dev_data----
0.7635467980295567
epoch 4 finished, the average loss is tensor(8.3366e-05, device='cuda:0', grad_fn=<DivBackward0>)
------------------------------
----the current rumour f1 of dev_data----
0.7280898876404494
epoch 5 finished, the average loss is tensor(0.0001, device='cuda:0', grad_fn=<DivBackward0>)
------------------------------
----the current rumour f1 of dev_data----
0.7155555555

KeyboardInterrupt: ignored

In [None]:
# Reload the best checkpoint (also works after an interrupted training run) and
# score both labelled splits. map_location keeps the load device-independent.
model.load_state_dict(torch.load('drive/MyDrive/best.pth',
                                 map_location=next(model.parameters()).device))
train_prediction_label, train_real_label = predict("train")
dev_prediction_label, dev_real_label = predict("dev")

In [None]:
def show_result(data_set):
    """Print accuracy plus per-class F1, recall and precision for one split.

    Reads the module-level ``*_real_label`` / ``*_prediction_label`` lists.
    Class 1 is reported as "rumour", class 0 as "non_rumour".

    :param data_set: ``"train"`` for the training split, anything else for dev
    """
    is_train = data_set == "train"
    real = train_real_label if is_train else dev_real_label
    prediction = train_prediction_label if is_train else dev_prediction_label

    acc = accuracy_score(real, prediction)
    rumor_f1 = f1_score(real, prediction)
    non_rumor_f1 = f1_score(real, prediction, pos_label=0)
    rumor_precision = precision_score(real, prediction)
    non_rumor_precision = precision_score(real, prediction, pos_label=0)
    rumor_recall = recall_score(real, prediction)
    non_rumor_recall = recall_score(real, prediction, pos_label=0)

    header = "----the result of train_data----" if is_train else "----the result of dev_data----"
    print(header)
    print("accuracy: " + str(acc))

    per_class = (
        ("rumour", rumor_f1, rumor_recall, rumor_precision),
        ("non_rumour", non_rumor_f1, non_rumor_recall, non_rumor_precision),
    )
    for class_name, f1_value, recall_value, precision_value in per_class:
        print(class_name + ": f1 is " + str(f1_value) + ", recall is " + str(recall_value)
              + ", precision is " + str(precision_value))

# Report the metrics of the reloaded best checkpoint on both labelled splits.
show_result("train")
show_result("dev")

----the result of train_data----
accuracy: 0.9103641456582633
rumour: f1 is 0.870404984423676, recall is 0.8825015792798484, precision is 0.8586355255070682
non_rumour: f1 is 0.9314888010540184, recall is 0.9247874427730542, precision is 0.9382879893828799
----the result of dev_data----
accuracy: 0.8637931034482759
rumour: f1 is 0.7893333333333333, recall is 0.7914438502673797, precision is 0.7872340425531915
non_rumour: f1 is 0.8993630573248408, recall is 0.8982188295165394, precision is 0.9005102040816326


In [None]:
def get_test_result():
    """Predict a label string for every record of the unlabelled test split.

    :return: dict mapping each record's source tweet ``id_str`` to
        ``"rumour"`` (class 1) or ``"non-rumour"`` (class 0)
    """
    model.eval()  # do not rely on an earlier predict() call having set eval mode
    device = next(model.parameters()).device
    test_prediction_label = {}

    # Inference only: no gradients needed.
    with torch.no_grad():
        for record in test_data:
            nodes, edges, parent_to_children, index_to_tweet, tweet_to_index = make_tree(record)
            tree_vecs = build_tree_vectors(record, "test")

            index_to_vec = dict()
            for node in nodes:
                row = test_tweet_to_row[index_to_tweet[node]]
                index_to_vec[node] = torch.tensor(tree_vecs[row], dtype=torch.float32, device=device)

            order = topological_order(parent_to_children)
            output = model(order, index_to_vec)
            predicted_class = output.cpu().argmax().item()

            source_id = record[0]['id_str']
            test_prediction_label[source_id] = "non-rumour" if predicted_class == 0 else "rumour"

    return test_prediction_label

# Predict the test split, echo the full id -> label dict, and write it as JSON to Drive.
result_dic = get_test_result()
print(result_dic)
with open("drive/MyDrive/test-output.json","w") as f:
  json.dump(result_dic,f)
  # Confirmation message (Chinese): roughly "finished writing to file".
  print("加载入文件完成...")

{'544382249178001408': 'rumour', '525027317551079424': 'rumour', '544273220128739329': 'rumour', '499571799764770816': 'non-rumour', '552844104418091008': 'non-rumour', '524977651476623360': 'rumour', '544514988078280704': 'non-rumour', '524928863714168832': 'rumour', '544390718253699072': 'non-rumour', '580322349569994752': 'rumour', '544475905926524928': 'non-rumour', '544389986809036800': 'non-rumour', '498530293116968960': 'non-rumour', '498293625420148736': 'non-rumour', '552831230735962113': 'non-rumour', '553589469849583616': 'rumour', '544415816851021824': 'non-rumour', '552850116324130816': 'non-rumour', '544318036715782144': 'non-rumour', '524974318087061504': 'rumour', '553592195786506240': 'rumour', '524959778125385728': 'non-rumour', '553502311872733184': 'non-rumour', '499698366402789376': 'non-rumour', '525032872647065600': 'rumour', '553110609513177088': 'non-rumour', '544267656597995521': 'rumour', '552845393541988352': 'non-rumour', '500422971320963072': 'non-rumour',