In [13]:
import pandas as pd
import json
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Separating newer tweets into a test file and older tweets into a train file

In [58]:
# Load every collected tweet and order the rows by `created_at`, newest first,
# so the most recent tweets sit at the head of the frame.
df = pd.read_json("data/elon_musk.json").sort_values(by="created_at", ascending=False)

In [59]:
# Number of tweets held out for evaluation (taken from the head of `df`).
TEST_SIZE = 100

# `df` was sorted by `created_at` in descending order, so the first TEST_SIZE
# rows are the newest tweets (test split) and the remainder are the older
# tweets (train split). `.iloc` makes the positional slicing explicit.
test_df = df.iloc[:TEST_SIZE]
train_df = df.iloc[TEST_SIZE:]

In [60]:
def _to_records(frame):
    """Turn a tweets DataFrame into a list of plain dicts, one per row.

    Each dict holds the keys ``text``, ``created_at``, ``retweet_count``,
    ``lang`` and ``prediction``, in that order. ``created_at`` is converted
    with ``str()``; every other value is copied from the row unchanged.

    Args:
        frame: DataFrame exposing the five columns named above.

    Returns:
        A list with one dict per row, in the frame's row order.
    """
    return [
        {
            "text": row.text,
            "created_at": str(row.created_at),
            "retweet_count": row.retweet_count,
            "lang": row.lang,
            "prediction": row.prediction,
        }
        for row in frame.itertuples()
    ]


# Both splits go through the same conversion, so they share one schema.
test = _to_records(test_df)
train = _to_records(train_df)

In [61]:
# Persist both splits as JSON, train file first and test file second.
# NOTE(review): the evaluation cells read a `true_label` key that is not written
# here, so the labels are presumably added to the test file afterwards; re-running
# this cell would then overwrite them. Confirm before re-executing.
for out_path, records in (
    ("data/elon_musk_train.json", train),
    ("data/elon_musk_test.json", test),
):
    with open(out_path, "w") as out_file:
        json.dump(records, out_file)

# Evaluating the base model on the test dataset

In [89]:
# Read the test tweets back from disk; `data` becomes whatever JSON value the
# file holds (the cells below iterate over it as a list of tweet dicts).
with open("data/elon_musk_test.json") as test_file:
    data = json.load(test_file)

In [77]:
"""for tweet in data:
    tweet["prediction"] = tweet["prediction"][0]

with open("data/elon_musk_test.json", "w") as f:
    json.dump(data,f)"""

{'text': '@SaphAffectionVA @AlexWentzell @AlexWentzell said it already. \n\nElon Musk fully understands mammalian biology. Yet you pay for his check?',
 'created_at': '2023-02-07 19:14:56',
 'retweet_count': 0,
 'lang': 'en',
 'prediction': {'LABEL_0': 0.504618644714355,
  'LABEL_1': 0.45587491989135703,
  'LABEL_2': 0.039506431668996006},
 'true_label': 1}

In [82]:
def get_prediction(tweet, index: int, labels=("LABEL_0", "LABEL_1", "LABEL_2")) -> int:
    """Return the class id with the highest score for one tweet.

    ``tweet["prediction"]`` may be either a single mapping from label name to
    score, or a list of such mappings, in which case ``index`` selects the
    entry to use.

    Args:
        tweet: Mapping with a ``"prediction"`` entry in one of the two shapes
            described above.
        index: Position inside the prediction list. Ignored when the
            prediction is a single mapping.
        labels: Label keys to compare, in class-id order. The returned value
            is a position in this sequence. Defaults to the three labels the
            notebook has always used.

    Returns:
        The position in ``labels`` of the highest score. Ties resolve to the
        lowest position.

    Raises:
        KeyError: If a label is missing from the selected prediction.
        IndexError: If ``index`` is out of range for a list prediction.
        ValueError: If ``labels`` is empty.
    """
    scores = tweet["prediction"]
    if isinstance(scores, list):
        scores = scores[index]

    probabilities = [scores[label] for label in labels]

    # max() keeps the first maximal item, which matches the "first strictly
    # greater value wins" rule without needing a sentinel start value. It
    # therefore also works when every score is <= -1, e.g. log-probabilities.
    return max(range(len(probabilities)), key=probabilities.__getitem__)

In [83]:
# Predicted class per tweet (index 0 selects the first stored prediction when a
# tweet carries a list of them) next to the reference label stored with the tweet.
predictions = [get_prediction(tweet, 0) for tweet in data]
true_labels = [tweet["true_label"] for tweet in data]

In [84]:
# Count the positions where the predicted class equals the reference label.
acc = sum(
    1
    for predicted, expected in zip(predictions, true_labels)
    if predicted == expected
)

print(f"Accuracy of base model: {acc/len(predictions)*100:.0f}%")

Accuracy of base model: 57%


In [85]:
# F1, precision and recall, all micro-averaged over the same label/prediction
# pairs and evaluated in that order.
f1, precision, recall = (
    score_fn(true_labels, predictions, average="micro")
    for score_fn in (f1_score, precision_score, recall_score)
)

print(
    f"F1 score: {f1}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    sep="\n",
)

F1 score: 0.57
Precision: 0.57
Recall: 0.57


In [86]:
# Confusion matrix of the predictions evaluated above: per scikit-learn's
# convention, rows are the true classes and columns the predicted classes.
confusion_matrix(y_true=true_labels, y_pred=predictions)

array([[23,  8,  0],
       [15, 26, 11],
       [ 4,  5,  8]])

# Evaluating model after running Active Learning experiments on the training set

In [103]:
# Reload the test tweets from disk for the per-snapshot evaluation below.
with open("data/elon_musk_test.json", "r") as f:
    data = json.load(f)

# `size` is the number of prediction snapshots stored per tweet, read from the
# first tweet. When `prediction` is a list there is one snapshot per entry. A
# bare dict (the other shape `get_prediction` accepts) is a single snapshot;
# taking len() of it would count its labels instead.
first_prediction = data[0]["prediction"]
size = len(first_prediction) if isinstance(first_prediction, list) else 1

In [104]:
# Multiplier that turns a snapshot index into the "labeled samples" figure in
# the progress lines.
# NOTE(review): taken from the original hard-coded `i*8`; presumably the number
# of tweets labeled per Active Learning round. Confirm against the experiment.
SAMPLES_PER_ROUND = 8

# `true_labels` comes from the base-model evaluation cell. zip() would silently
# truncate if it no longer lines up with the freshly loaded `data`, so fail
# loudly instead of reporting scores over a partial set.
if len(true_labels) != len(data):
    raise ValueError(
        f"true_labels has {len(true_labels)} entries but data has {len(data)}; "
        "re-run the cell that builds true_labels"
    )

best_i = 0
best_acc = 0

for i in range(size):

    # Predicted class of every test tweet according to snapshot `i`.
    predictions = [get_prediction(tweet, i) for tweet in data]

    correct = sum(
        1
        for predicted, expected in zip(predictions, true_labels)
        if predicted == expected
    )
    accuracy = correct / len(predictions) * 100

    f1_micro = f1_score(true_labels, predictions, average="micro")
    print(f"F1 score of model after {i * SAMPLES_PER_ROUND} labeled samples: {f1_micro*100:.0f}%")

    # Strict comparison: on equal accuracy the earliest snapshot is kept.
    if best_acc < accuracy:
        best_acc = accuracy
        best_i = i

F1 score of model after 0 labeled samples: 57%
F1 score of model after 8 labeled samples: 61%
F1 score of model after 16 labeled samples: 63%
F1 score of model after 24 labeled samples: 62%
F1 score of model after 32 labeled samples: 63%
F1 score of model after 40 labeled samples: 66%


In [105]:
# Confusion matrix for snapshot `best_i`, the one with the highest accuracy
# found by the sweep (rows: true classes, columns: predicted classes).
predictions = [get_prediction(tweet, best_i) for tweet in data]
confusion_matrix(y_true=true_labels, y_pred=predictions)

array([[19, 12,  0],
       [ 4, 38, 10],
       [ 3,  5,  9]])