# Load Dataset

In [1]:
from dataset import ClimateDataset

# Input locations, relative to the notebook's working directory: the article
# CSV plus the two annotation files handed to ClimateDataset below.
dataset = "./unlabelled_articles_17K/opinion_climate_all_with_bias.csv"
annotated_id_file = "./annotated_data_500/pretty_0611_lcad.txt"
annotated_file = "./annotated_data_500/0611_majority.json"

# NOTE(review): the meaning of the trailing positional True is defined by
# ClimateDataset and is not visible in this notebook — confirm.
entire_dataset = ClimateDataset(dataset, annotated_id_file, annotated_file, True)
# Labelled portion; later cells index its items as item[0] (text) and item[1] (labels).
labelled_dataset = entire_dataset.get_labelled_dataset()

# Document Length Statistics (After Tokenization)

In [2]:
from torch.utils.data import Subset
from utils import Evaluator

fold = 5
NUM_FOLDS = 5


def _fold_indices(offset):
    """Return every NUM_FOLDS-th index of labelled_dataset, starting at (offset + fold) % NUM_FOLDS."""
    start = (offset + fold) % NUM_FOLDS
    return list(range(start, len(labelled_dataset), NUM_FOLDS))


# Interleaved split: three residue classes form the training set, one the
# validation set and one the test set. Concatenation order is -1, 0, 1.
train_set = Subset(labelled_dataset, _fold_indices(-1) + _fold_indices(0) + _fold_indices(1))
valid_set = Subset(labelled_dataset, _fold_indices(2))
test_set = Subset(labelled_dataset, _fold_indices(3))

evaluator = Evaluator(classifier='naive', mode='macro', detail=True)

# Split sizes, shown as the cell output.
len(train_set), len(valid_set), len(test_set)

(302, 100, 100)

## length

In [3]:
from transformers import BertTokenizer
# Shared "bert-base-uncased" tokenizer; the length-statistics cells below call tk.tokenize().
tk = BertTokenizer.from_pretrained("bert-base-uncased")

def length_statistic(dataset):
    """Summarise document lengths, measured in tokens produced by ``tk``.

    Parameters
    ----------
    dataset : sized, integer-indexable collection
        When its type is exactly ``ClimateDataset`` each item is tokenized
        directly; for any other type, element ``[0]`` of each item is
        tokenized instead.

    Returns
    -------
    dict
        ``{'min': int, 'max': int, 'average': float}``. An empty dataset
        yields zeros for all three entries instead of dividing by zero.
    """
    size = len(dataset)
    if size == 0:
        # Nothing to measure; return a neutral result rather than raising.
        return {'min': 0, 'max': 0, 'average': 0.0}

    # Exact type comparison is kept on purpose: an isinstance() check could
    # route subclasses (whose items may be tuples) through the raw-text path.
    items_are_text = type(dataset) is ClimateDataset

    token_counts = []
    for i in range(size):
        item = dataset[i]
        text = item if items_are_text else item[0]
        token_counts.append(len(tk.tokenize(text)))

    return {
        'min': min(token_counts),
        'max': max(token_counts),
        'average': sum(token_counts) / size,
    }

In [5]:
# Token-length statistics over the full dataset.
length_statistic(entire_dataset)

{'min': 4, 'max': 3716, 'average': 877.1968556614819}

In [11]:
# Token-length statistics over the labelled subset only.
length_statistic(labelled_dataset)

{'min': 70, 'max': 626, 'average': 333.5239043824701}

In [6]:
# Token-length statistics for each of the train / validation / test splits.
length_statistic(train_set), length_statistic(valid_set), length_statistic(test_set)

({'min': 141, 'max': 1221, 'average': 730.274834437086},
 {'min': 218, 'max': 1157, 'average': 697.52},
 {'min': 210, 'max': 1177, 'average': 664.2})

In [10]:
# Print every article that tokenizes to at most 10 tokens, followed by its
# index, so near-empty or broken entries can be inspected.
# Original author's note: "6 empty news, why?"
for idx in range(len(entire_dataset)):
    article = entire_dataset[idx]
    if len(tk.tokenize(article)) <= 10:
        print(article, idx)

# Original author's note: delete and manually fixed

Your testimony is not serious. 5580
'We should let our children be children' 5596
Err501 7397
Err501 7410
Err501 7435
Err501 7449
Err501 7451
Err501 7454
Err501 7461
Err501 7463
Err501 7465
The desperate pleas of asylum seekers who Remain in Mexico 10474
How Democrats are prepping for their first debate 10479
How Democrats are prepping for their first debate 10523
Skip to main content  11094
Err501 11891
Err501 12133
Err501 12187
Click for more article by throngsman .. 15645
The Latest Nearby states sending fire help to California 17082


## labels

In [26]:
def label_distribution(dataset, num_labels=5):
    """Count, per label position, how many items have that label set to 1.

    Parameters
    ----------
    dataset : sized, integer-indexable collection
        Each item must expose its label vector at index ``1``.
    num_labels : int, optional
        Number of label positions to inspect (default 5, the value that was
        previously hard-coded).

    Returns
    -------
    list of int
        ``counts[j]`` is the number of items whose ``j``-th label equals 1.
    """
    counts = [0] * num_labels

    for i in range(len(dataset)):
        # Fetch the label vector once per item instead of once per label.
        item_labels = dataset[i][1]
        for j in range(num_labels):
            if item_labels[j] == 1:
                counts[j] += 1

    return counts

In [27]:
# Positive-label counts per label position over the labelled subset.
label_distribution(labelled_dataset)

[255, 154, 44, 334, 202]

In [28]:
# Positive-label counts per label position for each of the three splits.
label_distribution(train_set), label_distribution(valid_set), label_distribution(test_set)

([148, 88, 25, 201, 113], [53, 32, 5, 62, 46], [54, 34, 14, 71, 43])

# Naive Baselines

## Random Baseline

In [29]:
import random

def random_baseline(dataset, num_labels=5, seed=None):
    """Predict every label of every item with an independent fair coin flip.

    Parameters
    ----------
    dataset : sized, integer-indexable collection
        Each item must expose its ground-truth label vector at index ``1``.
    num_labels : int, optional
        Number of labels predicted per item (default 5, the value that was
        previously hard-coded).
    seed : int or None, optional
        When given, predictions are drawn from a private ``random.Random(seed)``
        so the baseline is reproducible. When ``None`` (default) the shared
        module-level generator is used, exactly as before.

    Returns
    -------
    dict
        ``{"y_true": [...], "y_pred": [...]}`` with one entry per item.
    """
    # A private generator keeps seeded runs from disturbing global random state.
    rng = random if seed is None else random.Random(seed)

    pred = []
    truth = []
    for i in range(len(dataset)):
        pred.append([rng.randint(0, 1) for _ in range(num_labels)])
        truth.append(dataset[i][1])

    return {"y_true": truth, "y_pred": pred}

In [4]:
# Random baseline scored on the whole labelled subset.
input_dict = random_baseline(labelled_dataset)

evaluator.eval(input_dict)

{'Precision': [0.5397489539748954,
  0.3346456692913386,
  0.0778688524590164,
  0.6706827309236948,
  0.366412213740458],
 'Recall': [0.5058823529411764,
  0.551948051948052,
  0.4318181818181818,
  0.5,
  0.4752475247524752],
 'F1': [0.5222672064777327,
  0.41666666666666663,
  0.13194444444444445,
  0.5728987993138938,
  0.41379310344827586],
 'Acc': [0.5298804780876494,
  0.5258964143426295,
  0.50199203187251,
  0.5039840637450199,
  0.4581673306772908]}

In [13]:
# Random baseline scored on the training split.
input_dict = random_baseline(train_set)

evaluator.eval(input_dict)

{'Precision': 0.4010473615509209,
 'Recall': 0.47460626563847963,
 'F1': 0.40805327993078855}

In [31]:
# Random baseline scored on the validation split.
input_dict = random_baseline(valid_set)

evaluator.eval(input_dict)

{'Precision': [0.49019607843137253,
  0.26666666666666666,
  0.04,
  0.62,
  0.4883720930232558],
 'Recall': [0.4716981132075472, 0.5, 0.4, 0.5, 0.45652173913043476],
 'F1': [0.4807692307692308,
  0.3478260869565218,
  0.07272727272727272,
  0.5535714285714285,
  0.47191011235955055],
 'Acc': [0.46534653465346537,
  0.40594059405940597,
  0.49504950495049505,
  0.504950495049505,
  0.5346534653465347]}

In [32]:
# Random baseline scored on the test split.
input_dict = random_baseline(test_set)

evaluator.eval(input_dict)

{'Precision': [0.6226415094339622,
  0.38596491228070173,
  0.10869565217391304,
  0.7083333333333334,
  0.5471698113207547],
 'Recall': [0.6111111111111112,
  0.6470588235294118,
  0.35714285714285715,
  0.4788732394366197,
  0.6744186046511628],
 'F1': [0.616822429906542,
  0.48351648351648346,
  0.16666666666666666,
  0.5714285714285715,
  0.6041666666666666],
 'Acc': [0.59, 0.53, 0.5, 0.49, 0.62]}

## Majority Voting

In [33]:
def majority_voting(train, dataset, num_labels=5):
    """Predict, for every item, the per-label majority value seen in ``train``.

    A label is predicted as 1 when strictly more than half of the training
    items have it set to 1, and as 0 otherwise.

    Parameters
    ----------
    train : sized, integer-indexable collection
        Items expose their label vector at index ``1``; used to derive the
        majority value of each label.
    dataset : sized, integer-indexable collection
        Items to score; their label vector (index ``1``) becomes ``y_true``.
    num_labels : int, optional
        Number of label positions (default 5, the value that was previously
        hard-coded).

    Returns
    -------
    dict
        ``{"y_true": [...], "y_pred": [...]}`` with one entry per item of
        ``dataset``; every prediction row is its own list.
    """
    train_size = len(train)

    positives = [0] * num_labels
    for i in range(train_size):
        item_labels = train[i][1]
        for j in range(num_labels):
            if item_labels[j] == 1:
                positives[j] += 1

    # The threshold comes from the `train` argument itself; the previous code
    # read the notebook-global `train_set` here, ignoring the parameter.
    threshold = train_size // 2
    majority = [1 if count > threshold else 0 for count in positives]

    pred = []
    truth = []
    for i in range(len(dataset)):
        # Copy so rows are independent and mutating one cannot change the others.
        pred.append(list(majority))
        truth.append(dataset[i][1])

    return {"y_true": truth, "y_pred": pred}

In [22]:
# Majority-vote baseline derived from the training split, scored on the validation split.
input_dict = majority_voting(train_set, valid_set)

evaluator.eval(input_dict)

{'Precision': 0.21400000000000002, 'Recall': 0.4, 'F1': 0.27637728125533007}

In [34]:
# Majority-vote baseline derived from the training split, scored on the test split.
input_dict = majority_voting(train_set, test_set)

evaluator.eval(input_dict)

{'Precision': [0, 0, 0, 0.71, 0],
 'Recall': [0, 0, 0, 1.0, 0],
 'F1': [0, 0, 0, 0.8304093567251462, 0],
 'Acc': [0.46, 0.66, 0.86, 0.71, 0.57]}

## TF-IDF

In [220]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Key under which each position of an item's label vector is stored.
# NOTE(review): this index -> key mapping reproduces the original cell
# (0:'ar', 1:'hi', 2:'mo', 3:'co', 4:'ec'); verify it against dataset.LABELS.
LABEL_KEY_BY_INDEX = ('ar', 'hi', 'mo', 'co', 'ec')


def _corpus_and_labels(subset):
    """Split a dataset of (text, label_vector, ...) items into texts and per-label target lists.

    Returns a ``(corpus, labels)`` pair, where ``corpus`` is the list of item
    texts and ``labels`` maps each label key to a list of that label's values,
    aligned with ``corpus``.
    """
    corpus = []
    labels = {'ar': [], 'co': [], 'ec': [], 'mo': [], 'hi': []}
    for i in range(len(subset)):
        item = subset[i]
        corpus.append(item[0])
        for position, key in enumerate(LABEL_KEY_BY_INDEX):
            labels[key].append(item[1][position])
    return corpus, labels


train_corpus, train_labels = _corpus_and_labels(train_set)
valid_corpus, valid_labels = _corpus_and_labels(valid_set)
test_corpus, test_labels = _corpus_and_labels(test_set)

vectorizer = TfidfVectorizer()

# Fit on the training corpus only; validation and test texts are transformed
# with the already-fitted vectorizer.
train_vectors = vectorizer.fit_transform(train_corpus)
valid_vectors = vectorizer.transform(valid_corpus)
test_vectors = vectorizer.transform(test_corpus)

#### KNN

In [233]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from dataset import LABELS

label = 'ar'

# Model selection: try k = 1..19, score each on the validation split and keep
# the metrics of the k with the highest F1 (the first such k wins on ties).
y_train = train_labels[label]
y_valid = valid_labels[label]

best_metric = {'label': '', 'neighbors': '', 'precision': -1, 'recall': -1, 'f1': -1, 'acc': -1}
for n_neighbors in range(1, 20):
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(train_vectors, y_train)
    preds = classifier.predict(valid_vectors)

    candidate = {
        'label': label,
        'neighbors': n_neighbors,
        'precision': precision_score(y_valid, preds),
        'recall': recall_score(y_valid, preds),
        'f1': f1_score(y_valid, preds),
        'acc': accuracy_score(y_valid, preds),
    }
    if candidate['f1'] > best_metric['f1']:
        best_metric = candidate

print('label-{}-neigbours-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(
    *(best_metric[key] for key in ('label', 'neighbors', 'precision', 'recall', 'f1', 'acc'))
))


label-ar-neigbours-19: precision:0.5853658536585366, recall:0.8888888888888888, f1:0.7058823529411764, acc:0.6


In [234]:
# Refit KNN with the neighbour count stored in best_metric, then score it on
# the test split for the current label.
y_test = test_labels[label]

classifier = KNeighborsClassifier(n_neighbors=best_metric['neighbors'])
classifier.fit(train_vectors, train_labels[label])
preds = classifier.predict(test_vectors)

precision, recall, f1, acc = (
    metric(y_test, preds)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-ar: precision:0.45348837209302323, recall:0.9069767441860465, f1:0.6046511627906976, acc:0.49


#### LR

In [239]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'ar'

# Logistic regression on the TF-IDF features for one label, trained on the
# training split and scored directly on the test split.
y_test = test_labels[label]

classifier = LogisticRegression(random_state=42)
classifier.fit(train_vectors, train_labels[label])
preds = classifier.predict(test_vectors)

precision, recall, f1, acc = (
    metric(y_test, preds)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-ar: precision:0.527027027027027, recall:0.7222222222222222, f1:0.609375, acc:0.5


#### NB

In [256]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'hi'

# GaussianNB only accepts dense arrays, while TfidfVectorizer returns scipy
# sparse matrices; densify both splits so the cell runs on a fresh kernel.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

classifier = GaussianNB()

classifier.fit(train_dense, train_labels[label])

preds = classifier.predict(test_dense)

precision = precision_score(test_labels[label], preds)
recall = recall_score(test_labels[label], preds)
f1 = f1_score(test_labels[label], preds)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-hi: precision:0.14285714285714285, recall:0.029411764705882353, f1:0.04878048780487805, acc:0.61


#### Random Forest

In [264]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'hi'

# Shallow random forest on the TF-IDF features for one label, trained on the
# training split and scored on the test split.
classifier = RandomForestClassifier(max_depth=3, random_state=42)

classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

# The recorded run reported precision 0.0 together with an undefined-metric
# warning. zero_division=0 states that fallback explicitly: same value, no warning.
precision = precision_score(test_labels[label], preds, zero_division=0)
recall = recall_score(test_labels[label], preds, zero_division=0)
f1 = f1_score(test_labels[label], preds, zero_division=0)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-hi: precision:0.0, recall:0.0, f1:0.0, acc:0.72


  _warn_prf(average, modifier, msg_start, len(result))


#### SVM

In [268]:
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'hi'

# SVC with gamma='auto' on the TF-IDF features for one label, trained on the
# training split and scored on the test split.
classifier = SVC(gamma='auto')

classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

# The recorded run reported precision 0.0 together with an undefined-metric
# warning. zero_division=0 states that fallback explicitly: same value, no warning.
precision = precision_score(test_labels[label], preds, zero_division=0)
recall = recall_score(test_labels[label], preds, zero_division=0)
f1 = f1_score(test_labels[label], preds, zero_division=0)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-hi: precision:0.0, recall:0.0, f1:0.0, acc:0.72


  _warn_prf(average, modifier, msg_start, len(result))
