# Load Dataset

In [2]:
from dataset import ClimateDataset, TSVDataset

dataset = "./unlabelled_articles_17K/opinion_climate_all_with_bias.csv"
annotated_id_file = "./annotated_data_500/pretty_0611_lcad.txt"
annotated_file = "./annotated_data_500/0611_majority.json"

entire_dataset = ClimateDataset(dataset, annotated_id_file, annotated_file, True)
labelled_dataset = TSVDataset('./annotated_data_500/final_dataset_v2.tsv', True, False, True)

# Document Length Statitics (After Tokenization)

In [68]:
from torch.utils.data import Subset
from utils import Evaluator

fold = 5

train_set = Subset(labelled_dataset, [i for i in range((-1 + fold)%5, len(labelled_dataset), 5)] + [i for i in range((0 + fold)%5, len(labelled_dataset), 5)] + [i for i in range((1 + fold)%5, len(labelled_dataset), 5)])
valid_set = Subset(labelled_dataset, [i for i in range((2 + fold)%5, len(labelled_dataset), 5)])
test_set  = Subset(labelled_dataset, [i for i in range((3 + fold)%5, len(labelled_dataset), 5)])

evaluator = Evaluator(classifier='naive', mode='macro', detail=True)

len(train_set), len(valid_set), len(test_set)

(257, 86, 85)

In [4]:
train_set[1]

('record surge atmospheric concentrations earths atmosphere surged record meteorological organization wmolast increase average yearsresearchers combination human activities el nio weather phenomenon drove level yearsthis greenhouse gas bulletin produced wmo based measurements countries stations dotted globe measure concentrations warming gases including carbon dioxide methane nitrous oxidethe figures published wmo left atmosphere amounts absorbed earths sinks oceans biosphere average concentrations hit ppm largest increase network dr oksana tarasova chief wmos global atmosphere watch programme told bbc newsthe largest increase previous el nio ppm ppm average ten yearsel nio impacts amount carbon atmosphere causing droughts limit uptake plants treesemissions human sources slowed couple dr tarasova cumulative total atmosphere matters stays aloft active centuriesover report increase atmosphere times larger ice agerapidly increasing atmospheric levels gases potential study initiate unpredi

## length

In [3]:
from transformers import BertTokenizer
tk = BertTokenizer.from_pretrained("bert-base-uncased")

def length_statistic(dataset):
    length = 0
    max_len = 0
    min_len = 100000
    if type(dataset) == ClimateDataset:
        for i in range(0, len(dataset)):
            temp = len(tk.tokenize(dataset[i]))
            if max_len < temp:
                max_len = temp
            if min_len > temp:
                min_len = temp
            length += temp
    else:
        for i in range(0, len(dataset)):
            temp = len(tk.tokenize(dataset[i][0]))
            if max_len < temp:
                max_len = temp
            if min_len > temp:
                min_len = temp
            length += temp

    average_length = length / len(dataset)

    return {'min':min_len, 'max':max_len, 'average':average_length}

In [5]:
length_statistic(entire_dataset)

{'min': 4, 'max': 3716, 'average': 877.1968556614819}

In [11]:
length_statistic(labelled_dataset)

{'min': 70, 'max': 626, 'average': 333.5239043824701}

In [6]:
length_statistic(train_set), length_statistic(valid_set), length_statistic(test_set)

({'min': 141, 'max': 1221, 'average': 730.274834437086},
 {'min': 218, 'max': 1157, 'average': 697.52},
 {'min': 210, 'max': 1177, 'average': 664.2})

In [10]:
# 6 empty news, why?
for i in range(0, len(entire_dataset)):
    if len(tk.tokenize(entire_dataset[i])) <= 10:
        print(entire_dataset[i], i)

# delete and manually fixed

Your testimony is not serious. 5580
'We should let our children be children' 5596
Err501 7397
Err501 7410
Err501 7435
Err501 7449
Err501 7451
Err501 7454
Err501 7461
Err501 7463
Err501 7465
The desperate pleas of asylum seekers who Remain in Mexico 10474
How Democrats are prepping for their first debate 10479
How Democrats are prepping for their first debate 10523
Skip to main content  11094
Err501 11891
Err501 12133
Err501 12187
Click for more article by throngsman .. 15645
The Latest Nearby states sending fire help to California 17082


## labels

In [6]:
def label_distribution(dataset):
    labels = [0]*5

    for i in range(0, len(dataset)):
        for j in range(0, 5):
            if dataset[i][1][j] == 1:
                labels[j] += 1

    return labels

In [27]:
label_distribution(labelled_dataset)

[255, 154, 44, 334, 202]

In [15]:
label_distribution(train_set), label_distribution(valid_set), label_distribution(test_set)

([125, 27, 159, 26, 78], [37, 8, 50, 7, 27], [30, 14, 59, 11, 22])

# Naive Baselines

## Random Baseline

In [8]:
import random

def random_baseline(dataset):
    pred = []
    truth = []
    for i in range(0, len(dataset)):
        temp = []
        for j in range(0,5):
            if random.randint(0,1)==1:
                temp.append(1)
            else:
                temp.append(0)
        pred.append(temp)
        truth.append(dataset[i][1])

    return {"y_true": truth, "y_pred": pred}

In [9]:
input_dict = random_baseline(labelled_dataset)

evaluator.eval(input_dict)

{'Precision': [0.5025641025641026,
  0.10407239819004525,
  0.5707762557077626,
  0.11274509803921569,
  0.29850746268656714],
 'Recall': [0.5104166666666666,
  0.46938775510204084,
  0.4664179104477612,
  0.5227272727272727,
  0.47244094488188976],
 'F1': [0.5064599483204135,
  0.17037037037037037,
  0.513347022587269,
  0.18548387096774194,
  0.3658536585365853],
 'Acc': [0.5537383177570093,
  0.4766355140186916,
  0.4462616822429907,
  0.5280373831775701,
  0.514018691588785]}

In [10]:
input_dict = random_baseline(train_set)

evaluator.eval(input_dict)

{'Precision': [0.48091603053435117,
  0.10714285714285714,
  0.5887096774193549,
  0.11764705882352941,
  0.304],
 'Recall': [0.504,
  0.4444444444444444,
  0.4591194968553459,
  0.6153846153846154,
  0.48717948717948717],
 'F1': [0.4921875,
  0.17266187050359713,
  0.5159010600706714,
  0.19753086419753085,
  0.37438423645320196],
 'Acc': [0.49416342412451364,
  0.5525291828793775,
  0.4669260700389105,
  0.49416342412451364,
  0.5058365758754864]}

In [11]:
input_dict = random_baseline(valid_set)

evaluator.eval(input_dict)

{'Precision': [0.3333333333333333,
  0.08108108108108109,
  0.6382978723404256,
  0.1794871794871795,
  0.375],
 'Recall': [0.40540540540540543, 0.375, 0.6, 1.0, 0.6666666666666666],
 'F1': [0.36585365853658536,
  0.13333333333333333,
  0.6185567010309279,
  0.30434782608695654,
  0.4800000000000001],
 'Acc': [0.3953488372093023,
  0.5465116279069767,
  0.5697674418604651,
  0.627906976744186,
  0.5465116279069767]}

In [12]:
input_dict = random_baseline(test_set)

evaluator.eval(input_dict)

{'Precision': [0.42105263157894735,
  0.09523809523809523,
  0.75,
  0.07142857142857142,
  0.27906976744186046],
 'Recall': [0.5333333333333333,
  0.2857142857142857,
  0.5084745762711864,
  0.2727272727272727,
  0.5454545454545454],
 'F1': [0.47058823529411764,
  0.14285714285714285,
  0.6060606060606061,
  0.11320754716981131,
  0.36923076923076914],
 'Acc': [0.5764705882352941,
  0.43529411764705883,
  0.5411764705882353,
  0.4470588235294118,
  0.5176470588235295]}

## Majority Voting

In [17]:
def majority_voting(train, dataset):
    labels = [0]*5

    for i in range(0, len(train)):
        for j in range(0, 5):
            if train[i][1][j] == 1:
                labels[j] += 1

    trans = [1] * 5

    for i in range(0, 5):
        if labels[i] > (len(train_set)  // 2):
            trans[i] = 1
        else:
            trans[i] = 0
    pred = []
    truth = []
    for i in range(0, len(dataset)):
        pred.append(trans)
        truth.append(dataset[i][1])
    
    return {"y_true": truth, "y_pred": pred}

In [18]:
input_dict = majority_voting(train_set, valid_set)

evaluator.eval(input_dict)

{'Precision': [0, 0, 0.5813953488372093, 0, 0],
 'Recall': [0, 0, 1.0, 0, 0],
 'F1': [0, 0, 0.7352941176470588, 0, 0],
 'Acc': [0.5697674418604651,
  0.9069767441860465,
  0.5813953488372093,
  0.9186046511627907,
  0.686046511627907]}

In [19]:
input_dict = majority_voting(train_set, test_set)

evaluator.eval(input_dict)

{'Precision': [0, 0, 0.6941176470588235, 0, 0],
 'Recall': [0, 0, 1.0, 0, 0],
 'F1': [0, 0, 0.8194444444444444, 0, 0],
 'Acc': [0.6470588235294118,
  0.8352941176470589,
  0.6941176470588235,
  0.8705882352941177,
  0.7411764705882353]}

## TF-IDF

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

balancing = False
label = 'ec'

train_corpus = []
valid_corpus = []
test_corpus  = []

train_labels = {'ar':[], 'co':[], 'ec':[], 'mo':[], 'hi':[]}
valid_labels = {'ar':[], 'co':[], 'ec':[], 'mo':[], 'hi':[]}
test_labels  = {'ar':[], 'co':[], 'ec':[], 'mo':[], 'hi':[]}

for i in range(len(train_set)):
    train_corpus.append(train_set[i][0])
    train_labels['ar'].append(int(train_set[i][1][0]==1))
    train_labels['hi'].append(int(train_set[i][1][1]==1))
    train_labels['co'].append(int(train_set[i][1][2]==1))
    train_labels['mo'].append(int(train_set[i][1][3]==1))
    train_labels['ec'].append(int(train_set[i][1][4]==1))


def dataset_balancing(specified_label):
    positives = []
    negatives = []
    for i in range(0, len(train_labels[specified_label])):
        if train_labels[specified_label][i] == 1:
            positives.append(i)
        else:
            negatives.append(i)

    if len(positives) > len(negatives):
        for i in range(len(negatives), len(positives)):
            choice = np.random.choice(negatives)
            train_corpus.append(train_corpus[choice])
            train_labels[specified_label].append(train_labels[specified_label][choice])
    else:
        for i in range(len(positives), len(negatives)):
            choice = np.random.choice(positives)
            train_corpus.append(train_corpus[choice])
            train_labels[specified_label].append(train_labels[specified_label][choice])

dataset_balancing(label)

for i in range(len(valid_set)):
    valid_corpus.append(valid_set[i][0])
    valid_labels['ar'].append(int(valid_set[i][1][0]==1))
    valid_labels['hi'].append(int(valid_set[i][1][1]==1))
    valid_labels['co'].append(int(valid_set[i][1][2]==1))
    valid_labels['mo'].append(int(valid_set[i][1][3]==1))
    valid_labels['ec'].append(int(valid_set[i][1][4]==1))

for i in range(len(test_set)):
    test_corpus.append(test_set[i][0])
    test_labels['ar'].append(int(test_set[i][1][0]==1))
    test_labels['hi'].append(int(test_set[i][1][1]==1))
    test_labels['co'].append(int(test_set[i][1][2]==1))
    test_labels['mo'].append(int(test_set[i][1][3]==1))
    test_labels['ec'].append(int(test_set[i][1][4]==1))

vectorizer = TfidfVectorizer()

train_vectors = vectorizer.fit_transform(train_corpus)
valid_vectors = vectorizer.transform(valid_corpus)
test_vectors = vectorizer.transform(test_corpus)

#### KNN

In [82]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from dataset import LABELS


best_metric = {'label': '', 'neighbors':'', 'precision': -1, 'recall':-1, 'f1':-1, 'acc':-1}
for i in range(1, 20):

    classifier = KNeighborsClassifier(n_neighbors=i)
    classifier.fit(train_vectors, train_labels[label])

    preds = classifier.predict(valid_vectors)

    precision = precision_score(valid_labels[label], preds)
    recall = recall_score(valid_labels[label], preds)
    f1 = f1_score(valid_labels[label], preds)
    acc = accuracy_score(valid_labels[label], preds)

    if f1 > best_metric['f1']:
        best_metric = {'label': label, 'neighbors': i, 'precision': precision, 'recall':recall, 'f1':f1, 'acc':acc}

print('label-{}-neigbours-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(best_metric['label'], best_metric['neighbors'], best_metric['precision'], best_metric['recall'], best_metric['f1'], best_metric['acc']))


label-ec-neigbours-2: precision:0.5769230769230769, recall:0.5555555555555556, f1:0.5660377358490566, acc:0.7325581395348837


In [83]:
classifier = KNeighborsClassifier(n_neighbors=best_metric['neighbors'])
classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

precision = precision_score(test_labels[label], preds)
recall = recall_score(test_labels[label], preds)
f1 = f1_score(test_labels[label], preds)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-ec: precision:0.3548387096774194, recall:0.5, f1:0.41509433962264153, acc:0.6352941176470588


#### LR

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'ar'

classifier = LogisticRegression(random_state=42)

classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

precision = precision_score(test_labels[label], preds)
recall = recall_score(test_labels[label], preds)
f1 = f1_score(test_labels[label], preds)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-3-label-ar: precision:0.6538461538461539, recall:0.3695652173913043, f1:0.4722222222222222, acc:0.5581395348837209


#### NB

In [256]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'hi'

classifier = GaussianNB()

classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

precision = precision_score(test_labels[label], preds)
recall = recall_score(test_labels[label], preds)
f1 = f1_score(test_labels[label], preds)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-hi: precision:0.14285714285714285, recall:0.029411764705882353, f1:0.04878048780487805, acc:0.61


#### Random Forest

In [264]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'hi'

classifier = RandomForestClassifier(max_depth=3, random_state=42)

classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

precision = precision_score(test_labels[label], preds)
recall = recall_score(test_labels[label], preds)
f1 = f1_score(test_labels[label], preds)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-hi: precision:0.0, recall:0.0, f1:0.0, acc:0.72


  _warn_prf(average, modifier, msg_start, len(result))


#### SVM

In [268]:
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

label = 'hi'

classifier = SVC(gamma='auto')

classifier.fit(train_vectors, train_labels[label])

preds = classifier.predict(test_vectors)

precision = precision_score(test_labels[label], preds)
recall = recall_score(test_labels[label], preds)
f1 = f1_score(test_labels[label], preds)
acc = accuracy_score(test_labels[label], preds)

print('fold-{}-label-{}: precision:{}, recall:{}, f1:{}, acc:{}'.format(fold, label, precision, recall, f1, acc))

fold-5-label-hi: precision:0.0, recall:0.0, f1:0.0, acc:0.72


  _warn_prf(average, modifier, msg_start, len(result))
