## Baseline Method Notebook
Models for Stance Detection:
- Random Baseline
- Majority Baseline
- Support Vector Machine with n-gram features

Individual Datasets:
- SemEval2016Task6
- SEthB
- SEthC
- ARC

In [1]:
import os
import random

import pandas as pd
import numpy as np
import seaborn as sns
from statistics import median

import scipy
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

## SemEval2016T6

In [3]:
# Directory holding the SemEval-2016 Task 6 annotation files (relative to the notebook).
file_dir = '../data/SemEval2016Task6'
# Not the last expression of the cell, so the listing is not displayed;
# the call still fails early when the directory is missing.
os.listdir(file_dir)

# Both splits are tab-separated, latin1-encoded text files.
df_training = pd.read_csv(
    f'{file_dir}/trainingdata-all-annotations.txt',
    sep='\t',
    encoding='latin1',
)
df_test = pd.read_csv(
    f'{file_dir}/testdata-taskA-all-annotations.txt',
    sep='\t',
    encoding='latin1',
)

In [4]:
def map_labels(stance):
    """Encode a SemEval stance string as an integer class id.

    'AGAINST' maps to 0, 'FAVOR' maps to 1 and every other value maps to 2.
    """
    return 0 if stance == 'AGAINST' else (1 if stance == 'FAVOR' else 2)

def get_ngrams(train_x, test_x):
    """Turn raw texts into stacked word- and character-n-gram count features.

    Two ``CountVectorizer`` instances are fitted on ``train_x`` only (word
    1-3-grams and character 2-5-grams); ``test_x`` is transformed with the
    fitted vocabularies. For each split the word block and the character
    block are concatenated column-wise.

    Returns:
        (train_features, test_features) as sparse matrices.
    """
    vectorizers = (
        CountVectorizer(analyzer='word', ngram_range=(1, 3)),
        CountVectorizer(analyzer='char', ngram_range=(2, 5)),
    )

    # Vocabulary is learned from the training texts only.
    train_blocks = [vec.fit_transform(list(train_x)) for vec in vectorizers]
    test_blocks = [vec.transform(list(test_x)) for vec in vectorizers]

    return scipy.sparse.hstack(train_blocks), scipy.sparse.hstack(test_blocks)
    
def get_data_SemEval(target):
    """Build features and labels for a single SemEval target.

    Selects the rows of the module-level ``df_training`` / ``df_test`` frames
    whose ``Target`` equals ``target``, vectorises their ``Tweet`` column with
    ``get_ngrams`` and encodes their ``Stance`` column with ``map_labels``.

    Returns:
        (train_x, train_y, test_x, test_y)
    """
    train_rows = df_training[df_training.Target == target]
    test_rows = df_test[df_test.Target == target]

    train_x, test_x = get_ngrams(train_rows['Tweet'], test_rows['Tweet'])

    train_y, test_y = (
        rows['Stance'].apply(map_labels).values for rows in (train_rows, test_rows)
    )
    return train_x, train_y, test_x, test_y


In [None]:
 def run_svm():
    """Train and evaluate one linear SVM per SemEval target.

    For every distinct ``Target`` in the module-level ``df_training`` frame,
    an ``SVC(kernel='linear')`` is fitted on that target's n-gram features and
    scored on the matching test rows. Macro-F1 and accuracy are printed per
    target, followed by their unweighted averages over all targets.
    """
    scores = []
    accuracies = []
    for target in df_training['Target'].unique():
        # The loader defined in this notebook is get_data_SemEval; calling
        # the undefined name get_data raised NameError on a fresh kernel.
        train_x, train_y, test_x, test_y = get_data_SemEval(target)
        clf = svm.SVC(kernel='linear')
        clf.fit(train_x, train_y)
        preds = clf.predict(test_x)
        score = f1_score(test_y, preds, average='macro')
        # Accuracy = share of test rows whose prediction equals the gold label.
        acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
        print(target, ': ', score, 'f1', acc, 'acc')
        scores.append(score)
        accuracies.append(acc)
    print('SemEval2016T6 avg:', sum(scores)/len(scores), sum(accuracies)/len(accuracies))
    

def majority_baseline():
    """Score a majority-class predictor on every SemEval target.

    For each distinct ``Target`` in the module-level ``df_training`` frame the
    most frequent training label is predicted for all test rows. Macro-F1 and
    accuracy are printed per target, followed by their unweighted averages.
    """
    scores = []
    accuracies = []
    for target in df_training['Target'].unique():
        # get_data_SemEval is the loader defined in this notebook (get_data is undefined).
        train_x, train_y, test_x, test_y = get_data_SemEval(target)
        # Most frequent training label. The median is not the mode: it can pick
        # a minority class or even a non-label value such as 0.5 or 1.5.
        # Ties resolve to the smallest label id.
        majority = np.bincount(train_y).argmax()
        preds = [majority] * len(test_y)
        score = f1_score(test_y, preds, average='macro')
        acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
        scores.append(score)
        accuracies.append(acc)
        print(target, ': ', score, 'f1', acc, 'acc')
    print('SemEval2016T6 avg:', sum(scores)/len(scores), sum(accuracies)/len(accuracies))

def random_baseline(seed=None):
    """Score a uniform-random predictor on every SemEval target.

    For each distinct ``Target`` in the module-level ``df_training`` frame a
    label in {0, 1, 2} is drawn uniformly for every test row. Macro-F1 and
    accuracy are printed per target, followed by their unweighted averages.

    Args:
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the run is reproducible; when ``None`` the module-level
            ``random`` generator is used.
    """
    rng = random if seed is None else random.Random(seed)
    scores = []
    accuracies = []
    for target in df_training['Target'].unique():
        # get_data_SemEval is the loader defined in this notebook (get_data is undefined).
        train_x, train_y, test_x, test_y = get_data_SemEval(target)
        preds = [rng.randint(0, 2) for _ in test_y]
        score = f1_score(test_y, preds, average='macro')
        acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
        scores.append(score)
        accuracies.append(acc)
        print(target, ': ', score, 'f1', acc, 'acc')
    print('SemEval2016T6 avg:', sum(scores)/len(scores), sum(accuracies)/len(accuracies))



In [47]:
# Run the random baseline; it prints macro-F1 and accuracy per target and then the averages.
random_baseline()

Atheism :  0.29208734411835524 f1 0.36818181818181817 acc
Climate Change is a Real Concern :  0.2650551416273368 f1 0.31952662721893493 acc
Feminist Movement :  0.2886752136752137 f1 0.3087719298245614 acc
Hillary Clinton :  0.3195994264232841 f1 0.34915254237288135 acc
Legalization of Abortion :  0.28956710753149767 f1 0.3357142857142857 acc
SemEval2016T6 avg: 0.2909968466751375 0.3362694406624963


## SEthC and SEthB

In [42]:
# NOTE(review): absolute, machine-specific path; point it at the local copy of the data.
file_dir = '/Users/baconbaker/Documents/Studium/NLP/Project/reddit-sd/testing'
os.listdir(file_dir)

# Each dataset keeps its own name. Assigning both reads to `df` discarded
# SEthB as soon as SEthC was loaded.
df_sethb = pd.read_csv(file_dir+'/SEthB.csv', sep=',', encoding='utf-8')
df_sethc = pd.read_csv(file_dir+'/SEthC.csv', quotechar='`').dropna()

# Dataset used by the split and evaluation cells below (SEthC, unchanged).
# Assign df_sethb instead to evaluate SEthB.
df = df_sethc

In [129]:
# 80/20 split. A fixed random_state makes the partition, and therefore every
# score computed from it, identical on each run.
df_train = df.sample(frac=.8, random_state=42)
# Held-out rows are exactly those not sampled, selected by index label.
# The concat + drop_duplicates(keep=False) idiom compares row values instead
# and also throws away held-out rows that merely have a duplicate in `df`.
# NOTE(review): identical texts can now sit on both sides of the split;
# de-duplicate `df` beforehand if that matters.
df_test = df.drop(index=df_train.index)
df_train = df_train[['text','label']]
df_test = df_test[['text','label']]

In [43]:
def map_labels(stance):
    """Encode a lower-case stance string as an integer class id.

    'against' maps to 0, 'favor' maps to 1 and every other value maps to 2.
    """
    label = 2
    if stance == 'against':
        label = 0
    elif stance == 'favor':
        label = 1
    return label

def get_ngrams(train_x, test_x):
    """Vectorise texts as word 1-3-gram plus character 2-5-gram counts.

    Both ``CountVectorizer`` vocabularies are fitted on ``train_x``; ``test_x``
    is only transformed. The word and character matrices of each split are
    concatenated column-wise.

    Returns:
        (train_features, test_features) as sparse matrices.
    """
    word_vec = CountVectorizer(analyzer='word', ngram_range=(1, 3))
    char_vec = CountVectorizer(analyzer='char', ngram_range=(2, 5))

    train_blocks = [
        word_vec.fit_transform(list(train_x)),
        char_vec.fit_transform(list(train_x)),
    ]
    test_blocks = [
        word_vec.transform(list(test_x)),
        char_vec.transform(list(test_x)),
    ]

    return scipy.sparse.hstack(train_blocks), scipy.sparse.hstack(test_blocks)
    
def get_data_SEthB():
    """Return features and labels for the current module-level split.

    Reads the ``text`` and ``label`` columns of the module-level ``df_train``
    and ``df_test`` frames, whichever dataset they currently hold. Texts are
    vectorised with ``get_ngrams``; labels are encoded with ``map_labels``.

    Returns:
        (train_x, train_y, test_x, test_y)
    """
    train_x, test_x = get_ngrams(df_train['text'], df_test['text'])

    def encode(frame):
        # Integer class ids as a NumPy array.
        return frame['label'].apply(map_labels).values

    return train_x, encode(df_train), test_x, encode(df_test)

def get_data_SEthC():
    """Return features and labels for the current module-level split.

    Reads the ``text`` and ``label`` columns of the module-level ``df_train``
    and ``df_test`` frames, whichever dataset they currently hold. Texts are
    vectorised with ``get_ngrams``; labels are encoded with ``map_labels``.

    Returns:
        (train_x, train_y, test_x, test_y)
    """
    train_x, test_x = get_ngrams(df_train['text'], df_test['text'])
    train_y, test_y = (
        split['label'].apply(map_labels).values for split in (df_train, df_test)
    )
    return train_x, train_y, test_x, test_y



In [44]:
 def run_svm():
    """Fit a linear SVM on the split from ``get_data_SEthC`` and print its scores.

    Prints macro-F1 and accuracy on the test part of the split.
    """
    train_x, train_y, test_x, test_y = get_data_SEthC()

    model = svm.SVC(kernel='linear')
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)

    f1 = f1_score(test_y, predictions, average='macro')
    hits = sum(1 if gold == guess else 0 for gold, guess in zip(test_y, predictions))
    accuracy = hits / len(test_y)

    # Only one split is evaluated, so the reported "avg" is that split's score.
    print('SVM avg:', f1, 'f1,', accuracy, 'acc')
    

def majority_baseline():
    """Score a majority-class predictor on the split from ``get_data_SEthC``.

    The most frequent training label is predicted for every test row; macro-F1
    and accuracy on the test part are printed.
    """
    train_x, train_y, test_x, test_y = get_data_SEthC()
    # Most frequent training label. The median is not the mode: it can pick a
    # minority class or a non-label value such as 0.5.
    # Ties resolve to the smallest label id.
    majority = np.bincount(train_y).argmax()
    preds = [majority] * len(test_y)
    score = f1_score(test_y, preds, average='macro')
    acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
    # Only one split is evaluated, so the reported "avg" is that split's score.
    print('Maj. Baseline avg:', score, 'f1,', acc, 'acc')

def random_baseline(seed=None):
    """Score a uniform-random predictor on the split from ``get_data_SEthC``.

    A label in {0, 1, 2} is drawn uniformly for every test row; macro-F1 and
    accuracy on the test part are printed.

    Args:
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the run is reproducible; when ``None`` the module-level
            ``random`` generator is used.
    """
    rng = random if seed is None else random.Random(seed)
    # Training labels are not needed by this baseline.
    train_x, train_y, test_x, test_y = get_data_SEthC()
    preds = [rng.randint(0, 2) for _ in test_y]
    score = f1_score(test_y, preds, average='macro')
    acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
    # Only one split is evaluated, so the reported "avg" is that split's score.
    print('Random Baseline avg:', score, 'f1,', acc, 'acc')



In [1]:
# Run the SVM and both baselines; each function prints its own scores.
# NOTE(review): the recorded output of this cell is a NameError, apparently
# because the cell was run in a kernel where the definitions had not been
# executed. Run the cells defining run_svm, majority_baseline and
# random_baseline first.
print('SEthC Results')
run_svm()
majority_baseline()
random_baseline()

SEthC Results


NameError: name 'run_svm' is not defined

## ARC

In [2]:
# NOTE: absolute, machine-specific location of the ARC files.
data_dir = '/Users/baconbaker/Documents/Studium/NLP/Project/reddit-sd/data/ARC'

# Resolve the three CSV files inside data_dir.
bodyfile, trainfile, testfile = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("arc_bodies.csv", "arc_stances_train.csv", "arc_stances_test.csv")
)

# Columns kept from the merged frames and the names they are given afterwards.
useful_columns = ["Headline", "Stance", "articleBody"]
renamed_columns = {'articleBody': "text", 'Stance': "label", 'Headline': "target"}

# The bodies table is left-joined onto each stance row via the shared 'Body ID' column.
bodies = pd.read_csv(bodyfile)
train_data = (
    pd.read_csv(trainfile)
    .merge(bodies, how='left', on='Body ID')
    .loc[:, useful_columns]
    .rename(columns=renamed_columns)
)
test_data = (
    pd.read_csv(testfile)
    .merge(bodies, how='left', on='Body ID')
    .loc[:, useful_columns]
    .rename(columns=renamed_columns)
)

# Names read by the helper functions in the following cells.
df_training = train_data
df_test = test_data

In [4]:
def map_labels(stance):
    """Encode an ARC stance string as an integer class id.

    'agree' maps to 0, 'disagree' to 1, 'unrelated' to 2 and every other
    value to 3.
    """
    for class_id, name in enumerate(('agree', 'disagree', 'unrelated')):
        if stance == name:
            return class_id
    return 3

def get_ngrams(train_x, test_x):
    """Build combined word/character n-gram count matrices for two text sets.

    A word-level (1-3-gram) and a character-level (2-5-gram)
    ``CountVectorizer`` are fitted on ``train_x`` and then applied to
    ``test_x``. Per split, the two resulting matrices are joined column-wise.

    Returns:
        (train_features, test_features) as sparse matrices.
    """
    by_words = CountVectorizer(analyzer='word', ngram_range=(1, 3))
    by_chars = CountVectorizer(analyzer='char', ngram_range=(2, 5))

    # Fit on the training texts only, then reuse the vocabularies for the test texts.
    train_features = scipy.sparse.hstack(
        [by_words.fit_transform(list(train_x)), by_chars.fit_transform(list(train_x))]
    )
    test_features = scipy.sparse.hstack(
        [by_words.transform(list(test_x)), by_chars.transform(list(test_x))]
    )
    return train_features, test_features
    
def get_data_ARC(target):
    """Build features and labels for a single ARC target (headline).

    Selects the rows of the module-level ``df_training`` / ``df_test`` frames
    whose ``target`` column equals ``target``, vectorises their ``text`` column
    with ``get_ngrams`` and encodes their ``label`` column with ``map_labels``.

    Returns:
        (train_x, train_y, test_x, test_y)
    """
    train_rows = df_training[df_training.target == target]
    test_rows = df_test[df_test.target == target]

    train_x, test_x = get_ngrams(train_rows['text'], test_rows['text'])

    def encode(rows):
        # Integer class ids as a NumPy array.
        return rows['label'].apply(map_labels).values

    return train_x, encode(train_rows), test_x, encode(test_rows)

In [5]:
from tqdm import tqdm

def majority_baseline():
    """Score a majority-class predictor per ARC target and print the averages.

    For every distinct ``target`` in the module-level ``df_training`` frame
    the most frequent training label is predicted for all matching test rows.
    Targets without test rows are skipped. The unweighted averages of macro-F1
    and accuracy over the evaluated targets are printed.
    """
    print('Majority Baseline')
    scores = []
    accuracies = []
    for target in tqdm(df_training['target'].unique()):
        train_x, train_y, test_x, test_y = get_data_ARC(target)
        if len(test_y) == 0:
            # Nothing to evaluate for this target. Appending here would reuse
            # the previous target's values, or fail on the first iteration.
            continue
        # Most frequent training label. The median is not the mode: it can pick
        # a minority class or a non-label value such as 0.5.
        # Ties resolve to the smallest label id.
        majority = np.bincount(train_y).argmax()
        preds = [majority] * len(test_y)
        acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
        score = f1_score(test_y, preds, average='macro')
        # Each target contributes exactly one entry.
        scores.append(score)
        accuracies.append(acc)
    if not scores:
        print('Maj. Baseline avg: no target had test examples')
        return
    print('Maj. Baseline avg:', sum(scores)/len(scores), sum(accuracies)/len(accuracies))

def random_baseline(seed=None):
    """Score a uniform-random predictor per ARC target and print the averages.

    For every distinct ``target`` in the module-level ``df_training`` frame a
    label in {0, 1, 2, 3} is drawn uniformly for each matching test row.
    Targets without test rows are skipped. The unweighted averages of macro-F1
    and accuracy over the evaluated targets are printed.

    Args:
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the run is reproducible; when ``None`` the module-level
            ``random`` generator is used.
    """
    print('Random Baseline')
    rng = random if seed is None else random.Random(seed)
    scores = []
    accuracies = []
    for target in tqdm(df_training['target'].unique()):
        train_x, train_y, test_x, test_y = get_data_ARC(target)
        if len(test_y) == 0:
            # No test rows for this target.
            continue
        preds = [rng.randint(0, 3) for _ in test_y]
        acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
        score = f1_score(test_y, preds, average='macro')
        scores.append(score)
        accuracies.append(acc)
    if not scores:
        # Avoid a ZeroDivisionError when nothing could be evaluated.
        print('Random Baseline avg: no target had test examples')
        return
    print('Random Baseline avg:', sum(scores)/len(scores), sum(accuracies)/len(accuracies))
    
def run_svm():
    """Train and evaluate one linear SVM per ARC target and print the averages.

    For every distinct ``target`` in the module-level ``df_training`` frame an
    ``SVC(kernel='linear')`` is fitted on that target's n-gram features and
    scored on the matching test rows. Targets with fewer than two test rows,
    or with a single class in their training labels, are skipped. The
    unweighted averages of macro-F1 and accuracy over the evaluated targets
    are printed.
    """
    print('Running SVM')
    scores = []
    accuracies = []
    for target in tqdm(df_training['target'].unique()):
        train_x, train_y, test_x, test_y = get_data_ARC(target)
        # Decide whether to skip before building the classifier.
        if len(test_y) < 2:
            continue
        if len(set(train_y)) == 1:
            continue
        clf = svm.SVC(kernel='linear')
        clf.fit(train_x, train_y)
        preds = clf.predict(test_x)
        acc = sum(int(gold == pred) for gold, pred in zip(test_y, preds)) / len(test_y)
        score = f1_score(test_y, preds, average='macro')
        # Each target contributes exactly one entry; a second, unconditional
        # append would only double every entry without changing the mean.
        scores.append(score)
        accuracies.append(acc)
    if not scores:
        # Avoid a ZeroDivisionError when every target was skipped.
        print('SVM avg: no target could be evaluated')
        return
    print('SVM avg:', sum(scores)/len(scores), sum(accuracies)/len(accuracies))
    

In [6]:
# Run the per-target SVM evaluation; it shows a progress bar and prints the averaged scores.
run_svm()

  1%|          | 1/186 [00:00<00:21,  8.58it/s]

Running SVM


100%|██████████| 186/186 [01:02<00:00,  2.96it/s]

SVM avg: 0.4793858126028945 0.8058953478660416



