In [1]:
import nltk
import pandas as pd
from utils.dataset import DataSet
import os
import utils.generate_test_splits as split

In [2]:
# Load the 'train' and 'competition_test' datasets through the project's
# DataSet helper. Each constructor call prints the number of stances and
# bodies it read (see the cell output).
train = DataSet('train')
test = DataSet('competition_test')

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


# ex1: split the training set

In [3]:
def training_set_split(dataset, training=0.9, base_dir="splits"):
    """Partition ``dataset.stances`` into training and hold-out lists by body ID.

    If either ``training_ids.txt`` or ``hold_out_ids.txt`` is missing from
    ``base_dir``, ``split.generate_hold_out_split`` is invoked first to create
    them. The hold-out IDs are then loaded with ``split.read_ids`` and every
    stance whose ``'Body ID'`` is among them goes to the hold-out list; all
    remaining stances go to the training list.

    Parameters
    ----------
    dataset
        Object exposing ``stances``, an iterable of dicts that each carry a
        ``'Body ID'`` key.
    training : float, default 0.9
        Forwarded unchanged to ``split.generate_hold_out_split``
        (presumably the fraction of bodies kept for training -- confirm
        against ``utils.generate_test_splits``).
    base_dir : str, default "splits"
        Directory that holds (or will receive) the two ID files.

    Returns
    -------
    tuple[list, list]
        ``(stances_train, stances_hold_out)``, each preserving the order of
        ``dataset.stances``.
    """
    training_ids_path = os.path.join(base_dir, "training_ids.txt")
    hold_out_ids_path = os.path.join(base_dir, "hold_out_ids.txt")

    # Only regenerate the split when one of the ID files is absent, so an
    # existing split is reused across runs.
    if not (os.path.exists(training_ids_path) and os.path.exists(hold_out_ids_path)):
        split.generate_hold_out_split(dataset, training, base_dir)

    # A set gives O(1) membership tests in the loop below.
    # Assumes the IDs returned by read_ids are hashable (e.g. ints).
    # The training IDs are not needed: everything outside the hold-out set
    # is treated as training data.
    hold_out_ids = set(split.read_ids("hold_out_ids.txt", base_dir))

    stances_train = []
    stances_hold_out = []
    for stance in dataset.stances:
        if stance['Body ID'] in hold_out_ids:
            stances_hold_out.append(stance)
        else:
            stances_train.append(stance)

    return stances_train, stances_hold_out

In [4]:
# Split the training data using the function defaults (training=0.9,
# base_dir="splits"); stances whose body ID is listed as hold-out end up in
# hold_out_stance, the rest in training_stance.
training_stance, hold_out_stance = training_set_split(train)

In [5]:
def calculate_class_ratio(stance):
    """Return the share of each stance class in ``stance``.

    Parameters
    ----------
    stance : list of dict
        Records that each carry a ``'Stance'`` key. The four recognised
        labels are 'agree', 'disagree', 'discuss' and 'unrelated'.

    Returns
    -------
    tuple of float
        ``(agree_ratio, disagree_ratio, discuss_ratio, unrelated_ratio)``,
        each rounded to three decimals. For an empty input all four ratios
        are ``0.0`` (instead of raising ``ZeroDivisionError``).

    Notes
    -----
    If some record has a label outside the four recognised ones, a warning
    is printed and the ratios (still divided by ``len(stance)``) will not
    sum to 1.
    """
    from collections import Counter

    labels = ('agree', 'disagree', 'discuss', 'unrelated')
    total_num_stance = len(stance)

    # Nothing to count: avoid dividing by zero below.
    if total_num_stance == 0:
        return 0.0, 0.0, 0.0, 0.0

    # Single pass over the records instead of one pass per class.
    counts = Counter(d['Stance'] for d in stance)

    # Sanity check: every record should fall into one of the four classes.
    if sum(counts[label] for label in labels) != total_num_stance:
        print('Error, invalid calculation of elements in each class')

    agree_ratio, disagree_ratio, discuss_ratio, unrelated_ratio = (
        round(counts[label] / total_num_stance, 3) for label in labels
    )
    return agree_ratio, disagree_ratio, discuss_ratio, unrelated_ratio

In [6]:
# Per-class ratios for each partition; calculate_class_ratio returns them in
# the order (agree, disagree, discuss, unrelated).
train_agree,train_disagree, train_discus, train_unrelate = calculate_class_ratio(training_stance)
hold_agree,hold_disagree, hold_discus, hold_unrelate = calculate_class_ratio(hold_out_stance)

In [7]:
# Show the class ratios of the training and hold-out partitions side by side.
print('Class ratio in training and hold-out dataset:\n')
print('\tTrain\tHold-out')

# One (row label, training ratio, hold-out ratio) triple per stance class;
# the labels carry their own tab padding so the columns line up.
ratio_rows = (
    ('Agree\t', train_agree, hold_agree),
    ('Disagree', train_disagree, hold_disagree),
    ('Discuss\t', train_discus, hold_discus),
    ('Unrelate', train_unrelate, hold_unrelate),
)
for row_label, train_ratio, hold_ratio in ratio_rows:
    print(row_label, train_ratio, '|', hold_ratio)

Class ratio in training and hold-out dataset:

	Train	Hold-out
Agree	 0.072 | 0.079
Disagree 0.017 | 0.017
Discuss	 0.176 | 0.187
Unrelate 0.735 | 0.717


In [8]:
import pickle

# Persist both partitions (lists of stance dicts) to the working directory,
# one pickle file per partition, using the highest available protocol.
for pickle_path, stance_list in (
    ('splitted_train_list_dict.pickle', training_stance),
    ('splitted_evaluate_list_dict.pickle', hold_out_stance),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(stance_list, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [9]:
import os
# Audible "done" notification; the value displayed for this cell is the
# shell exit status returned by os.system.
# NOTE(review): `say` is presumably the macOS text-to-speech command, so on
# other platforms this likely fails with a non-zero status -- confirm before
# relying on it.
os.system('say "your program has finished"')

0