In [1]:
import os
import torch
from xml.dom import minidom
import pandas as pd
import numpy as np
import torchtext

In [2]:
def raw_to_csv(directory, text_filename, label_filename, target_filename):
    text_file = os.path.join(directory, text_filename)
    label_file = os.path.join(directory, label_filename)
    target_file = os.path.join(directory, target_filename)
    
    xml_doc = minidom.parse(text_file)
    
    text_elements = [
        [element.getAttribute('id'), element.firstChild.data] 
        for element in xml_doc.getElementsByTagName('instance')
    ]
    
    df_text = pd.DataFrame(text_elements, index=np.array(text_elements)[:,0], columns=['id', 'text'])
    df_text['id'] = df_text['id'].astype('int64')
    
    df_label_columns = ['id', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
    df_label = pd.read_csv(label_file, sep=' ', names=df_label_columns, header=None)
    
    df_final = pd.merge(df_text, df_label, on='id', how='inner')
    df_final.to_csv(target_file, index=False)
    return df_final

In [3]:
# Convert the AffectiveText "test" release (XML texts + emotion scores) into
# one CSV; this frame is the one bound to the name df_train in this notebook.
df_train = raw_to_csv(
    'data/AffectiveText.test',
    'affectivetext_test.xml',
    'affectivetext_test.emotions.gold',
    'affectivetext_test.csv',
)
df_train.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,500,Test to predict breast cancer relapse is approved,0,0,15,38,9,11
1,501,"Two Hussein allies are hanged, Iraqi official ...",24,26,16,13,38,5
2,502,Sights and sounds from CES,0,0,0,17,0,4
3,503,Schuey sees Ferrari unveil new car,0,0,0,46,0,31
4,504,Closings and cancellations top advice on flu o...,1,0,23,8,11,8


In [4]:
# Row count of the converted frame.
df_train.shape[0]

1000

In [None]:
# Convert the AffectiveText "trial" release (XML texts + emotion scores) into
# one CSV; this frame is the one bound to the name df_test in this notebook.
df_test = raw_to_csv(
    'data/AffectiveText.trial',
    'affectivetext_trial.xml',
    'affectivetext_trial.emotions.gold',
    'affectivetext_trial.csv',
)
df_test.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,1,Mortar assault leaves at least 18 dead,22,2,60,0,64,0
1,2,Goal delight for Sheva,0,0,0,93,0,38
2,3,Nigeria hostage feared dead is freed,18,0,52,66,20,65
3,4,Bombers kill shoppers,66,39,94,0,86,0
4,5,"Vegetables, not fruit, slow brain decline",0,0,25,26,2,46


In [None]:
# Row count of the converted frame.
df_test.shape[0]

250

In [None]:
import spacy
try:
    spacy_en = spacy.load('en')
except OSError:
    # spaCy v3 removed shortcut links such as 'en' and reports the unknown
    # name with an OSError; fall back to the full English package name.
    spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    """Return the list of token strings the spaCy tokenizer produces for ``text``."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Text column: split with the spaCy-based ``tokenizer`` and lower-cased.
TEXT = torchtext.data.Field(tokenize=tokenizer, lower=True, sequential=True)
# Emotion score columns: used as given in the CSV, so no vocabulary is built.
LABEL = torchtext.data.LabelField(use_vocab=False, sequential=False)

# The field list follows the CSV column order; ``id`` is paired with None
# because it is not needed as a model input.
affectiveText_train_dataset = torchtext.data.TabularDataset(
    path='data/AffectiveText.test/affectivetext_test.csv',
    format='csv',
    skip_header=True,
    fields=[('id', None), ('text', TEXT)] + [
        (emotion, LABEL)
        for emotion in ('anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise')
    ],
)
affectiveText_train_dataset