In [1]:
import os
import sys
from ast import literal_eval
import itertools as it

import numpy as np
import pandas as pd
import torch
import transformers
import nltk.tokenize
import tqdm
import sklearn

from system_evaluation import evaluation_technique as evaluate
import convert_to_bert as ctb
import span_find as sf


In [None]:
# Preprocessing (run before training)
# Make sure you choose the right training data path
# Goal of this cell is to reformat the data so it is a collection of tokens and tags, both grouped by their text id
# NOTE(review): `train_path` is not defined in any visible cell; it must be set before this cell runs.

# Reading the dataset into a pandas dataframe
initial_df = ctb.read_raw_data(train_path)
# creating a new column that contains the tokenization of each text
initial_df['tokenize'] = [ctb.tokenize_text(text) for text in initial_df['text']]
# creating a new column that contains the tags for each token. Should be a 1-1 correspondence
# (text and spans are paired positionally, so this does not depend on the dataframe's index labels)
initial_df['tags'] = [ctb.tag_toxic_spans(text, spans)
                      for text, spans in zip(initial_df['text'], initial_df['spans'])]

# path to write new dataframe to
new_path = os.path.join('toxic_data', 'train_full.txt')

# stores all tags in one flat list, in the same order as the flattened tokens below
tag_list = list(it.chain.from_iterable(initial_df['tags']))

# creates the write dataframe: one row per token, indexed by the row label of the text it came from
# (Series.iteritems() was removed in pandas 2.0; Series.items() is the supported equivalent)
flatdata = pd.DataFrame(
    [(index, value)
     for index, values in initial_df['tokenize'].items()
     for value in values],
    columns=['index', 'tokens'],
).set_index('index')
# putting the tags into the dataframe (pandas raises if the total tag count differs from the token count)
flatdata['tags'] = tag_list
# Indexing the dataframe by which post a token came from (1-based label)
flatdata['Text #'] = ['Text: {}'.format(i + 1) for i in flatdata.index]
# Verify the dataframe
print(flatdata)
# Write the dataframe to specified path
flatdata.to_csv(new_path, sep='\t', columns=['Text #', 'tokens', 'tags'], index=False, header=True)

In [None]:
# Helper method to create the prediction mask for a given prediction
# the input is a list of (token, tag, separator_length) triples, one per token
def generate_pred_mask(token_tag_space_zip):
    """Build the character-offset prediction mask for one text.

    Args:
        token_tag_space_zip: iterable of (token, tag, separator_length)
            triples in text order, where ``token`` is a string, ``tag`` is
            compared against ``'Tox'``, and ``separator_length`` is the
            number of characters between this token and the next one
            (0 for the last token).

    Returns:
        A list of int character offsets covering every token tagged
        ``'Tox'``, plus the separator between two consecutive ``'Tox'``
        tokens so that adjacent toxic tokens form one contiguous span.
    """
    # Materialize once so any iterable works and the next item can be peeked at
    items = list(token_tag_space_zip)
    last = len(items) - 1
    # running character offset of the current token's first character
    current_index = 0
    pred_mask = []

    # Item 0 is the token, item 1 is the tag, item 2 is the separator length
    for index, item in enumerate(items):
        token_end = current_index + len(item[0])
        separator_end = token_end + item[2]
        if item[1] == 'Tox':
            # every character of a toxic token belongs to the mask
            pred_mask.extend(range(current_index, token_end))
            # the separator is included only when the next token is toxic as well
            if index < last and items[index + 1][1] == 'Tox':
                pred_mask.extend(range(token_end, separator_end))
        # Always step past the separator, whatever the tags are; skipping this
        # after a toxic token would shift every later offset to the left.
        current_index = separator_end
    return pred_mask

In [None]:
# Evaluation posts; keep_default_na=False stops pandas from turning texts such as "NA" into NaN
eval_df = pd.read_csv('toxic_data/tsd_test.csv', header=0, keep_default_na=False)
# Predicted tag sequences, read from the 'tags' column of the tab-separated predictions file
tags = pd.read_csv('tsd_eval_tags_trial_3.csv', sep='\t', header=0)['tags']
# Each entry is a string: drop newlines and swap spaces for commas so literal_eval can parse it
tags = [literal_eval(raw.replace('\n', '').replace(' ', ',')) for raw in tags]

# one prediction mask (list of character offsets) per post, in file order
predictions = []

for i, tag in enumerate(tags):
    # the post that the i-th tag sequence belongs to
    post = eval_df['text'][i]
    # whitespace tokenization of the post
    tokens = post.split()
    # separators between consecutive tokens
    gaps = ctb.space_between_tokens(post, tokens)
    # pair every token with its tag and trailing separator (a 0 is appended for the last token),
    # then turn the triples into a prediction mask
    predictions.append(generate_pred_mask(list(zip(tokens, tag, gaps + [0]))))

In [None]:
# Run this cell after the previous one: it relies on `tags` and `predictions` defined there
# creates one integer id per tag sequence (0 .. len(tags) - 1)
ids = np.arange(len(tags))
# DO NOT CHANGE THIS CODE
# Writes one line per post to spans-pred.txt in the form "<id>\t<list of character offsets>"
with open("spans-pred.txt", "w") as out:
    for uid, text_scores in zip(ids, predictions):
        out.write(f"{str(uid)}\t{str(text_scores)}\n")

In [None]:
# this cell is to see what the prediction mask contains
read_file = 'spans-pred.txt'
eval_file = 'toxic_data/tsd_test.csv'
write_file = 'extracted-t3.txt'

# predictions file has no header: column 0 is the id (used as index), column 1 the serialized span list
span_df = pd.read_csv(read_file, sep='\t', names=['index', 'span'], header=None, index_col=0)
# keep_default_na=False matches how this same file is loaded when the predictions are generated;
# without it a post whose text is e.g. "NA" or "null" is read as NaN instead of a string
text_df = pd.read_csv(eval_file, header=0, keep_default_na=False)

# pair each span list with its post positionally and pull out the text that the span covers
toxic = [sf.extract_toxic_span(literal_eval(span_list), text)
         for span_list, text in zip(span_df['span'], text_df['text'])]
toxic_df = pd.DataFrame({'text': text_df['text'], 'toxic': toxic})
toxic_df.to_csv(write_file)