# Preprocessing for OPTIMUS

We partition the data in two stages: a first 75%/25% split, after which the held-out 25% is split 75%/25% again.

75% of that held-out portion is used for fine-tuning the pretrained OPTIMUS VAE.

The remaining 25% of it is used to evaluate the fine-tuned VAE (automatically) and to generate explanations.

`.data` files (pickled) are the input for sentence generation.

`.txt` files are the input for VAE training.

In [1]:
import string
from collections import Counter

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from lstm_vae import create_lstm_vae, inference
from pre_processing import preProcessing, YOUTUBE_preProcessing

import sklearn
from scipy.spatial.distance import cdist
from statistics import stdev

import keras

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def OPTIMUS_get_text_data(num_samples, data_path, dataset):
    """Load a tweet CSV and return the preprocessed second-stage partitions.

    The rows are split twice, both times stratified on the label and with
    ``random_state=42``: first 75%/25%, then the held-out 25% is split
    75%/25% again. Only the two parts of that second split are preprocessed
    with ``preProcessing`` and returned.

    Args:
        num_samples: Not used by this function; kept so that existing
            callers keep working.
        data_path: Path of a UTF-8 encoded CSV file that has the columns
            ``tweet`` and ``class``.
        dataset: ``"polarity"`` (labels are used unchanged) or ``"hate"``
            (rows with class 1 are dropped, class 2 is relabelled to 1 and
            every other class to 0).

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)``.
        The texts are lists holding the items produced by ``preProcessing``;
        the labels are the arrays returned by ``train_test_split``.

    Raises:
        ValueError: If ``dataset`` is neither ``"polarity"`` nor ``"hate"``.
    """
    # Fail fast, before any I/O, instead of hitting an unbound X/y later on.
    if dataset not in ("polarity", "hate"):
        raise ValueError(
            "unsupported dataset {!r}; expected 'polarity' or 'hate'".format(dataset)
        )

    df = pd.read_csv(data_path, encoding='utf-8')

    if dataset == "hate":
        # Removing the offensive comments, keeping only neutral and hatespeech,
        # and convert the class value from 2 to 1 for simplification purposes
        df = df[df['class'] != 1]
        y = df['class'].apply(lambda x: 1 if x == 2 else 0).values
    else:
        y = df['class'].values
    X = df['tweet'].values

    # First split: only the 25% hold-out is used below.
    _, X_test, _, y_test = train_test_split(
        X, y, random_state=42, stratify=y, test_size=0.25
    )

    # Second split of the hold-out; stratify preserves the class ratios.
    X_train_subsplit, X_test_subsplit, y_train_subsplit, y_test_subsplit = train_test_split(
        X_test, y_test, random_state=42, stratify=y_test, test_size=0.25
    )

    input_texts_original = list(preProcessing(X_train_subsplit))
    input_texts_original_test = list(preProcessing(X_test_subsplit))

    return input_texts_original, y_train_subsplit, input_texts_original_test, y_test_subsplit

In [3]:
def _drop_empty_or_long(texts, labels, max_len=140):
    """Drop entries that are empty or longer than ``max_len``, with their labels."""
    drop = [i for i, text in enumerate(texts) if len(text) == 0 or len(text) > max_len]
    return np.delete(texts, drop, 0), np.delete(labels, drop, 0)


def YOUTUBE_OPTIMUS_get_text_data(num_samples, data_path, dataset):
    """Load a YouTube comment CSV and return the filtered second-stage partitions.

    The rows are split twice, both times stratified on the label and with
    ``random_state=42``: first 75%/25%, then the held-out 25% is split
    75%/25% again. The two parts of that second split are preprocessed with
    ``YOUTUBE_preProcessing``; entries that end up empty (e.g. the comment
    was only a URL) or with a length above 140 are removed together with
    their labels.

    The sizes of the two label arrays are printed before filtering.

    Args:
        num_samples: Not used by this function; kept so that existing
            callers keep working.
        data_path: Path of a UTF-8 encoded CSV file that has the columns
            ``CONTENT`` and ``CLASS``.
        dataset: Not used by this function; kept so that existing callers
            keep working.

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)``.
        The texts are lists; the labels are NumPy arrays.
    """
    df = pd.read_csv(data_path, encoding='utf-8')

    X = df["CONTENT"].values
    y = df["CLASS"].values

    # First split: only the 25% hold-out is used below.
    _, X_test, _, y_test = train_test_split(
        X, y, random_state=42, stratify=y, test_size=0.25
    )
    X_train_subsplit, X_test_subsplit, y_train_subsplit, y_test_subsplit = train_test_split(
        X_test, y_test, random_state=42, stratify=y_test, test_size=0.25
    )

    print(len(y_train_subsplit))
    print(len(y_test_subsplit))

    new_X_test_subsplit = YOUTUBE_preProcessing(X_test_subsplit)
    new_X_train_subsplit = YOUTUBE_preProcessing(X_train_subsplit)

    # delete x/y where there is no more content after preprocessing
    # (e.g. comment was only an url) or where the content is too long
    new_X_test_subsplit, y_test_subsplit = _drop_empty_or_long(
        new_X_test_subsplit, y_test_subsplit
    )
    new_X_train_subsplit, y_train_subsplit = _drop_empty_or_long(
        new_X_train_subsplit, y_train_subsplit
    )

    input_texts_original = list(new_X_train_subsplit)
    input_texts_original_test = list(new_X_test_subsplit)

    return input_texts_original, y_train_subsplit, input_texts_original_test, y_test_subsplit


In [6]:
# Name of the dataset to process; it selects the CSV file and the label handling.
dataset_name = 'polarity'

csv_path = 'data/' + dataset_name + '_tweets.csv'
res = OPTIMUS_get_text_data(
    num_samples=20000,
    data_path=csv_path,
    dataset=dataset_name,
)
# Unpack into training texts/labels and test texts/labels.
(input_texts, y_train, input_texts_test, y_test) = res

In [8]:
# Save data and labels.
# The pickled .data files are the input for sentence generation.

import pickle

training_data_file = dataset_name +'_training_data.data'
with open(training_data_file, 'wb') as out:
    pickle.dump(input_texts, out)

# Currently disabled: the test data and both label arrays are written the same way.
#with open(dataset_name +'_test_data.data', 'wb') as filehandle:
#    pickle.dump(input_texts_test, filehandle)
    
#with open(dataset_name +'_training_labels.data', 'wb') as filehandle:
#    pickle.dump(y_train, filehandle)

#with open(dataset_name +'_test_labels.data', 'wb') as filehandle:
#    pickle.dump(y_test, filehandle)

In [7]:
# save data
# the input for VAE training is .txt

#file = open(dataset_name + "_training_data.txt", "w")

#for i in range(len(input_texts)):
#    file.write(input_texts[i])
#    file.write("\n")
#file.close() 

#file = open(dataset_name + "_test_data.txt", "w")

#for i in range(len(input_texts_test)):
#    file.write(input_texts_test[i])
#    file.write("\n")
#file.close() 