## imports

In [3]:
from aug import *
from functions import *
from tqdm import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from scipy.spatial import distance

from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

from keras import backend as K
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #get rid of warnings

2022-11-15 02:45:04.518925: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-15 02:45:04.650799: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-15 02:45:05.141395: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-15 02:45:05.141438: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

[H[2J

## load model

In [None]:
model = load_model('eda_code/output/pc.h5')
model.summary()

In [None]:
# intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('dense_1').output)
# intermediate_layer_model_with_k_function = K.function([model.input], [model.get_layer('dense_1').output])
# get_4rd_layer_output = K.function([model.layers[0].input],[model.layers[5].output])

## define functions

In [2]:
def train_x(train_txt, word2vec_len, input_size, word2vec):

	#read in lines
	train_lines = open(train_txt, 'r').readlines()
	num_lines = len(train_lines)

	x_matrix = np.zeros((num_lines, input_size, word2vec_len))

	#insert values
	for i, line in enumerate(train_lines):

		parts = line[:-1].split('\t')
		label = int(parts[0])
		sentence = parts[1]	

		#insert x
		words = sentence.split(' ')
		words = words[:x_matrix.shape[1]] #cut off if too long
		for j, word in enumerate(words):
			if word in word2vec:
				x_matrix[i, j, :] = word2vec[word]

	return x_matrix






def get_tsne_labels(file,num_aug):
	labels = []
	alphas = []
	lines = open(file, 'r').readlines()
	for i, line in enumerate(lines):
		parts = line[:-1].split('\t')
		_class = int(parts[0])
		alpha = i % (num_aug+1)
		if alpha == 0:
			labels.append(_class+100)
			alphas.append(alpha)
		else:
			labels.append(_class)
			alphas.append(alpha)
	return labels, alphas

def get_plot_vectors(layer_output,perplexity=30,n_iter=1000,random_state=0,method='barnes_hut',learning_rate=200):

	tsne = TSNE(n_components=2,perplexity=perplexity,n_iter=n_iter,random_state=random_state,method=method,learning_rate=learning_rate).fit_transform(layer_output)
	return tsne

def plot_tsne(tsne, labels, output_path):

	label_to_legend_label = {'output/pc_last_dense_tsne.png':{	0:'Con (augmented)', 
															100:'Con (original)', 
															1: 'Pro (augmented)', 
								 							101:'Pro (original)'}
								,'output/cr_last_dense_tsne.png':{	0:'Con (augmented)', 
															100:'Con (original)', 
															1: 'Pro (augmented)', 
								 							101:'Pro (original)'}
								,'output/subj_last_dense_tsne.png':{	0:'Con (augmented)', 
															100:'Con (original)', 
															1: 'Pro (augmented)', 
								 							101:'Pro (original)'}}
								# 'outputs_f4/trec_last_dense_tsne.png':{0:'Description (augmented)',
								# 							100:'Description (original)',
								# 							1:'Entity (augmented)',
								# 							101:'Entity (original)',
								# 							2:'Abbreviation (augmented)',
								# 							102:'Abbreviation (original)',
								# 							3:'Human (augmented)',
								# 							103:'Human (original)',
								# 							4:'Location (augmented)',
								# 							104:'Location (original)',
								# 							5:'Number (augmented)',
								# 							105:'Number (original)'}}

	plot_to_legend_size = {'output/cr_last_dense_tsne.png':6,'output/pc_last_dense_tsne.png':6,'output/subj_last_dense_tsne.png':6}

	labels = labels#.tolist() 
	big_groups = [label for label in labels if label < 100]
	big_groups = list(sorted(set(big_groups)))
	
	colors = ['b', 'g']#, 'r', 'c', 'm', 'y', 'k', '#ff1493', '#FF4500']
	fig, ax = plt.subplots()

	for big_group in big_groups:
		
		for group in [big_group, big_group+100]:
			
			x, y = [], []

			for j, label in enumerate(labels):
				if label == group:
				
					x.append(tsne[j][0])
					y.append(tsne[j][1])

			#params
			color = colors[int(group % 100)]
			marker = 'o' if group in[0,100] else '^'
			size = 1 if group < 100 else 40
			fillstyles = color if group < 100 else 'none'
		

			legend_label = label_to_legend_label[output_path][group]
			
			ax.scatter(x, y, color=color, marker=marker, s=size, facecolors=fillstyles, label=legend_label)
			plt.axis('off')

	legend_size = plot_to_legend_size[output_path]
	plt.legend(loc='best',prop={'size': legend_size})
	
	plt.figure(dpi=1000)
	plt.show()




## train model

In [3]:
def build_model(sentence_length, word2vec_len, num_classes):
    model = None
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(model.summary())
    return model

In [4]:
word2vec_len = 300
input_size = 25
num_classes = 2


In [5]:
dict_of_train_datasets = {'pc':'data/pc/train.txt','cr':'data/cr/train.txt','subj':'data/subj/train.txt'}
dict_of_test_datasets = {'pc':'data/pc/test.txt','cr':'data/cr/test.txt','subj':'data/subj/test.txt'}

dict_of_30_samples = {'pc':'data/pc/30_samples.txt','cr':'data/cr/30_samples.txt','subj':'data/subj/30_samples.txt'}

dict_of_models = {'pc':'models/pc_model.h5','cr':'models/cr_model.h5','subj':'models/subj_model.h5'}

dict_of_word2vec_files = {'pc':pickle.load(open('data/pc/word2vec.p', 'rb')),
                        'cr':pickle.load(open('data/cr/word2vec.p', 'rb')),
                        'subj':pickle.load(open('data/subj/word2vec.p', 'rb'))}
# laod data
df_train_pc = load_data('data/pc/train.txt')
df_train_cr = load_data('data/cr/train.txt')
df_train_subj = load_data('data/subj/train.txt')
df_test_pc = load_data('data/pc/test.txt')
df_test_cr = load_data('data/cr/test.txt')
df_test_subj = load_data('data/subj/test.txt')

In [6]:
def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_dataset):

    #read in lines
    train_lines = open(train_txt, 'r').readlines()
    shuffle(train_lines)
    train_lines = train_lines[:int(percent_dataset*len(train_lines))]
    num_lines = len(train_lines)

    #initialize x and y matrix
    x_matrix = None
    y_matrix = None

    try:
        x_matrix = np.zeros((num_lines, input_size, word2vec_len))
    except:
        print("Error!", num_lines, input_size, word2vec_len)
    y_matrix = np.zeros((num_lines, num_classes))

    #insert values
    for i, line in enumerate(train_lines):

        parts = line[:-1].split('\t')
        label = int(parts[0])
        sentence = parts[1]	

        #insert x
        words = sentence.split(' ')
        words = words[:x_matrix.shape[1]] #cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

        #insert y
        y_matrix[i][label] = 1.0

    return x_matrix, y_matrix

In [7]:
#one hot to categorical
def one_hot_to_categorical(y):
    assert len(y.shape) == 2
    return np.argmax(y, axis=1)
#load data
def run_model(dataset_name):
	train_x, train_y = get_x_y(dict_of_train_datasets[dataset_name], num_classes, word2vec_len, input_size, dict_of_word2vec_files[dataset_name], 1)
	test_x, test_y = get_x_y(dict_of_test_datasets[dataset_name], num_classes, word2vec_len, input_size, dict_of_word2vec_files[dataset_name], 1)

	#build model
	model = build_model(input_size, word2vec_len, num_classes)

	callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

	#train model
	model.fit(	train_x, 
					train_y, 
					epochs=100000, 
					callbacks=callbacks,
					validation_split=0.1, 
					batch_size=1024, 
					shuffle=True, 
					verbose=0)
	#save the model
	model.save(dict_of_models[dataset_name])


	#evaluate model
	y_pred = model.predict(test_x)
	test_y_cat = one_hot_to_categorical(test_y)
	y_pred_cat = one_hot_to_categorical(y_pred)
	acc = accuracy_score(test_y_cat, y_pred_cat)

	#clean memory???
	train_x, train_y = None, None

	#return the accuracy
	#print("data with shape:", train_x.shape, train_y.shape, 'train=', train_file, 'test=', test_file, 'with fraction', percent_dataset, 'had acc', acc)
	return acc

In [8]:
run_model('pc')

2022-11-14 13:44:10.463010: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 13:44:10.463683: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-11-14 13:44:10.463692: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-11-14 13:44:10.463836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN



0.9235273854633138

In [None]:
run_model('cr')

In [None]:
run_model('subj')

## Augmenting data

In [None]:
def aug_samples(dataset_name):
    data = load_data(dict_of_30_samples['pc'])
    methods = ['eda_augmenter','wordnet_augmenter','aeda_augmenter','backtranslation_augmenter']
    for method in methods:
        augmented_data = augment_text(data, method,fraction=1,pct_words_to_swap=0.2 ,transformations_per_example=1,
                    label_column='class',target_column='text',include_original=True)
        augmented_data = augmented_data[['class','text']]
        np.savetxt(f'data/{dataset_name}/{method}_augmented.txt', augmented_data.values, fmt='%s', delimiter='\t')

In [None]:
aug_samples('pc')

In [None]:
aug_samples('cr')

In [None]:
aug_samples('subj')

## TSNE

In [None]:
dict_of_eda_augmented_datasets = {'pc':'data/pc/eda_augmenter_augmented.txt', 'cr':'data/cr/eda_augmenter_augmented.txt', 'subj':'data/subj/eda_augmenter_augmented.txt'}
dict_of_wordnet_augmented_datasets = {'pc':'data/pc/wordnet_augmenter_augmented.txt', 'cr':'data/cr/wordnet_augmenter_augmented.txt', 'subj':'data/subj/wordnet_augmenter_augmented.txt'}
dict_of_aeda_augmented_datasets = {'pc':'data/pc/aeda_augmenter_augmented.txt', 'cr':'data/cr/aeda_augmenter_augmented.txt', 'subj':'data/subj/aeda_augmenter_augmented.txt'}
dict_of_backtranslation_augmented_datasets = {'pc':'data/pc/backtranslation_augmenter_augmented.txt', 'cr':'data/cr/backtranslation_augmenter_augmented.txt', 'subj':'data/subj/backtranslation_augmenter_augmented.txt'}

dict_of_aug_methods = {'eda':dict_of_eda_augmented_datasets, 'wordnet':dict_of_wordnet_augmented_datasets, 'aeda':dict_of_aeda_augmented_datasets, 'backtranslation':dict_of_backtranslation_augmented_datasets}


def tsne_plot(dataset_name,method_name):

    
    
    model = load_model(dict_of_models[dataset_name])
    
    model.summary()

    word2vec = dict_of_word2vec_files[dataset_name]
    word2vec_len = 300
    input_size = 25

    original_file = dict_of_30_samples[dataset_name]
    augmented_file = dict_of_aug_methods[method_name][dataset_name]
    


    X = train_x(augmented_file, word2vec_len, input_size, word2vec)
    labels, _ = get_tsne_labels(augmented_file,1)


    intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(model.layers[5].name).output)
    layer_output = intermediate_layer_model.predict(X)

    #layer_output = K.function([model.layers[0].input],[model.layers[4].output])([X])[0]

    t = get_plot_vectors(layer_output,perplexity=30,n_iter=5000,random_state=10,method='exact')

    plot_tsne(t, labels, 'output/pc_last_dense_tsne.png')

In [None]:
tsne_plot('pc','aeda')

In [None]:
tsne_plot('pc','eda')

In [None]:
tsne_plot('pc','wordnet')

In [None]:
tsne_plot('pc','backtranslation')

In [None]:
tsne_plot('cr','eda')

In [None]:
tsne_plot('cr','aeda')

In [None]:
tsne_plot('cr','wordnet')

In [None]:
tsne_plot('cr','backtranslation')

In [None]:
tsne_plot('subj','eda')

In [None]:
tsne_plot('subj','aeda')

In [None]:
tsne_plot('subj','wordnet')

In [None]:
tsne_plot('subj','backtranslation')

## Plotly TSNE

In [None]:
model = load_model(dict_of_models['subj'])
model.summary()

In [None]:
word2vec = dict_of_word2vec_files['subj']
word2vec_len = 300
input_size = 25

In [6]:
dict_of_eda_augmented_datasets = {'pc':'data/pc/eda_augmenter_augmented.txt', 'cr':'data/cr/eda_augmenter_augmented.txt', 'subj':'data/subj/eda_augmenter_augmented.txt'}
dict_of_wordnet_augmented_datasets = {'pc':'data/pc/wordnet_augmenter_augmented.txt', 'cr':'data/cr/wordnet_augmenter_augmented.txt', 'subj':'data/subj/wordnet_augmenter_augmented.txt'}
dict_of_aeda_augmented_datasets = {'pc':'data/pc/aeda_augmenter_augmented.txt', 'cr':'data/cr/aeda_augmenter_augmented.txt', 'subj':'data/subj/aeda_augmenter_augmented.txt'}
dict_of_backtranslation_augmented_datasets = {'pc':'data/pc/backtranslation_augmenter_augmented.txt', 'cr':'data/cr/backtranslation_augmenter_augmented.txt', 'subj':'data/subj/backtranslation_augmenter_augmented.txt'}

dict_of_aug_methods = {'eda':dict_of_eda_augmented_datasets, 'wordnet':dict_of_wordnet_augmented_datasets, 'aeda':dict_of_aeda_augmented_datasets, 'backtranslation':dict_of_backtranslation_augmented_datasets}


In [None]:
augmented_file = dict_of_aug_methods['eda']['subj']

In [None]:
print(augmented_file)

In [None]:
X = train_x(augmented_file, word2vec_len, input_size, word2vec)

In [None]:

def plotly_tsne(df, df_name, method):
    total_distance = df['distance'].sum()
    fig = px.scatter(df, x='x', y='y', color='color'
                                , size='size'
                                , symbol='symbol'
                                , title=f't-SNE plot of {df_name} dataset with {method} augmentation method, total distance: {total_distance:.4f}'
                                , custom_data=[df['text'],df['label'],df['distance']]
                                #,labels={'label':'bbb', 'y':''}                   
                                ).update_traces(hovertemplate='Text: %{customdata[0]} <br>' +
                                                                            'Label: %{customdata[1]} <br>' +
                                                                            'Distance: %{customdata[2]:.5f} <br>' )
    fig.show()
                       

def cal_distance(df,number_of_augmentation_per_sample):
    dists = []
    for i in range(0,len(df),number_of_augmentation_per_sample+1):
        point_a = df[['x','y']].iloc[i]
        point_b = df[['x','y']].iloc[i+1]
        dist = distance.euclidean(point_a, point_b)
        dists.append(dist)
    dists = np.repeat(dists, 2)
    return dists


labels, _ = get_tsne_labels(augmented_file,1)

def label_to_str_map(x):
    if x == 0:
        return 'Con (augmented)'
    elif x == 1:
        return 'Pro (augmented)'
    elif x == 100:
        return 'Con (original)'
    elif x == 101:
        return 'Pro (original)'

def label_to_color_map(x):
    if x == 100 or x == 0:
        return 'red'
    elif x == 1 or x == 101:
        return 'blue'

def label_to_size_map(x):
    if x >= 100:
        return 100
    elif x < 100:
        return 10

def label_to_symbol_map(x):
    if x == 0 or x == 100:
        return '^'
    elif x == 1 or x == 101:
        return 'o'

labels_str = list(map(label_to_str_map,labels))
labels_color = list(map(label_to_color_map,labels))
labels_size = list(map(label_to_size_map,labels))
labels_symbol = list(map(label_to_symbol_map,labels))


In [None]:
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(model.layers[5].name).output)
layer_output = intermediate_layer_model.predict(X)

In [None]:
t = get_plot_vectors(layer_output,perplexity=30,n_iter=5000,random_state=10,method='exact')

In [None]:
df_t = pd.DataFrame(t, columns=['x','y'])
df_text = load_data(augmented_file)



df_text = df_text[['text']]
df_t['distance'] = cal_distance(df_t,1)
df_t['text'] = df_text['text']
df_t['label'] = labels_str
df_t['color'] = labels_color
df_t['size'] = labels_size
df_t['symbol'] = labels_symbol



In [None]:
def plotly_tsne(df, df_name, method):
    total_distance = df['distance'].sum()/2
    fig = px.scatter(df, x='x', y='y', color='color'
                                , size='size'
                                , symbol='symbol'
                                , title=f't-SNE plot of {df_name} dataset with {method} augmentation method, total distance: {total_distance:.4f}'
                                , custom_data=[df['text'],df['label'],df['distance']]
                                ).update_traces(hovertemplate='Text: %{customdata[0]} <br>' +
                                                                            'Label: %{customdata[1]} <br>' +
                                                                            'Distance: %{customdata[2]:.5f} <br>' )
    fig.update_layout(showlegend=False)
    fig.show()

In [None]:
plotly_tsne(df_t, 'pc','eda')

## experimental

In [100]:
def train_x(train_txt, word2vec_len, input_size, word2vec):

	#read in lines
	train_lines = open(train_txt, 'r').readlines()
	num_lines = len(train_lines)

	x_matrix = np.zeros((num_lines, input_size, word2vec_len))

	#insert values
	for i, line in enumerate(train_lines):

		parts = line[:-1].split('\t')
		label = int(parts[0])
		sentence = parts[1]	

		#insert x
		words = sentence.split(' ')
		words = words[:x_matrix.shape[1]] #cut off if too long
		for j, word in enumerate(words):
			if word in word2vec:
				x_matrix[i, j, :] = word2vec[word]

	return x_matrix





def plotly_tsne(df, df_name, method):
    total_distance = df['normailized_distance'].sum()/2
    fig = px.scatter(df, x='x', y='y', color='color'
                                , size='size'
                                , symbol='symbol'
                                , title=f't-SNE plot of {df_name} dataset with {method} augmentation method, total distance: {total_distance:.4f}'
                                , custom_data=[df['text'],df['label'],df['distance']]
                                #,labels={'label':'bbb', 'y':''}                   
                                ).update_traces(hovertemplate='Text: %{customdata[0]} <br>' +
                                                                            'Label: %{customdata[1]} <br>' +
                                                                            'Distance: %{customdata[2]:.5f} <br>' )
    fig.update_layout(showlegend=False)
    fig.show()
                       

def cal_distance(df,number_of_augmentation_per_sample):
    dists = []
    for i in range(0,len(df),number_of_augmentation_per_sample+1):
        point_a = df[['x','y']].iloc[i]
        point_b = df[['x','y']].iloc[i+1]
        dist = distance.euclidean(point_a, point_b)
        dists.append(dist)
    dists = np.repeat(dists, 2)
    return dists



def label_to_str_map(x):
    if x == 0:
        return 'Con (augmented)'
    elif x == 1:
        return 'Pro (augmented)'
    elif x == 100:
        return 'Con (original)'
    elif x == 101:
        return 'Pro (original)'

def label_to_color_map(x):
    if x == 100 or x == 0:
        return 'red'
    elif x == 1 or x == 101:
        return 'blue'

def label_to_size_map(x):
    if x >= 100:
        return 100
    elif x < 100:
        return 10

def label_to_symbol_map(x):
    if x == 0 or x == 100:
        return '^'
    elif x == 1 or x == 101:
        return 'o'


def get_tsne_labels(file,num_aug):
	labels = []
	alphas = []
	lines = open(file, 'r').readlines()
	for i, line in enumerate(lines):
		parts = line[:-1].split('\t')
		_class = int(parts[0])
		alpha = i % (num_aug+1)
		if alpha == 0:
			labels.append(_class+100)
			alphas.append(alpha)
		else:
			labels.append(_class)
			alphas.append(alpha)
	return labels, alphas

def get_plot_vectors(layer_output,perplexity=30,n_iter=1000,random_state=0,method='barnes_hut',learning_rate=200):

	tsne = TSNE(n_components=2,perplexity=perplexity,n_iter=n_iter,random_state=random_state,method=method,learning_rate=learning_rate).fit_transform(layer_output)
	return tsne


def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_dataset):

    #read in lines
    train_lines = open(train_txt, 'r').readlines()
    shuffle(train_lines)
    train_lines = train_lines[:int(percent_dataset*len(train_lines))]
    num_lines = len(train_lines)

    #initialize x and y matrix
    x_matrix = None
    y_matrix = None

    try:
        x_matrix = np.zeros((num_lines, input_size, word2vec_len))
    except:
        print("Error!", num_lines, input_size, word2vec_len)
    y_matrix = np.zeros((num_lines, num_classes))

    #insert values
    for i, line in enumerate(train_lines):

        parts = line[:-1].split('\t')
        label = int(parts[0])
        sentence = parts[1]	

        #insert x
        words = sentence.split(' ')
        words = words[:x_matrix.shape[1]] #cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

        #insert y
        y_matrix[i][label] = 1.0

    return x_matrix, y_matrix


def one_hot_to_categorical(y):
    assert len(y.shape) == 2
    return np.argmax(y, axis=1)

In [101]:
dict_of_train_datasets = {'pc':'data/pc/train.txt','cr':'data/cr/train.txt','subj':'data/subj/train.txt'}
dict_of_test_datasets = {'pc':'data/pc/test.txt','cr':'data/cr/test.txt','subj':'data/subj/test.txt'}

dict_of_30_samples = {'pc':'data/pc/30_samples.txt','cr':'data/cr/30_samples.txt','subj':'data/subj/30_samples.txt'}

dict_of_models = {'pc':'models/pc_model.h5','cr':'models/cr_model.h5','subj':'models/subj_model.h5'}

dict_of_word2vec_files = {'pc':pickle.load(open('data/pc/word2vec.p', 'rb')),
                        'cr':pickle.load(open('data/cr/word2vec.p', 'rb')),
                        'subj':pickle.load(open('data/subj/word2vec.p', 'rb'))}


dict_of_eda_augmented_datasets = {'pc':'data/pc/eda_augmenter_augmented.txt', 'cr':'data/cr/eda_augmenter_augmented.txt', 'subj':'data/subj/eda_augmenter_augmented.txt'}
dict_of_wordnet_augmented_datasets = {'pc':'data/pc/wordnet_augmenter_augmented.txt', 'cr':'data/cr/wordnet_augmenter_augmented.txt', 'subj':'data/subj/wordnet_augmenter_augmented.txt'}
dict_of_aeda_augmented_datasets = {'pc':'data/pc/aeda_augmenter_augmented.txt', 'cr':'data/cr/aeda_augmenter_augmented.txt', 'subj':'data/subj/aeda_augmenter_augmented.txt'}
dict_of_backtranslation_augmented_datasets = {'pc':'data/pc/backtranslation_augmenter_augmented.txt', 'cr':'data/cr/backtranslation_augmenter_augmented.txt', 'subj':'data/subj/backtranslation_augmenter_augmented.txt'}

dict_of_aug_methods = {'eda':dict_of_eda_augmented_datasets, 'wordnet':dict_of_wordnet_augmented_datasets, 'aeda':dict_of_aeda_augmented_datasets, 'backtranslation':dict_of_backtranslation_augmented_datasets}


In [103]:
model = load_model(dict_of_models['cr'])

In [104]:
word2vec = dict_of_word2vec_files['cr']
word2vec_len = 300
input_size = 25

In [105]:
augmented_file = dict_of_aug_methods['eda']['cr']

In [106]:
X = train_x(augmented_file, word2vec_len, input_size, word2vec)

In [107]:
labels, _ = get_tsne_labels(augmented_file,1)

In [108]:
labels_str = list(map(label_to_str_map,labels))
labels_color = list(map(label_to_color_map,labels))
labels_size = list(map(label_to_size_map,labels))
labels_symbol = list(map(label_to_symbol_map,labels))

In [109]:
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(model.layers[4].name).output)
layer_output = intermediate_layer_model.predict(X)



In [110]:
t = get_plot_vectors(layer_output,perplexity=30,n_iter=10000,random_state=10,method='barnes_hut')

In [125]:
df_t = pd.DataFrame(t, columns=['x','y'])
df_t['standardized_x'] = df_t['x'].apply(lambda x: (x - df_t['x'].mean()) / df_t['x'].std())
df_t['standardized_y'] = df_t['y'].apply(lambda x: (x - df_t['y'].mean()) / df_t['y'].std())

df_text = load_data(augmented_file)


df_t['ground_truth'] = [int(i) for i in df_text['class']]
df_t['predicted_label'] = list(one_hot_to_categorical(model.predict(X)))

df_text = df_text[['text']]

df_t['distance'] = cal_distance(df_t,1)
df_t['normailized_distance'] = df_t['distance'].apply(lambda x: (x - df_t['distance'].min()) / (df_t['distance'].max() - df_t['distance'].min()))

df_t['text'] = df_text['text']
df_t['label'] = labels_str
df_t['color'] = labels_color
df_t['size'] = labels_size
df_t['symbol'] = labels_symbol






In [112]:
plotly_tsne(df_t, 'cr','eda')

In [71]:
plotly_tsne(df_t, 'subj','eda')

In [78]:
plotly_tsne(df_t, 'subj','eda')

In [34]:
pred = one_hot_to_categorical(layer_output)

In [35]:
pred

array([ 9, 11,  9,  9,  9,  9,  9,  9,  9, 15,  9, 11, 11,  9, 11,  9, 11,
       11, 11,  9, 11,  9, 15,  9,  9,  9,  9,  9, 11, 11, 11, 11, 11, 11,
        9,  9, 11, 11,  9,  9, 11, 11,  9,  9,  9, 11, 11, 11,  9,  9,  9,
        9, 11, 15,  9, 11, 11,  9, 11, 11])

In [37]:
layer_output.shape

(60, 20)

In [113]:
accuracy_score(df_t['ground_truth'],df_t['predicted_label'])

0.7

In [116]:
df_t['standardized_x'] = df_t['x'].apply(lambda x: (x - df_t['x'].mean()) / df_t['x'].std())
df_t['standardized_y'] = df_t['y'].apply(lambda x: (x - df_t['y'].mean()) / df_t['y'].std())

In [117]:
df_t

Unnamed: 0,x,y,ground_truth,predicted_label,distance,normailized_distance,text,label,color,size,symbol,standardized_distance,standardized_x,standardized_y
0,7.599387,-8.286252,0,1,1.047088,0.406031,"No USB and no optical zoom, not great in low l...",Con (original),red,100,^,0.610531,1.046947,-1.035637
1,8.402977,-8.957546,0,0,1.047088,0.406031,"No USB and no optic zoom, not swell in low light",Con (augmented),red,10,^,0.610531,1.50107,-1.448502
2,3.442508,-4.302541,1,1,0.313798,0.119471,"Buy it! Great first digital camera, Comes with...",Pro (original),blue,100,o,-0.481574,-1.302179,1.414464
3,3.751723,-4.355977,1,1,0.313798,0.119471,"Buy it! charger first digital great, Comes wit...",Pro (augmented),blue,10,o,-0.481574,-1.127436,1.3816
4,6.661739,-7.427606,0,1,1.662913,0.646688,"Port cover, antenna when extended, takes a bit...",Con (original),red,100,^,1.527694,0.517065,-0.507543
5,7.748543,-8.686231,0,0,1.662913,0.646688,"Port cover, antenna extended, takes a bit to f...",Con (augmented),red,10,^,1.527694,1.131238,-1.281635
6,3.365581,-4.317281,1,1,0.300296,0.114194,"Easy to use, great battery life, variety of ri...",Pro (original),blue,100,o,-0.501684,-1.345652,1.405399
7,3.214299,-4.576686,1,1,0.300296,0.114194,"Easy to use, great battery life, of ringtones ...",Pro (augmented),blue,10,o,-0.501684,-1.431144,1.245857
8,6.662807,-7.439452,0,1,0.670384,0.25882,Sometimes more than one sheet of paper is pull...,Con (original),red,100,^,0.049497,0.517669,-0.514829
9,6.176991,-6.977502,0,1,0.670384,0.25882,Sometimes through than one sheet of paper is p...,Con (augmented),red,10,^,0.049497,0.243126,-0.230717


In [120]:
def plotly_standard_tsne(df, df_name, method):
    total_distance = df['normailized_distance'].sum()/2
    fig = px.scatter(df, x='standardized_x', y='standardized_y', color='color'
                                , size='size'
                                , symbol='symbol'
                                , title=f't-SNE plot of {df_name} dataset with {method} augmentation method, total distance: {total_distance:.4f}'
                                , custom_data=[df['text'],df['label'],df['distance']]
                                #,labels={'label':'bbb', 'y':''}                   
                                ).update_traces(hovertemplate='Text: %{customdata[0]} <br>' +
                                                                            'Label: %{customdata[1]} <br>' +
                                                                            'Distance: %{customdata[2]:.5f} <br>' )
    fig.update_layout(showlegend=False)
    fig.show()

In [121]:
plotly_standard_tsne(df_t, 'cr','eda')

In [124]:
df_t

Unnamed: 0,x,y,ground_truth,predicted_label,distance,normailized_distance,text,label,color,size,symbol,standardized_x,standardized_y
0,7.599387,-8.286252,0,1,1.047088,0.406031,"No USB and no optical zoom, not great in low l...",Con (original),red,100,^,1.046947,-1.035637
1,8.402977,-8.957546,0,0,1.047088,0.406031,"No USB and no optic zoom, not swell in low light",Con (augmented),red,10,^,1.50107,-1.448502
2,3.442508,-4.302541,1,1,0.313798,0.119471,"Buy it! Great first digital camera, Comes with...",Pro (original),blue,100,o,-1.302179,1.414464
3,3.751723,-4.355977,1,1,0.313798,0.119471,"Buy it! charger first digital great, Comes wit...",Pro (augmented),blue,10,o,-1.127436,1.3816
4,6.661739,-7.427606,0,1,1.662913,0.646688,"Port cover, antenna when extended, takes a bit...",Con (original),red,100,^,0.517065,-0.507543
5,7.748543,-8.686231,0,0,1.662913,0.646688,"Port cover, antenna extended, takes a bit to f...",Con (augmented),red,10,^,1.131238,-1.281635
6,3.365581,-4.317281,1,1,0.300296,0.114194,"Easy to use, great battery life, variety of ri...",Pro (original),blue,100,o,-1.345652,1.405399
7,3.214299,-4.576686,1,1,0.300296,0.114194,"Easy to use, great battery life, of ringtones ...",Pro (augmented),blue,10,o,-1.431144,1.245857
8,6.662807,-7.439452,0,1,0.670384,0.25882,Sometimes more than one sheet of paper is pull...,Con (original),red,100,^,0.517669,-0.514829
9,6.176991,-6.977502,0,1,0.670384,0.25882,Sometimes through than one sheet of paper is p...,Con (augmented),red,10,^,0.243126,-0.230717


In [1]:
def aug_samples(dataset_name):
    data = load_data(dict_of_30_samples[dataset_name])
    methods = ['eda_augmenter','wordnet_augmenter','aeda_augmenter','backtranslation_augmenter']
    for method in methods:
        augmented_data = augment_text(data, method,fraction=1,pct_words_to_swap=0.2 ,transformations_per_example=1,
                    label_column='class',target_column='text',include_original=True)
        augmented_data = augmented_data[['class','text']]
        np.savetxt(f'data/{dataset_name}/{method}_augmented.txt', augmented_data.values, fmt='%s', delimiter='\t')


In [7]:
aug_samples('cr')

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

In [8]:
aug_samples('subj')

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J

[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/peyman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


[H[2J