In [1]:
import os
# Must be set before keras/TensorFlow is imported (the star imports below may
# pull TensorFlow in as well); assigning it afterwards does not silence the
# start-up log messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #get rid of warnings

from aug import *
from functions import *
from tqdm import tqdm

import plotly.graph_objects as go

import numpy as np
import pandas as pd
import pickle

from scipy.spatial import distance

from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

from keras import backend as K
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping

2023-01-20 19:07:25.073214: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-20 19:07:25.236640: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-20 19:07:25.710985: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-20 19:07:25.711027: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

[H[2J

In [46]:

def plotly_tsne(df, df_name, method,annotations=False):
    """Plot a 2-D embedding as a Plotly scatter, save it as a PNG and show it.

    The frame is modified in place: the columns ``normalized_distance``
    (min-max scaled ``distance``), ``standardized_x`` / ``standardized_y``
    (z-scored ``x`` / ``y``) and ``is_original`` are added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y``, ``distance``, ``text`` and
        ``label``. Labels starting with ``Pro`` / ``Con`` select the marker
        style; labels containing ``original`` are drawn large and translucent.
    df_name, method : str
        Only used to build the output path ``images/{df_name}_{method}.png``.
    annotations : bool, default False
        When true, up to six points whose normalized distance is below 0.1
        are annotated with their text.
    """
    # Vectorised column arithmetic: the statistics are computed once instead of
    # once per row as a row-wise ``apply`` would do.
    dist_col = df['distance']
    dist_min, dist_max = dist_col.min(), dist_col.max()
    df['normalized_distance'] = (dist_col - dist_min) / (dist_max - dist_min)
    df['standardized_x'] = (df['x'] - df['x'].mean()) / df['x'].std()
    df['standardized_y'] = (df['y'] - df['y'].mean()) / df['y'].std()
    df['is_original'] = df['label'].str.contains('original')

    # NOTE(review): the sum is halved presumably because `distance` carries each
    # original/augmented pair's distance on both rows (see cal_distance) - confirm.
    total_distance = df['normalized_distance'].sum()/2

    fig = go.Figure()
    for (label, is_original), group in df.groupby(['label', 'is_original']):
        if label.startswith('Pro'):
            symbol, color, procon = 'diamond', 'red', 'Pro'
        elif label.startswith('Con'):
            symbol, color, procon = 'circle', 'blue', 'Con'
        else:
            # Neutral fallback so an unexpected label neither raises
            # UnboundLocalError nor silently reuses the previous group's style.
            symbol, color, procon = 'square', 'gray', str(label)
        size = 60 if is_original else 20
        opacity = 0.3 if is_original else 1

        # Built from named columns; positional indexing into a string-labelled
        # row Series is deprecated in recent pandas.
        hover_text = [
            f'Text: {text} <br> Label: {lab} <br> Distance: {dist_value:.5f}'
            for text, lab, dist_value in zip(group['text'], group['label'], group['distance'])
        ]
        fig.add_trace(go.Scatter(x=group['standardized_x'], y=group['standardized_y'],
                   mode='markers', marker=dict(size=size, color=color, symbol=symbol, opacity=opacity),
                   name=f'{procon} ({"Original" if is_original else "Augmented"})',
                   text=hover_text,
                   hovertemplate='%{text}'
                   ))

    layout = dict(title=f'total distance: {total_distance:.4f}',
                  showlegend=True,
                  font=dict(family="Courier New, monospace", size=50, color="black"))
    # The `annotations` flag is read here without being overwritten first, so
    # passing annotations=True actually produces annotations.
    if annotations:
        closest = df.loc[df['normalized_distance'] < 0.1].head(6)
        layout['annotations'] = [
            dict(x=row['standardized_x'],
                 y=row['standardized_y'],
                 text=row['text'],
                 showarrow=True,
                 arrowhead=1,
                 ax=10,
                 ay=10,
                 font=dict(size=10,
                           color='black',
                           family='Monospace'))
            for _, row in closest.iterrows()
        ]
    fig.update_layout(**layout)

    filename = f'images/{df_name}_{method}.png'
    # Create the output directory on first use so write_image does not fail.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # note that the image font and point size are set for the image to be saved with the below sizes. because of this, the image will look different when you open it in the browser
    fig.write_image(filename, width=3000, height=1000, scale=2)
    fig.show()





def cal_distance(df,number_of_augmentation_per_sample):
    """Return one Euclidean distance per row of ``df`` in the ``x``/``y`` plane.

    Rows are read as consecutive groups of
    ``number_of_augmentation_per_sample + 1`` rows: the first row of a group is
    the original sample, the remaining rows are its augmentations.

    * an augmented row gets its distance to the group's original;
    * the original row gets the mean of those distances.

    With one augmentation per sample this yields the pair distance on both
    rows of each pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``x`` and ``y`` columns.
    number_of_augmentation_per_sample : int
        Number of augmented rows following each original row (>= 1).

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(df)``.

    Raises
    ------
    ValueError
        If ``number_of_augmentation_per_sample`` is below 1 or ``len(df)`` is
        not a whole number of groups.
    """
    if number_of_augmentation_per_sample < 1:
        raise ValueError('number_of_augmentation_per_sample must be at least 1')
    group_size = number_of_augmentation_per_sample + 1
    if len(df) % group_size != 0:
        raise ValueError(
            f'expected a multiple of {group_size} rows '
            f'(1 original + {number_of_augmentation_per_sample} augmented), got {len(df)}'
        )

    coords = df[['x','y']].to_numpy(dtype=float)
    dists = np.empty(len(coords), dtype=float)
    for start in range(0, len(coords), group_size):
        original = coords[start]
        aug_dists = [distance.euclidean(original, coords[j])
                     for j in range(start + 1, start + group_size)]
        # The output always has one entry per input row, whatever the group size.
        dists[start] = np.mean(aug_dists)
        dists[start + 1:start + group_size] = aug_dists
    return dists


def train_x(train_txt, word2vec_len, input_size, word2vec):
    """Turn a ``label<TAB>sentence`` text file into a zero-padded embedding tensor.

    Parameters
    ----------
    train_txt : str
        Path to a file with one ``label<TAB>sentence`` record per line. The
        label column is not used here.
    word2vec_len : int
        Length of each word vector.
    input_size : int
        Number of word slots per sentence; longer sentences are truncated.
    word2vec : mapping
        Word -> vector of length ``word2vec_len``. Words missing from the
        mapping leave their slot at zero.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of lines, input_size, word2vec_len)``.
    """
    # Context manager so the handle is closed even if parsing fails later.
    with open(train_txt, 'r') as handle:
        train_lines = handle.readlines()

    x_matrix = np.zeros((len(train_lines), input_size, word2vec_len))

    for i, line in enumerate(train_lines):
        # Strip only the line terminator: slicing off the last character would
        # truncate the final word when the file has no trailing newline.
        parts = line.rstrip('\n').split('\t')
        sentence = parts[1]

        words = sentence.split(' ')[:input_size]  # cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

    return x_matrix




def get_plot_vectors(layer_output,perplexity=30,n_iter=1000,random_state=0,method='barnes_hut'):
    """Project ``layer_output`` to two dimensions with t-SNE.

    Parameters
    ----------
    layer_output : array-like
        Samples to embed, passed unchanged to ``TSNE.fit_transform``.
    perplexity, random_state, method
        Forwarded to ``sklearn.manifold.TSNE``.
    n_iter : int
        Number of optimisation iterations.

    Returns
    -------
    The value returned by ``TSNE.fit_transform`` (the 2-component embedding).
    """
    import inspect

    # scikit-learn 1.5 renamed TSNE's ``n_iter`` to ``max_iter`` and 1.7 removed
    # the old name; pass the iteration count under whichever keyword the
    # installed version accepts so this works on both sides of the rename.
    iter_keyword = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        random_state=random_state,
        method=method,
        **{iter_keyword: n_iter},
    )
    return tsne.fit_transform(layer_output)


def one_hot_to_categorical(y):
    """Collapse a 2-D score / one-hot matrix to the index of the largest entry per row.

    ``y`` must expose a two-element ``shape``; anything else trips the assertion.
    """
    rank = len(y.shape)
    assert rank == 2
    class_indices = np.argmax(y, axis=1)
    return class_indices


def get_tsne_labels(file,num_aug):
    """Read the class column of a ``label<TAB>text`` file and tag original rows.

    Lines are taken in consecutive groups of ``num_aug + 1``. The first line of
    each group has 100 added to its class id; the other lines keep the raw id.

    Parameters
    ----------
    file : str
        Path to the tab-separated file; the first field of every line must be
        an integer class id.
    num_aug : int
        Number of lines that follow each group's first line.

    Returns
    -------
    (labels, alphas) : tuple of lists
        ``labels[i]`` is the (possibly offset) class id of line ``i`` and
        ``alphas[i]`` is the line's position inside its group (0 = first).
    """
    original_offset = 100  # added to the class id of each group's first line
    group_size = num_aug + 1

    # Context manager so the handle is closed once the lines are read.
    with open(file, 'r') as handle:
        lines = handle.readlines()

    labels = []
    alphas = []
    for i, line in enumerate(lines):
        # Strip only the terminator; slicing the last character off would eat
        # the class id of a final line that has no trailing newline.
        _class = int(line.rstrip('\n').split('\t')[0])
        alpha = i % group_size
        labels.append(_class + original_offset if alpha == 0 else _class)
        alphas.append(alpha)
    return labels, alphas


def label_to_str_map(x):
    """Translate a numeric label code into its display string.

    0 / 1 are the augmented Con / Pro codes, 100 / 101 the original ones.
    Any other value yields ``None``.
    """
    code_names = (
        (0, 'Con (augmented)'),
        (1, 'Pro (augmented)'),
        (100, 'Con (original)'),
        (101, 'Pro (original)'),
    )
    # Ordered equality scan: same comparisons, in the same order, as an if/elif chain.
    for code, display_name in code_names:
        if x == code:
            return display_name
    return None

# def label_to_color_map(x):
#     if x == 100 or x == 0:
#         return 'red'
#     elif x == 1 or x == 101:
#         return 'blue'

# def label_to_size_map(x):
#     if x >= 100:
#         return 100
#     elif x < 100:
#         return 10

# def label_to_symbol_map(x):
#     if x == 0 or x == 100:
#         return '^'
#     elif x == 1 or x == 101:
#         return 'o'
        

In [47]:
def run(dataset, method):
    """Embed one augmented dataset with the trained model, plot it and return the frame.

    Steps: load the dataset's word2vec pickle and Keras model, feed the
    augmented file through the model up to an intermediate layer, reduce that
    layer's output to 2-D with t-SNE, attach distance/text/label columns and
    hand the frame to ``plotly_tsne``.

    Parameters
    ----------
    dataset : str
        One of ``'pc'``, ``'cr'``, ``'subj'``.
    method : str
        One of ``'eda'``, ``'wordnet'``, ``'aeda'``, ``'backtranslation'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y``, ``distance``, ``text``, ``label`` plus the helper
        columns that ``plotly_tsne`` adds in place.

    Raises
    ------
    KeyError
        If ``dataset`` or ``method`` is not one of the supported names.
    """
    known_datasets = ('pc', 'cr', 'subj')
    known_methods = ('eda', 'wordnet', 'aeda', 'backtranslation')
    if dataset not in known_datasets:
        raise KeyError(dataset)
    if method not in known_methods:
        raise KeyError(method)

    augmented_file = f'data/{dataset}/{method}_augmenter_augmented.txt'
    word2vec_file = f'data/{dataset}/word2vec.p'
    model_file = f'models/{dataset}_model.h5'

    word2vec_len = 300
    input_size = 25
    # Shared by label parsing and distance computation so the two cannot drift apart.
    # NOTE(review): assumes each original line is followed by exactly one augmented line - confirm.
    num_aug = 1
    # Index of the layer whose activations are embedded.
    feature_layer_index = 4

    # Only the requested dataset's vectors are loaded, and the handle is closed.
    # SECURITY: pickle.load can execute arbitrary code - only use trusted files.
    with open(word2vec_file, 'rb') as handle:
        word2vec = pickle.load(handle)

    labels, _ = get_tsne_labels(augmented_file, num_aug)
    labels_str = list(map(label_to_str_map, labels))

    model = load_model(model_file)
    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=model.layers[feature_layer_index].output)
    X = train_x(augmented_file, word2vec_len, input_size, word2vec)
    layer_output = intermediate_layer_model.predict(X)

    t = get_plot_vectors(layer_output,perplexity=30,n_iter=5000,random_state=10,method='exact')

    df = pd.DataFrame(t,columns=['x','y'])
    df_text = load_data(augmented_file)

    df['distance'] = cal_distance(df, num_aug)
    # NOTE(review): assigned by index alignment; presumably load_data returns one
    # row per file line with a default RangeIndex - confirm.
    df['text'] = df_text['text']
    df['label'] = labels_str

    plotly_tsne(df, dataset, method)
    return df
    

In [48]:
# Single configuration: 'pc' dataset with the 'backtranslation' method; the
# frame returned by run() is kept in `df`.
df = run('pc','backtranslation')



In [50]:
# Sweep every dataset / augmentation-method pair (dataset-major order); each
# run() call saves and shows its own figure, the returned frames are discarded.
datasets = ['pc','cr','subj']
methods = ['eda','wordnet','aeda','backtranslation']
for dataset, method in [(d, m) for d in datasets for m in methods]:
    run(dataset,method)























