In [42]:
from methods import *
from numpy.random import seed
from keras import backend as K
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
seed(0)

################################
#### get dense layer output ####
################################

#build the x input (numpy array of word vectors) from the text file; labels are read separately by get_tsne_labels
def train_x(train_txt, word2vec_len, input_size, word2vec):
	"""Build the word-vector input tensor for every line of a labelled text file.

	Each line is split on TAB and the second field is used as the sentence;
	the first field (the label) is ignored here.

	Args:
		train_txt: path of the text file to read.
		word2vec_len: length of each word vector (size of the last axis).
		input_size: number of word slots per sentence; longer sentences are cut off.
		word2vec: mapping supporting ``word in word2vec`` and ``word2vec[word]``.

	Returns:
		Array of shape (number of lines, input_size, word2vec_len). Slots for
		unknown words and for padding stay at zero.

	Raises:
		IndexError: if a line has no TAB-separated second field.
	"""

	#read in lines; the context manager closes the handle, which was previously leaked
	with open(train_txt, 'r') as handle:
		train_lines = handle.readlines()

	x_matrix = np.zeros((len(train_lines), input_size, word2vec_len))

	#insert values
	for i, line in enumerate(train_lines):

		#strip only the line terminator, so a final line without a newline keeps its last character
		sentence = line.rstrip('\n').split('\t')[1]

		words = sentence.split(' ')[:input_size] #cut off if too long
		for j, word in enumerate(words):
			if word in word2vec:
				x_matrix[i, j, :] = word2vec[word]

	return x_matrix

def get_dense_output(model_checkpoint, file, num_classes, layer_index=3):
	"""Run a saved model on a text file and return the activations of one layer.

	Reads the module-level names ``word2vec_len``, ``input_size`` and
	``word2vec``; they must be defined before this is called.

	Args:
		model_checkpoint: path handed to ``load_model``.
		file: path of the labelled text file fed through ``train_x``.
		num_classes: unused; kept so existing callers keep working.
		layer_index: index into ``model.layers`` of the layer whose output is
			returned. Defaults to 3, the value that used to be hard-coded.

	Returns:
		The selected layer's output for all lines of ``file``.
	"""

	x = train_x(file, word2vec_len, input_size, word2vec)

	#NOTE(review): load_model is presumably provided by `from methods import *` - confirm
	model = load_model(model_checkpoint)

	#backend function mapping the model input to the chosen intermediate layer's output
	get_layer_output = K.function([model.layers[0].input], [model.layers[layer_index].output])

	return get_layer_output([x])[0]

def get_tsne_labels(file):
	"""Read the plotting label and position-in-group index for every line.

	The integer before the first TAB is the class id. Lines are treated in
	runs of ten: the line at index 0, 10, 20, ... of the file gets
	``class + 100``, every other line keeps the plain class id.

	Args:
		file: path of the labelled text file.

	Returns:
		``(labels, alphas)`` - two lists with one entry per line, where
		``alphas[i] == i % 10``.

	Raises:
		ValueError: if the first field of a line is not an integer.
	"""
	#the context manager closes the handle, which was previously leaked
	with open(file, 'r') as handle:
		lines = handle.readlines()

	labels = []
	alphas = []
	for i, line in enumerate(lines):
		#int() ignores surrounding whitespace, so no manual newline chopping is needed
		_class = int(line.split('\t', 1)[0])
		alpha = i % 10
		labels.append(_class + 100 if alpha == 0 else _class)
		alphas.append(alpha)
	return labels, alphas

def get_plot_vectors(layer_output, perplexity=10, **tsne_kwargs):
	"""Project layer activations to two dimensions with t-SNE.

	Args:
		layer_output: data passed to ``TSNE.fit_transform``.
		perplexity: t-SNE perplexity. Defaults to 10, the value that used to be
			hard-coded.
		**tsne_kwargs: extra keyword arguments forwarded to ``TSNE``. Do not pass
			``n_components``; it is fixed to 2.

	Returns:
		The result of ``fit_transform`` on ``layer_output``.
	"""

	return TSNE(n_components=2, perplexity=perplexity, **tsne_kwargs).fit_transform(layer_output)

def plot_tsne(tsne, labels, output_path):
	"""Draw the 2-D embedding as a scatter plot and save it to ``output_path``.

	Label ids below 100 are drawn as small filled markers with the
	"(augmented)" legend text, ids of 100 and above as large hollow markers
	with the "(original)" legend text.

	Args:
		tsne: embedding indexable as ``tsne[j][0]`` / ``tsne[j][1]``.
		labels: one integer label id per row of ``tsne``.
		output_path: image path handed to ``savefig``. Paths without a dedicated
			legend table fall back to generic "Class N" legend text.
	"""

	label_to_legend_label = {'output/cr_tsne.png':{	0:'Con (augmented)',
															100:'Con (original)',
															1: 'Pro (augmented)',
															101:'Pro (original)'}}
	#disabled TREC legend ('outputs_f4/trec_tsne.png'): 0 Description, 1 Entity,
	#2 Abbreviation, 3 Human, 4 Location, 5 Number (same augmented/original split)

	plot_to_legend_size = {'output/cr_tsne.png':11}

	#unknown output paths no longer raise KeyError; they get generic labels and the same legend size
	legend_names = label_to_legend_label.get(output_path, {})
	legend_size = plot_to_legend_size.get(output_path, 11)

	#first two colours are unchanged; the rest only matter when there are more than two classes
	colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', '#ff1493', '#FF4500']

	#class ids are taken from every label, so a class that only has original rows is still drawn
	class_ids = sorted({label % 100 for label in labels})

	fig, ax = plt.subplots()
	try:
		for class_id in class_ids:

			for group in (class_id, class_id + 100):

				is_original = group >= 100
				points = [tsne[j] for j, label in enumerate(labels) if label == group]
				x = [point[0] for point in points]
				y = [point[1] for point in points]

				#params
				color = colors[class_id % len(colors)]
				marker = 'o' if class_id == 0 else '^'
				size = 27 if is_original else 1
				fillstyles = 'none' if is_original else color

				fallback = 'Class %d (%s)' % (class_id, 'original' if is_original else 'augmented')
				legend_label = legend_names.get(group, fallback)

				ax.scatter(x, y, color=color, marker=marker, s=size, facecolors=fillstyles, label=legend_label)

		ax.axis('off')
		ax.legend(prop={'size': legend_size})
		fig.savefig(output_path, dpi=1000)
	finally:
		#close, not just clear: a cleared figure stays registered and is shown as an empty figure
		plt.close(fig)








In [43]:
import plotly.express as px

#global variables (also read by get_dense_output through the module namespace)
word2vec_len = 300
input_size = 20

datasets = ['cr'] #['pc', 'trec']
num_classes_list =[2] #[2, 6]

#marker symbol written to the text dump for each label id
label_to_mark = {0:'o', 1:'^',100:'o', 101:'^'}

for i, dataset in enumerate(datasets):

    #load parameters
    model_checkpoint = 'output/' + dataset + '.h5'
    file = 'txt_for_test/' + dataset + '/test_aug.txt'
    num_classes = num_classes_list[i]
    word2vec_pickle = 'txt_for_test/' + dataset + '/word2vec.p'
    word2vec = load_pickle(word2vec_pickle)

    #do tsne
    layer_output = get_dense_output(model_checkpoint, file, num_classes)
    t = get_plot_vectors(layer_output)

    labels, alphas = get_tsne_labels(file)

    #dump "x y marker alpha" per point; the context manager flushes and closes the file,
    #which the bare open() never did
    #NOTE(review): the same path is rewritten for every dataset - confirm that is intended
    with open("output/new_tsne.txt", 'w') as writer:
        #separate index name so the outer loop variable i is not clobbered
        for row, (label, alpha) in enumerate(zip(labels, alphas)):
            line = str(t[row, 0]) + ' ' + str(t[row, 1]) + ' ' + str(label_to_mark[label]) + ' ' + str(alpha/10)
            writer.write(line + '\n')

    plot_tsne(t, labels, 'output/' + dataset + '_tsne.png')



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



<Figure size 640x480 with 0 Axes>

In [53]:
def tsne(layer_number,input_size,perplexity,n_iter):
    """Save one t-SNE scatter plot per dataset for a given layer and t-SNE setting.

    For every entry of the module-level ``datasets`` list this loads the
    checkpoint ``output/<dataset>.h5`` and the word vectors
    ``txt_for_test/<dataset>/word2vec.p``, feeds
    ``txt_for_test/<dataset>/test_aug.txt`` through the model, embeds the output
    of layer ``layer_number`` in 2-D and writes a PNG under ``output/tsne/``.

    Also reads the module-level ``word2vec_len`` and uses the module-level
    helpers ``train_x`` and ``get_tsne_labels``.

    Args:
        layer_number: index into ``model.layers`` of the layer to visualise.
        input_size: number of word slots per sentence passed to ``train_x``.
        perplexity: t-SNE perplexity.
        n_iter: value passed to ``TSNE`` as ``n_iter``.
    """
    import os  #local import: only needed to create the output directory

    #legend text per label id; ids of 100 and above get the "(original)" text
    legend_names = {0:'Con (augmented)', 100:'Con (original)', 1: 'Pro (augmented)', 101:'Pro (original)'}
    legend_size = 11
    #first two colours are unchanged; the rest only matter when there are more than two classes
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', '#ff1493', '#FF4500']

    def _save_plot(embedding, labels, output_path):
        """Scatter the embedding by label id and save the figure to output_path."""
        class_ids = sorted({label % 100 for label in labels})

        fig, ax = plt.subplots()
        try:
            for class_id in class_ids:
                for group in (class_id, class_id + 100):
                    is_original = group >= 100
                    points = [embedding[j] for j, label in enumerate(labels) if label == group]
                    x = [point[0] for point in points]
                    y = [point[1] for point in points]

                    #params
                    color = colors[class_id % len(colors)]
                    marker = 'o' if class_id == 0 else '^'
                    size = 27 if is_original else 1
                    fillstyles = 'none' if is_original else color

                    fallback = 'Class %d (%s)' % (class_id, 'original' if is_original else 'augmented')
                    legend_label = legend_names.get(group, fallback)

                    ax.scatter(x, y, color=color, marker=marker, s=size, facecolors=fillstyles, label=legend_label)

            ax.axis('off')
            ax.legend(prop={'size': legend_size})

            #savefig does not create missing directories
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            fig.savefig(output_path, dpi=1000)
        finally:
            #close, not just clear: cleared figures stay registered and pile up across the grid search
            plt.close(fig)

    for dataset in datasets:

        #load parameters
        model_checkpoint = 'output/' + dataset + '.h5'
        file = 'txt_for_test/' + dataset + '/test_aug.txt'
        word2vec_pickle = 'txt_for_test/' + dataset + '/word2vec.p'
        word2vec = load_pickle(word2vec_pickle)

        #activations of the requested layer for every line of the file
        x = train_x(file, word2vec_len, input_size, word2vec)
        model = load_model(model_checkpoint)
        get_layer_output = K.function([model.layers[0].input], [model.layers[layer_number].output])
        layer_output = get_layer_output([x])[0]

        #do tsne
        #NOTE(review): newer scikit-learn releases appear to rename n_iter to max_iter - confirm against the installed version
        embedding = TSNE(n_components=2,perplexity=perplexity,n_iter=n_iter).fit_transform(layer_output)

        labels, _alphas = get_tsne_labels(file)

        output_path = f'output/tsne/{dataset}_tsne_perplexity_{perplexity}__iteration_{n_iter}__layerNum_{layer_number}__inputSize_{input_size}.png'
        _save_plot(embedding, labels, output_path)


In [55]:
from tqdm import tqdm
#settings read by tsne(): embedding width, sentence length, datasets and the layer to visualise
word2vec_len = 300
input_size = 25
datasets = ['cr']
num_classes_list = [2]
layer_number = 4

#t-SNE grid: every perplexity is combined with every iteration count
perplexity = [20,30,40,50,60]
n_iter = [1000,2000,3000,4000,5000]

for perplexity_value in tqdm(perplexity):
    for n_iter_value in tqdm(n_iter):
        tsne(layer_number, input_size, perplexity_value, n_iter_value)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

100%|██████████| 5/5 [07:45<00:00, 93.16s/it] 

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The defaul

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>