<a href="https://colab.research.google.com/github/nicoostendorf/HCI/blob/master/SPINE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reproduction: Results on word2vec and GloVe embeddings

Set up utility functions for the model:  
https://github.com/jacobdanovitch/SPINE/blob/master/code/model/utils.py

In [None]:
import numpy as np
import logging
from sklearn.datasets import make_blobs
#logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.DEBUG)

############################################

class DataHandler:
	"""Load word embeddings from a text file and serve them in mini-batches.

	Each non-empty line of the input file is parsed as a word followed by
	whitespace-separated float components.
	"""

	def __init__(self):
		pass

	def loadData(self, filename):
		"""Read `filename` into `self.words` (list) and `self.data` (numpy array).

		Also sets `data_size` (rows), `inp_dim` (columns) and `original_data`
		(the rows in file order). Blank lines are skipped.
		"""
		logging.info("Loading data from " + filename )
		self.data = []
		self.words = []
		# The context manager closes the file even if a line fails to parse.
		with open(filename) as f:
			for line in f:
				tokens = line.strip().split()
				if not tokens:
					# A blank line (e.g. a trailing newline) carries no vector.
					continue
				self.words.append(tokens[0])
				self.data.append([float(i) for i in tokens[1:]])
		self.data = np.array(self.data)
		logging.info("Loaded data. #shape = " + str(self.data.shape) )
		logging.info(" #words = %d " %(len(self.words)) )
		self.data_size = self.data.shape[0]
		self.inp_dim = self.data.shape[1]
		# shuffleTrain rebinds self.data to a newly indexed array, so this
		# reference keeps the rows in file order.
		self.original_data = self.data[:]
		logging.debug("original_data[0][0:5] = " + str(self.original_data[0][0:5]))

	def getWordsList(self):
		"""Return the list of words, in file order."""
		return self.words

	def getDataShape(self):
		"""Return the shape of the current data array."""
		return self.data.shape

	def resetDataOrder(self):
		"""Restore `self.data` to the file order saved by loadData."""
		self.data = self.original_data[:]
		logging.debug("original_data[0][0:5] = " + str(self.original_data[0][0:5]))

	def getNumberOfBatches(self, batch_size):
		"""Return ceil(data_size / batch_size) as an int."""
		# Integer ceiling division; avoids a round trip through float.
		return int(( self.data_size + batch_size - 1 ) // batch_size)

	def getBatch(self, i, batch_size, noise_level, denoising):
		"""Return `(batch_x, batch_y)` for the i-th batch of `self.data`.

		`batch_y` is the slice of rows belonging to batch `i`. `batch_x` is
		that same slice, or the slice plus noise from `get_noise_features`
		when `denoising` is true.
		"""
		batch_y = self.data[i*batch_size:min((i+1)*batch_size, self.data_size)]
		batch_x = batch_y
		if denoising:
			batch_x = batch_y + get_noise_features(batch_y.shape[0], self.inp_dim, noise_level)
		return batch_x, batch_y

	def shuffleTrain(self):
		"""Randomly permute the rows of `self.data` (uses numpy's global RNG)."""
		indices = np.arange(self.data_size)
		np.random.shuffle(indices)
		self.data = self.data[indices]

############################################

def compute_sparsity(X):
	"""Return the percentage of entries of `X` that are exactly zero."""
	nonzero_fraction = float(np.count_nonzero(X)) / X.size
	return 100. * (1 - nonzero_fraction)

def dump_vectors(X, outfile, words):
	"""Write the rows of `X` to `outfile`, one "<word> <v1> <v2> ... " line each.

	Every value is followed by a single space, so each line ends with a
	trailing space before the newline.

	Args:
		X: array with a `.shape` attribute whose i-th row belongs to words[i].
		outfile: path of the text file to (over)write.
		words: sequence of strings, one per row of `X`.

	Raises:
		ValueError: if `X` and `words` differ in length.
	"""
	print ("shape", X.shape)
	# Validate before opening so a mismatch never truncates an existing file.
	if len(X) != len(words):
		raise ValueError(
			"Number of vectors (%d) does not match number of words (%d)"
			% (len(X), len(words)))
	# The context manager closes the file even if a write fails midway.
	with open(outfile, 'w') as fw:
		for word, row in zip(words, X):
			fw.write(word + " " + "".join(str(value) + " " for value in row) + "\n")

def get_noise_features(n_samples, n_features, noise_amount):
	"""Return an (n_samples, n_features) array drawn by `make_blobs` around the origin.

	A single all-zero centre is passed to `make_blobs` and `noise_amount` is
	used as its `cluster_std`; the labels it returns are discarded.
	"""
	origin = np.zeros((1, n_features))
	samples, _labels = make_blobs(
			n_samples=n_samples,
			n_features=n_features,
			centers=origin,
			cluster_std=noise_amount)
	return samples


DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


Define the SPINE model:  
https://github.com/jacobdanovitch/SPINE/blob/master/code/model/model.py


In [None]:
import torch
from torch import nn
import logging
logging.basicConfig(level=logging.INFO)


class SPINEModel(nn.Module):
	"""One-hidden-layer autoencoder whose forward pass also returns its loss terms."""

	def __init__(self, params):
		"""Build the model from `params`.

		Keys read: 'inp_dim', 'hdim', 'noise_level' and 'sparsity'.
		"""
		super().__init__()

		# hyper-parameters
		self.inp_dim = params['inp_dim']
		self.hdim = params['hdim']
		self.noise_level = params['noise_level']
		self.getReconstructionLoss = nn.MSELoss()
		# threshold on the mean hidden activation used by the ASL term
		self.rho_star = 1.0 - params['sparsity']

		# encoder and decoder layers
		logging.info("Building model ")
		self.linear1 = nn.Linear(self.inp_dim, self.hdim)
		self.linear2 = nn.Linear(self.hdim, self.inp_dim)

	def forward(self, batch_x, batch_y):
		"""Encode `batch_x`, decode it and score the result against `batch_y`.

		Returns:
			(out, h, total_loss, [reconstruction_loss, psl_loss, asl_loss])
			where `h` is the hidden activation clamped to [0, 1] and
			`total_loss` is the sum of the three listed terms.
		"""
		n_rows = batch_x.size(0)

		# encoder with a capped ReLU, then linear decoder
		h = torch.clamp(self.linear1(batch_x), min=0, max=1)
		out = self.linear2(h)

		# individual loss terms
		reconstruction_loss = self.getReconstructionLoss(out, batch_y)
		psl_loss = self._getPSLLoss(h, n_rows)
		asl_loss = self._getASLLoss(h)
		loss_terms = [reconstruction_loss, psl_loss, asl_loss]
		total_loss = reconstruction_loss + psl_loss + asl_loss

		return out, h, total_loss, loss_terms

	def _getPSLLoss(self, h, batch_size):
		"""Partial sparsity loss: sum of h*(1-h) divided by batch_size * hdim."""
		return (h * (1 - h)).sum() / (batch_size * self.hdim)

	def _getASLLoss(self, h):
		"""Average sparsity loss.

		Takes the per-unit mean of `h` over dim 0, keeps only the part above
		`rho_star`, and returns the sum of its squares divided by hdim.
		"""
		excess = (h.mean(dim=0) - self.rho_star).clamp(min=0)
		return (excess * excess).sum() / self.hdim



Set up the SPINE model:  
https://github.com/jacobdanovitch/SPINE/blob/master/code/model/main.py

In [None]:
import torch
from torch import nn
import argparse
from random import shuffle
import numpy as np
import logging
logging.basicConfig(level=logging.INFO)


class Solver:
  """Couples a DataHandler with a SPINEModel and an SGD optimizer."""

  def __init__(self, params):
    """Load the embeddings named by params['input'] and build model and optimizer.

    Side effect: writes the embedding dimensionality into params['inp_dim'].
    The model is moved to CUDA when torch reports it as available.
    """
    # Build data handler
    self.data_handler = DataHandler()
    self.data_handler.loadData(params['input'])
    params['inp_dim'] = self.data_handler.getDataShape()[1]
    logging.info("="*41)

    # Build model
    self.model = SPINEModel(params)
    self.dtype = torch.FloatTensor
    use_cuda = torch.cuda.is_available()
    if use_cuda:
      self.model = self.model.cuda()
      self.dtype = torch.cuda.FloatTensor

    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
    logging.info("="*41)

  def train(self, params):
    """Run params['num_epochs'] epochs of SGD and print the losses per epoch.

    Keys read: 'num_epochs', 'batch_size', 'noise_level', 'denoising'.
    The printed values are sums over the batches of the epoch, not means.
    """
    num_epochs, batch_size = params['num_epochs'], params['batch_size']
    optimizer = self.optimizer
    dtype = self.dtype
    for iteration in range(num_epochs):
      self.data_handler.shuffleTrain()
      num_batches = self.data_handler.getNumberOfBatches(batch_size)
      epoch_losses = np.zeros(4) # rl, asl, psl, total
      for batch_idx in range(num_batches):
        optimizer.zero_grad()
        batch_x, batch_y = self.data_handler.getBatch(batch_idx, batch_size, params['noise_level'], params['denoising'] )
        batch_x = torch.from_numpy(batch_x).type(dtype)
        batch_y = torch.from_numpy(batch_y).type(dtype)
        _, _, loss, loss_terms = self.model(batch_x, batch_y)
        reconstruction_loss, psl_loss, asl_loss = loss_terms
        loss.backward()
        optimizer.step()
        epoch_losses[0]+=reconstruction_loss.item()
        epoch_losses[1]+=asl_loss.item()
        epoch_losses[2]+=psl_loss.item()
        epoch_losses[3]+=loss.item()
      print("After epoch %r, Reconstruction Loss = %.4f, ASL = %.4f,"\
            "PSL = %.4f, and total = %.4f"
            %(iteration+1, epoch_losses[0], epoch_losses[1], epoch_losses[2], epoch_losses[3]) )

  def getSpineEmbeddings(self, batch_size, params):
    """Return the hidden activations for all rows, in file order, as a numpy array.

    Keys read from `params`: 'noise_level', 'denoising'.
    """
    ret = []
    self.data_handler.resetDataOrder()
    num_batches = self.data_handler.getNumberOfBatches(batch_size)
    # Inference only: no_grad avoids recording an autograd graph per batch.
    with torch.no_grad():
      for batch_idx in range(num_batches):
        # NOTE(review): when params['denoising'] is true, getBatch adds noise
        # here as well, so the dumped embeddings come from noised inputs and
        # vary between runs. Kept unchanged to match the reproduced code;
        # confirm whether this is intended.
        batch_x, batch_y = self.data_handler.getBatch(batch_idx, batch_size, params['noise_level'], params['denoising'] )
        batch_x = torch.from_numpy(batch_x).type(self.dtype)
        batch_y = torch.from_numpy(batch_y).type(self.dtype)
        _, h, _, _ = self.model(batch_x, batch_y)
        ret.extend(h.cpu().numpy())
    return np.array(ret)

  def getWordsList(self):
    """Return the word list held by the data handler."""
    return self.data_handler.getWordsList()
  

def parse_params(input, output=None, hdim=1000, denoising=False, noise_level=0.2, 
                 num_epochs=100, batch_size=64, sparsity=0.85):
  """Bundle the run settings into the params dict used by Solver and run_and_download.

  Args:
    input: path of the embedding file to read.
    output: path of the file to write; when None it defaults to the input
      path with "spine_" prefixed to the file name (the directory is kept).
    hdim, denoising, noise_level, num_epochs, batch_size, sparsity: stored
      unchanged under keys of the same name.

  Returns:
    dict with keys 'hdim', 'denoising', 'noise_level', 'num_epochs',
    'batch_size', 'sparsity', 'input' and 'output'.
  """
  if output is None:
    import os  # local import keeps this function self-contained
    # Prefix only the file name: "spine_" + "data/x.txt" would point into a
    # non-existent "spine_data" directory.
    directory, base_name = os.path.split(input)
    output = os.path.join(directory, "spine_" + base_name)

  params = {
      'hdim': hdim,
      'denoising': denoising,
      'noise_level': noise_level,
      'num_epochs': num_epochs,
      'batch_size': batch_size,
      'sparsity': sparsity,
      'input': input,
      'output': output
  }
  return params

Running the Solver:  
https://github.com/jacobdanovitch/SPINE/blob/master/code/model/main.py  

Word vectors can be downloaded from:  
https://drive.google.com/drive/folders/1ksVcWDADmnp0Cl5kezjHqTg3Jnh8q031

In [None]:
from google.colab import files

def run_and_download(params):
  """Train a Solver with `params`, dump its embeddings and download the file.

  The embeddings are written to params['output'] via dump_vectors and that
  same path is then handed to `files.download`.
  """
  separator = "="*41
  logging.info("PARAMS = " + str(params))
  logging.info(separator)

  solver = Solver(params)
  device = next(solver.model.parameters()).device
  logging.info("Device = "+ str(device))
  solver.train(params)

  # write the learned embeddings to disk
  logging.info("Dumping the final SPine embeddings")
  destination = params['output']
  embeddings = solver.getSpineEmbeddings(512, params)
  dump_vectors(embeddings, destination, solver.getWordsList())

  files.download(params['output'])


In [None]:
# upload for the word embeddings
# (faster to directly upload to colab folder?)
files.upload()

In [None]:
# Train SPINE on the GloVe vectors, then dump and download the result.
file_name = 'glove_original_15k_300d_train.txt'

# Denoising is enabled with noise_level=0.4; `output` is left at its default.
params = parse_params(file_name, num_epochs=4000, denoising=True, 
                      noise_level=0.4, sparsity=0.85, hdim=1000, batch_size=64)
run_and_download(params)

INFO:root:PARAMS = {'hdim': 1000, 'denoising': True, 'noise_level': 0.4, 'num_epochs': 4000, 'batch_size': 64, 'sparsity': 0.85, 'input': 'glove_original_15k_300d_train.txt', 'output': 'spine_glove_original_15k_300d_train.txt'}
INFO:root:Loading data from glove_original_15k_300d_train.txt
INFO:root:Loaded data. #shape = (15000, 300)
INFO:root: #words = 15000 
DEBUG:root:original_data[0][0:5] = [ 0.04656    0.21318   -0.0074364 -0.45854   -0.035639 ]
INFO:root:Building model 
INFO:root:Device = cuda:0


After epoch 1, Reconstruction Loss = 33.5689, ASL = 0.0465,PSL = 17.5023, and total = 51.1177
After epoch 2, Reconstruction Loss = 32.0082, ASL = 0.0263,PSL = 16.5886, and total = 48.6231
After epoch 3, Reconstruction Loss = 31.0739, ASL = 0.0150,PSL = 15.7298, and total = 46.8187
After epoch 4, Reconstruction Loss = 30.2866, ASL = 0.0092,PSL = 14.9716, and total = 45.2674
After epoch 5, Reconstruction Loss = 29.6680, ASL = 0.0060,PSL = 14.2660, and total = 43.9400
After epoch 6, Reconstruction Loss = 29.1058, ASL = 0.0039,PSL = 13.6384, and total = 42.7481
After epoch 7, Reconstruction Loss = 28.6166, ASL = 0.0029,PSL = 13.0952, and total = 41.7146
After epoch 8, Reconstruction Loss = 28.1874, ASL = 0.0025,PSL = 12.5506, and total = 40.7405
After epoch 9, Reconstruction Loss = 27.8216, ASL = 0.0023,PSL = 12.0941, and total = 39.9180
After epoch 10, Reconstruction Loss = 27.4654, ASL = 0.0021,PSL = 11.6655, and total = 39.1330
After epoch 11, Reconstruction Loss = 27.1625, ASL = 0.0023

INFO:root:Dumping the final SPine embeddings
DEBUG:root:original_data[0][0:5] = [ 0.04656    0.21318   -0.0074364 -0.45854   -0.035639 ]


After epoch 4000, Reconstruction Loss = 12.1514, ASL = 0.0088,PSL = 1.8741, and total = 14.0343
shape (15000, 1000)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Train SPINE on the word2vec vectors, then dump and download the result.
file_name = 'word2vec_original_15k_300d_train.txt'

# Same settings as the GloVe run except noise_level=0.1.
params = parse_params(file_name, num_epochs=4000, denoising=True, 
                      noise_level=0.1, sparsity=0.85, hdim=1000, batch_size=64)
run_and_download(params)

INFO:root:PARAMS = {'hdim': 1000, 'denoising': True, 'noise_level': 0.1, 'num_epochs': 4000, 'batch_size': 64, 'sparsity': 0.85, 'input': 'word2vec_original_15k_300d_train.txt', 'output': 'spine_word2vec_original_15k_300d_train.txt'}
INFO:root:Loading data from word2vec_original_15k_300d_train.txt
INFO:root:Loaded data. #shape = (15000, 300)
INFO:root: #words = 15000 
DEBUG:root:original_data[0][0:5] = [0.189453 0.210938 0.205078 0.289062 0.21875 ]
INFO:root:Building model 
INFO:root:Device = cuda:0


After epoch 1, Reconstruction Loss = 7.5402, ASL = 0.0000,PSL = 8.7090, and total = 16.2492
After epoch 2, Reconstruction Loss = 7.1997, ASL = 0.0000,PSL = 7.4125, and total = 14.6122
After epoch 3, Reconstruction Loss = 7.0631, ASL = 0.0000,PSL = 6.3505, and total = 13.4137
After epoch 4, Reconstruction Loss = 6.9858, ASL = 0.0000,PSL = 5.4764, and total = 12.4622
After epoch 5, Reconstruction Loss = 6.9330, ASL = 0.0000,PSL = 4.7431, and total = 11.6761
After epoch 6, Reconstruction Loss = 6.9007, ASL = 0.0000,PSL = 4.1454, and total = 11.0461
After epoch 7, Reconstruction Loss = 6.8732, ASL = 0.0000,PSL = 3.6461, and total = 10.5193
After epoch 8, Reconstruction Loss = 6.8539, ASL = 0.0000,PSL = 3.2288, and total = 10.0827
After epoch 9, Reconstruction Loss = 6.8381, ASL = 0.0000,PSL = 2.8776, and total = 9.7158
After epoch 10, Reconstruction Loss = 6.8257, ASL = 0.0000,PSL = 2.5857, and total = 9.4114
After epoch 11, Reconstruction Loss = 6.8157, ASL = 0.0000,PSL = 2.3297, and tota

INFO:root:Dumping the final SPine embeddings
DEBUG:root:original_data[0][0:5] = [0.189453 0.210938 0.205078 0.289062 0.21875 ]


After epoch 4000, Reconstruction Loss = 5.9320, ASL = 0.0004,PSL = 0.1452, and total = 6.0776
shape (15000, 1000)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Visualization of embeddings:  
https://github.com/harsh19/SPINE/blob/master/code/evaluation/visualization/visualize_online.py

In [None]:
from multiprocessing import Queue
from queue import PriorityQueue
from numpy import linalg as LA
from scipy import stats
import numpy as np
import argparse
import time
import sys
import os
from tqdm import tqdm

# Module-level state shared by load_vectors and find_top_participating_dimensions.

# Per dimension, a list of (value, word) tuples; rebuilt by load_vectors.
top_k_words = []
# Running count of entries below `threshold`; reset and filled by load_vectors.
zeros = 0.0
# Absolute values below this are counted as zero for the sparsity figure.
threshold = 0.001
# Length of the most recently parsed vector; set by load_vectors.
h_dim = None
# Number of distinct words loaded; set by load_vectors.
total = None
# Maps word -> list of absolute component values; rebuilt by load_vectors.
vectors = {}
# Not referenced by the functions visible in this notebook.
num = 5
# Maximum number of (value, word) entries kept per dimension in top_k_words.
width = 10


def load_vectors(filename):
    """Load embeddings from `filename` into the module-level visualization state.

    Each line is parsed as a word followed by float components; components
    are stored as absolute values. Rebinds the globals `vectors`, `zeros`,
    `h_dim`, `total` and `top_k_words`, reads `threshold` and `width`, and
    prints the percentage of components below `threshold`.

    The number of dimensions is taken from the first line of the file.
    """
    global vectors, zeros, h_dim, total, top_k_words
    vectors = {}
    zeros = 0.0
    with open(filename, 'r') as f:
        lines = f.readlines()
    dimension = len(lines[0].split()) - 1
    top_k_words = [[] for _ in range(dimension)]
    for line in tqdm(lines):
        words = line.strip().split()
        word = words[0]
        vector = [abs(float(i)) for i in words[1:]]
        vectors[word] = vector
        h_dim = len(vector)
        for i, val in enumerate(vector):
            # Each list is kept sorted in descending order, so its last
            # element is the smallest entry currently retained.
            top = top_k_words[i]
            if len(top) < width:
                top.append((val, word))
            elif top[-1][0] < val:
                top[-1] = (val, word)
            else:
                continue  # list unchanged, hence still sorted
            top.sort(reverse=True)
        zeros += sum(1 for v in vector if v < threshold)
    print ("Sparsity =", 100. * zeros/(len(lines)*dimension))
    total = len(vectors)
    print ('done loading vectors')

def find_top_participating_dimensions(word, k):
    """Print and return the top words of the `k` dimensions where `word` is largest.

    Reads the module-level `vectors` and `top_k_words`. For each of the `k`
    dimensions with the largest stored value for `word`, prints that value
    and the (value, word) entries recorded for the dimension.

    Returns:
        A list with one entry per reported dimension, each entry being the
        list of words stored in `top_k_words` for that dimension. Returns []
        (after printing 'word not found') when `word` is not in `vectors`.
    """
    if word not in vectors:
        print ('word not found')
        return []
    # (value, dimension) pairs, so that sorting orders by value first.
    by_value = sorted(
        ((val, dim) for dim, val in enumerate(vectors[word])), reverse=True)
    answer = []
    print (" -----------------------------------------------------")
    print ("Word of interest = " , word)
    for val, dim in by_value[:k]:
        print ("The contribution of the word '%s' in dimension %d = %f" %(word, dim, val))
        print ('Following are the top words in dimension', dim, 'along with their contributions')
        print (top_k_words[dim])
        answer.append([entry[1] for entry in top_k_words[dim]])
    return answer

In [None]:
#file_name = 'spine_glove_original_15k_300d_train.txt'
file_name = 'word2vec_original_15k_300d_train.txt'

load_vectors(file_name)

100%|██████████| 15000/15000 [00:06<00:00, 2293.77it/s]

Sparsity = 0.4829555555555556
done loading vectors





In [None]:
find_top_participating_dimensions('mathematics', 5)

 -----------------------------------------------------
Word of interest =  mathematics
The contribution of the word 'mathematics' in dimension 131 = 0.460938
Following are the top words in dimension 131 along with their contributions
[(0.785156, 'leukemia'), (0.714844, 'enterprises'), (0.703125, 'wingspan'), (0.664062, 'info'), (0.644531, 'booker'), (0.640625, 'strained'), (0.640625, 'recycling'), (0.640625, 'cancer'), (0.636719, 'surnames'), (0.625, 'quits')]
The contribution of the word 'mathematics' in dimension 289 = 0.445312
Following are the top words in dimension 289 along with their contributions
[(0.84375, 'ore'), (0.804688, 'greens'), (0.769531, 'badminton'), (0.765625, 'hymns'), (0.761719, 'clay'), (0.722656, 'gardener'), (0.691406, 'loch'), (0.6875, 'nursery'), (0.675781, 'sperm'), (0.675781, 'avenge')]
The contribution of the word 'mathematics' in dimension 253 = 0.431641
Following are the top words in dimension 253 along with their contributions
[(0.839844, 'asylum'), (0.

In [None]:
find_top_participating_dimensions('remote', 5)

 -----------------------------------------------------
Word of interest =  remote
The contribution of the word 'remote' in dimension 35 = 0.632812
Following are the top words in dimension 35 along with their contributions
[(0.996094, 'basilica'), (0.855469, 'sensory'), (0.820312, 'ranger'), (0.804688, 'chapel'), (0.792969, 'orchestras'), (0.792969, 'memorials'), (0.75, 'soloist'), (0.738281, 'soprano'), (0.734375, 'sr'), (0.71875, 'violinist')]
The contribution of the word 'remote' in dimension 178 = 0.421875
Following are the top words in dimension 178 along with their contributions
[(0.933594, 'microsoft'), (0.898438, 'sr'), (0.792969, 'malaysia'), (0.742188, 'jan'), (0.726562, 'cruisers'), (0.703125, 'propulsion'), (0.699219, 'towed'), (0.695312, 'philippines'), (0.695312, 'ibm'), (0.6875, 'hydraulic')]
The contribution of the word 'remote' in dimension 142 = 0.396484
Following are the top words in dimension 142 along with their contributions
[(0.855469, 'capt'), (0.78125, 'obey'), 

In [None]:
find_top_participating_dimensions('internet', 5)

 -----------------------------------------------------
Word of interest =  internet
The contribution of the word 'internet' in dimension 239 = 0.466797
Following are the top words in dimension 239 along with their contributions
[(0.71875, 'cardinals'), (0.710938, 'tsar'), (0.664062, 'papal'), (0.664062, 'autobiography'), (0.640625, 'befriends'), (0.628906, 'degrees'), (0.625, 'bachelor'), (0.609375, 'treatise'), (0.59375, 'rebels'), (0.589844, 'collar')]
The contribution of the word 'internet' in dimension 184 = 0.408203
Following are the top words in dimension 184 along with their contributions
[(0.800781, 'gases'), (0.742188, 'gov'), (0.679688, 'methane'), (0.667969, 'graph'), (0.660156, 'buttons'), (0.632812, 'button'), (0.625, 'celebrates'), (0.613281, 'derivatives'), (0.605469, 'astronauts'), (0.601562, 'reactors')]
The contribution of the word 'internet' in dimension 89 = 0.369141
Following are the top words in dimension 89 along with their contributions
[(0.886719, 'longitude'),

# Additional experiment: fastText embeddings

In [None]:
# Train SPINE on the fastText vectors, then dump and download the result.
file_name = 'fasttext_15k.txt'

# Differs from the GloVe/word2vec runs: 1000 epochs, sparsity=0.8, batch_size=128.
params = parse_params(file_name, num_epochs=1000, denoising=True, 
                      noise_level=0.2, sparsity=0.8, hdim=1000, batch_size=128)
run_and_download(params)

INFO:root:PARAMS = {'hdim': 1000, 'denoising': True, 'noise_level': 0.2, 'num_epochs': 1000, 'batch_size': 128, 'sparsity': 0.8, 'input': 'fasttext_15k.txt', 'output': 'spine_fasttext_15k.txt'}
INFO:root:Loading data from fasttext_15k.txt
INFO:root:Loaded data. #shape = (15000, 300)
INFO:root: #words = 15000 
DEBUG:root:original_data[0][0:5] = [ 0.1073  0.0089  0.0006  0.0055 -0.0646]
INFO:root:Building model 
INFO:root:Device = cuda:0


After epoch 1, Reconstruction Loss = 2.1586, ASL = 0.0000,PSL = 5.1873, and total = 7.3459
After epoch 2, Reconstruction Loss = 1.9168, ASL = 0.0000,PSL = 4.7141, and total = 6.6309
After epoch 3, Reconstruction Loss = 1.8030, ASL = 0.0000,PSL = 4.2964, and total = 6.0994
After epoch 4, Reconstruction Loss = 1.7402, ASL = 0.0000,PSL = 3.9278, and total = 5.6680
After epoch 5, Reconstruction Loss = 1.7052, ASL = 0.0000,PSL = 3.5865, and total = 5.2918
After epoch 6, Reconstruction Loss = 1.6763, ASL = 0.0000,PSL = 3.2899, and total = 4.9662
After epoch 7, Reconstruction Loss = 1.6551, ASL = 0.0000,PSL = 3.0204, and total = 4.6755
After epoch 8, Reconstruction Loss = 1.6408, ASL = 0.0000,PSL = 2.7800, and total = 4.4208
After epoch 9, Reconstruction Loss = 1.6268, ASL = 0.0000,PSL = 2.5684, and total = 4.1952
After epoch 10, Reconstruction Loss = 1.6176, ASL = 0.0000,PSL = 2.3695, and total = 3.9870
After epoch 11, Reconstruction Loss = 1.6075, ASL = 0.0000,PSL = 2.1982, and total = 3.80

In [None]:
file_name = 'fasttext_15k.txt'

load_vectors(file_name)

100%|██████████| 14999/14999 [00:06<00:00, 2379.08it/s]

Sparsity = 0.7063137542502833
done loading vectors





In [None]:
find_top_participating_dimensions('internet', 5)

 -----------------------------------------------------
Word of interest =  internet
The contribution of the word 'internet' in dimension 52 = 0.692500
Following are the top words in dimension 52 along with their contributions
[(2.2749, 'frequently-reverted'), (1.9671, 'FaQ'), (1.8609, '--XLinkBot'), (1.8434, 'important.svg'), (1.7539, 'nuvola.svg'), (1.6058, 'Information.png'), (1.5954, 'MergeAccount'), (1.5294, 'InternetArchiveBot'), (1.4951, 'Nuvola'), (1.4268, 'Information.svg')]
The contribution of the word 'internet' in dimension 249 = 0.365200
Following are the top words in dimension 249 along with their contributions
[(2.345, 'ANN'), (1.8886, 'ClueBot'), (1.5447, 'Information.png'), (1.4804, 'alt'), (1.3771, '|'), (1.3232, 'nuvola.svg'), (1.1899, 'Always'), (1.183, 'web.archive.org'), (1.1453, 'tildes'), (1.0612, 'Got')]
The contribution of the word 'internet' in dimension 232 = 0.359500
Following are the top words in dimension 232 along with their contributions
[(0.7426, 'RID')

In [None]:
file_name = 'spine_fasttext_15k.txt'

load_vectors(file_name)

100%|██████████| 14999/14999 [00:23<00:00, 646.59it/s]

Sparsity = 97.60762050803388
done loading vectors





In [None]:
find_top_participating_dimensions('Wikipedia', 5)

 -----------------------------------------------------
Word of interest =  Wikipedia
The contribution of the word 'Wikipedia' in dimension 875 = 0.306471
Following are the top words in dimension 875 along with their contributions
[(0.36659843, 'Lin'), (0.32978672, '19th'), (0.30647075, 'Wikipedia'), (0.29718524, 'Chiefs'), (0.2886821, 'resource'), (0.28549582, 'accountable'), (0.2698378, 'Bali'), (0.25885618, 'Jacob'), (0.2568134, 'Caroline'), (0.24885021, 'intersection')]
The contribution of the word 'Wikipedia' in dimension 202 = 0.296134
Following are the top words in dimension 202 along with their contributions
[(0.38295454, '�'), (0.3738711, 'encountered'), (0.37211323, 'FL'), (0.36709607, 'correspondence'), (0.36655766, '2012'), (0.34286666, 'fixture'), (0.3302195, 'enhance'), (0.30413705, 'glad'), (0.30150163, 'measurement'), (0.29613376, 'Wikipedia')]
The contribution of the word 'Wikipedia' in dimension 381 = 0.289063
Following are the top words in dimension 381 along with the

In [None]:
find_top_participating_dimensions('internet', 5)

 -----------------------------------------------------
Word of interest =  internet
The contribution of the word 'internet' in dimension 611 = 0.188178
Following are the top words in dimension 611 along with their contributions
[(0.4594207, 'Dog'), (0.4371216, 'Creation'), (0.37149218, 'We'), (0.3702331, '|'), (0.34171942, '\u200e'), (0.33390996, 'dams'), (0.32635692, 'Articles'), (0.3167685, 'Similarly'), (0.31501582, 'actor'), (0.3113062, 'technicians')]
The contribution of the word 'internet' in dimension 119 = 0.157287
Following are the top words in dimension 119 along with their contributions
[(0.4142223, 'Medical'), (0.4017504, 'Companies'), (0.39628845, 'tariffs'), (0.37772977, 'semester'), (0.34076464, 'No'), (0.3260334, 'valuation'), (0.32165086, 'Spirit'), (0.30362535, 'Yellow'), (0.30340743, 'Troy'), (0.29197633, 'cops')]
The contribution of the word 'internet' in dimension 269 = 0.093541
Following are the top words in dimension 269 along with their contributions
[(0.4073627