In [9]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
import matplotlib.pyplot as plt
from m1_preprocessing import term_sentence_expansion
import nltk
from sklearn.manifold import TSNE
import pandas as pd

# BERT
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

In [2]:
with open('data/bert_test.txt', 'r') as f:
    sentences = [x.strip() for x in f.readlines()]

all = " ".join(sentences)
sentences_splitted = nltk.sent_tokenize(all)

In [3]:
max_len = 0

print("# sentences", len(sentences_splitted))
# For every sentence...
for sent in sentences_splitted:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

# sentences 319
Max sentence length:  138


In [4]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
segment_ids = []

# For every sentence...
for (index, sent) in enumerate(sentences_splitted):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 150,          # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.
    tokens = encoded_dict['input_ids']
    input_ids.append(tokens)
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    
    segment_ids.append(torch.tensor([[index % 2] * 150]))

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

segment_ids = torch.cat(segment_ids, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('Segment IDs:', segment_ids[0])

Original:  We study the applicability and potential of the algorithm to learn representations of varying depth in a handful of applications and domains  , highlighting the ability of the algorithm to provide discriminative feature representations that are able to achieve top performance. We present a hyper-parameter free  , off-the-shelf  , simple and fast unsupervised algorithm to discover hidden structure from the input data by enforcing a very strong form of sparsity.
Token IDs: tensor([  102,   185,   527,   111, 13214,   137,  1411,   131,   111,  1172,
          147,  6714,  6859,   131,  5543,  3826,   121,   106,  1500,  1004,
          131,  2040,   137,  4371,   422, 18579,   111,  2495,   131,   111,
         1172,   147,  1584, 20900,  2602,  6859,   198,   220,  2357,   147,
         3120,  1623,  1150,   205,   103,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   

In [5]:
attention_masks[0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

In [6]:
tokenized_text = []
segment_ids = []
for (index, sent) in enumerate(sentences_splitted):
    if index == 0:
        sent = "[CLS] " + sent
    sent += " [SEP]"
    tokens = tokenizer.tokenize(sent)
    tokenized_text += tokens
    segment_ids += [index % 2] * len(tokens)
tokenized_text = tokenized_text[:-1]
segment_ids = segment_ids[:-1]
assert(len(tokenized_text) == len(segment_ids))

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segment_ids])
#text_bert = "[CLS] " + " [SEP] ".join(sentences_splitted)
#tokenized_text = tokenizer.tokenize(text_bert)



In [15]:
# Load pre-trained model (weights)
config = AutoConfig.from_pretrained("allenai/scibert_scivocab_uncased", output_hidden_states=True)
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased',
                                  config = config, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers.
with torch.no_grad():
    outputs = model(input_ids)
    # Evaluating the model will return a different number of objects based on
    # how it's  configured in the `from_pretrained` call earlier. In this case,
    # becase we set `output_hidden_states = True`, the third item will be the
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [16]:
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 319
Number of tokens: 150
Number of hidden units: 768


In [17]:
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 319, 150, 768])

In [18]:
token_embeddings = token_embeddings.permute(1, 2, 0, 3)

token_embeddings.size()

torch.Size([319, 150, 13, 768])

In [19]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence #1...
for token in token_embeddings[0]:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 150 x 768


In [20]:
for i, vec in enumerate(token_vecs_sum):
    print(i, vec[:5])

0 tensor([ 1.8792, -0.3661, -1.6897,  1.6397,  0.7315])
1 tensor([ 1.6456,  1.1876, -3.7473, -1.1620, -6.6588])
2 tensor([ 3.1494, -1.6758, -4.0400,  2.4409, -3.7305])
3 tensor([-0.6335,  0.7184,  1.3884,  0.1791, -0.4720])
4 tensor([ 6.0752,  0.2215, -1.8379, -0.9989, -0.5707])
5 tensor([1.1246, 0.8387, 3.8351, 2.5916, 0.5349])
6 tensor([ 4.3702,  1.9355,  2.1857, -2.6150,  0.4570])
7 tensor([ 0.9938,  3.9295,  2.8824, -0.3774,  3.1096])
8 tensor([ 0.5817,  4.3221, -1.7123,  1.8197, -0.7708])
9 tensor([ 8.9040, -1.4435, -0.3873, -0.1808,  0.9032])
10 tensor([ 2.0552, -1.6113,  3.8797, -1.9013, -1.6329])
11 tensor([ 1.6235,  0.1619, -0.7588, -3.4230, -6.2082])
12 tensor([ 4.8970,  0.2624, -2.6085, -1.9068, -2.0620])
13 tensor([-2.9151, -0.3895, -2.2679, -1.2034, -1.6059])
14 tensor([-3.7825, -0.6049, -9.8044,  2.3338, -5.0351])
15 tensor([ 1.7000,  2.5368, -4.1168, -4.4919, -0.7711])
16 tensor([ 1.1474,  1.5393,  0.9041,  2.2031, -0.9126])
17 tensor([ 4.0277, -0.1442,  0.5819,  3.2180,

# Tests mit Pipelines
- feature-extraction: Generates a tensor representation for the input sequence
- ner: Generates named entity mapping for each word in the input sequence.
- sentiment-analysis: Gives the polarity (positive / negative) of the whole input sequence.
- text-classification: Initialize a TextClassificationPipeline directly, or see sentiment-analysis for an example.
- question-answering: Provided some context and a question refering to the context, it will extract the answer to the question in the context.
- fill-mask: Takes an input sequence containing a masked token (e.g. <mask>) and return list of most probable filled sequences, with their probabilities.
- summarization
- translation_xx_to_yy

In [40]:
from transformers import pipeline

# This pipeline extracts the hidden states from the base transformer,
# which can be used as features in downstream tasks.
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
sentence = """We study the applicability and potential of the algorithm to learn representations of varying depth in a handful of applications and domains  , highlighting the ability of the algorithm to provide discriminative feature representations that are able to achieve top performance. We present a hyper-parameter free, off-the-shelf, simple and fast unsupervised algorithm to discover hidden structure from the input data by enforcing a very strong form of sparsity."""
out = pipe(sentence)

In [41]:
print("Sentences:", len(out))
print("Tokens:", len(out[0]))
print("Dimensions:", len(out[0][0]))
print("Words:", len(sentence.split(' ')))

Sentences: 1
Tokens: 82
Dimensions: 768
Words: 69


In [33]:
print(out[0][0][:5])
print(out[0][1][:5])
# lul ist das einfach

[0.6337162256240845, 0.28518983721733093, 0.016355250030755997, -0.3005073666572571, -0.7662723064422607]
[-0.2967832088470459, -1.0541183948516846, -1.8105967044830322, 0.5548749566078186, -0.4306401312351227]


In [44]:
encoded_dict = tokenizer.encode_plus(
                        sentence,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 150,          # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
encoded_dict

{'input_ids': tensor([[  102,   185,   527,   111, 13214,   137,  1411,   131,   111,  1172,
           147,  6714,  6859,   131,  5543,  3826,   121,   106,  1500,  1004,
           131,  2040,   137,  4371,   422, 18579,   111,  2495,   131,   111,
          1172,   147,  1584, 20900,  2602,  6859,   198,   220,  2357,   147,
          3120,  1623,  1150,   205,   185,   709,   106,  1884,   579,  2318,
          2159,   422,  1874,   579,   111,   579, 20103,   422,  2177,   137,
          3254, 18391,  1172,   147,  9819,  8033,  1187,   263,   111,  1653,
           453,   214, 15783,  7020,   106,  1248,  1648,   592,   131, 21123,
           205,   103,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [67]:
entity = "enforcing"
tokens = tokenizer.tokenize(entity)
#tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0])

In [68]:
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = encoded_dict['input_ids'][0].numpy().tolist()

In [69]:
for i in range(0, len(input_ids) - len(ids) + 1):
    if input_ids[i:i+len(ids)] == ids:
        print(i)

72


In [70]:
len(input_ids)

150

In [72]:
array = ['SDR', 'Discrete Fourier Transform', 'MNIST', 'ACA', 'UCI Machine', 'Bloom', 'KNN', 'Scopus', 'Optimization', 'Gregex', 'Cubature Kalman', 'Sciences', 'ArcFace', 'CMOS', 'Hybrid Hierarchical Agglomerative Clustering', 'PBtree', 'Minimum Neighborhood Rough Sets', 'VBNMF', 'GIS', 'LGAE', 'Oracle10g', 'DFS', 'WSN', 'NLMS', 'LSH', 'ERD', 'MatchPy', 'DCASE2016', 'TWDA', 'Laguerre', 'Image Retrieval CBIR', 'Cloud', 'MHPCs', 'Jenkins', 'SCEBSS', 'Text', 'IMT', 'SFCscore', 'IIR', 'autoencoder', 'Ontologies', 'LFP', 'DNN', 'Stochastic Meta', 'Radial Basis Function', 'EWF', 'DSP', 'Suffix', 'Universitas', 'PCC', 'Monte Carlo', 'WEC', 'AdaGrad', 'Softmax Classification', 'Pairwise Latent Dirichlet Allocation', 'ASR', 'University', 'Java', 'BOW', 'SPSS', 'Softmax', 'Developed', 'Partial', 'FCDA', 'MTRF', 'Regularized Deep Autoencoder', 'SISO', 'Exponentiated Gumbel', 'Hybrid Extreme Random Forest', 'Error', 'SBADF', 'LVQ', 'Architecture Analysis', 'NEOCIVET', 'EPSO', 'adaptive filter', 'PSF', 'Marginal Loss', 'MIML', 'ISA', 'MEA', 'Wiener', 'Signal', 'LIB', 'Deep Neural Networks DNN', 'FLWOR', 'Support Vector', 'Support Vector Machines', 'CELP', 'Ant Colony Optimization', 'Artificial Bee Algorithm', 'DCT', 'Polyak', 'Long', 'Compared', 'ELM', 'HGP', 'Microsoft', 'IQMs', 'RFID', 'RSTRARelay Selection', 'Slim', 'Spherical Hashing', 'SQA', 'Chap', 'SVMPF', 'Improved Pearson Correlation', 'MMSE', 'sLDA', 'PCA', 'Boltzmann Machine', 'SDD', 'RMSC', 'LDA', 'WLAN', 'XOR Hashing', 'Meta Heuristic', 'hierarchical clustering', 'softmax', 'NJW', 'Consistency SVRC', 'Adaptive Filtering', 'BRISK', 'Particle', 'Random Search', 'pattern matching', 'PSMF', 'BNDMq', 'J48', 'DNNs', 'Bootstrapped Dendritic', 'Maximum Likelihood Estimates', 'AASC', 'selection algorithm', 'Variational Bayesian Inference VBI', 'EEG', 'RRI', 'UBIRISv1', 'Bipartite Graph Bigraph', 'XQuery', 'Regular Expression Functions', 'CDMA', 'Kernel Ridge Regression', 'VWV', 'Bagging', 'Abstract Approximate Nearest Neighbor', 'Semantic Web', 'Vietnamese', 'RBFN', 'random indexing', 'JAR', 'Automata', 'INSs', 'SSSC', 'Basic', 'Markov Chain Monte Carlo', 'Parallel Tempering', 'Syntax', 'UMQA', 'SOMs', 'Deep Neural Networks', 'Bootstrap', 'Latent Semantic Analysis', 'BNDM', 'Bayesian Network', 'Latent Semantic Analysis Pentti Kanerva Jan Kristoferson Anders Holst', 'Manual', 'Contractive Autoencoders', 'PageRank', 'FSIM', 'Simplified Chinese', 'Traits', 'predictive modeling', 'Object', 'Maryland', 'Aspect Embedded', 'MCMC RJMCMC', 'FGT', 'BRIEF', 'IBK', 'transformA', 'DCAE', 'Euclidian', 'Compression', 'MCMC SR', 'Haar', 'AdaBoost', 'XML', 'Angular Softmax', 'AGS', 'Markov', 'GTX260 GPU', 'DMC', 'BPE', 'PIP', 'KerBS', 'PSO', 'MRD', 'PID', 'RBF', 'Bootstrap Method', 'HDD', 'Search Engines', 'Radiology', 'BBC', 'TDRBF', 'WBTFSC', 'Rayleigh', 'CNNs', 'Cartesian', 'Stochastic', 'Support Vector Regression Machine', 'Deep Belief Network', 'SGD', 'REPTree', 'PART', 'RDS', 'Enhanced Adaptive Bayesian Spam Filter', 'FOFS', 'BNSA', 'RaPiD7 Rapid Production', 'Grid Search', 'ARX', 'MLR', 'MSSQL2005', 'Nesterov', 'Synthesis', 'Static', 'SRL', 'Bayesian Spam Filter', 'NPM', 'Fourier Domain Optical Coherence Tomography', 'HAC', 'Light Adaptive Bayesian Spam Filter', 'MDBUTMF', 'Abstrak Telah', 'Modified Differential Evolution MDE', 'Modern', 'EL', 'Variance', 'Pitch', 'Pattern', 'Urdu', 'Lee', 'SRAM', 'Random Boost', 'fMRI', 'HeteRecom', 'English', 'Random Indexing', 'support vector machine', 'ARMA', 'Locality Sensitive Hashing LSH', 'SVR', 'Fast Fourier', 'Components Regression', 'KFD', 'query optimization', 'Toward', 'ERM', 'Hidden Markov Model', 'LapRLS', 'BRUJA', 'CRF', 'restricted boltzmann machine', 'IMMSC', 'Latent Dirichlet Allocation LDA', 'Relief', 'Support', 'SVMR', 'Bhattacharyya', 'HCSD', 'Gaussian Process Bayesian Nonparametric', 'GloVes', 'GTZAN', 'ECC', 'Euclidean', 'SPARQL', 'MCSA Motor', 'Pearson Correlation', 'Mangasarian', 'Saudi Arabian', 'Feature', 'Semantic Indexing', 'TSVMTactic Support Vector Machine', 'random search', 'Text Samples', 'Kennedy', 'LITIS Rouen', 'Spectral Clustering', 'Zermelo', 'Xilinx', 'Feature Selection', 'Instagram', 'Smart Grids', 'Ncut', 'MEAN', 'RLEP', 'lib', 'RZF', 'RI', 'RBM', 'Weka', 'Random Field', 'Replicated Softmax', 'Online', 'PolSAR', 'Discrete Fourier', 'Martin', 'RDF', 'AUC', 'MC68000', 'Dynamically Growing', 'AEVB', 'EHR', 'VAE', 'Distress Analysis Interview', 'Method', 'Locally Consistent Parsing', 'Deep Convolutional Neural Networks', 'DFA', 'MIMO', 'Small Crush', 'OQPSK', 'SLU', 'Recursive', 'Design', 'Petri Nets', 'GRF', 'FBADF', 'Genetic', 'Latent Aspect Rating Regression', 'DMP', 'Semantic Role', 'SAGA', 'Laplace', 'KMeans', 'LMS', 'DBA', 'FGN', 'Random Binary Search', 'Bernoulli', 'MTSA', 'Hybrid', 'Simulation', 'RNA', 'MIB', 'Hierarchical Clustering', 'PRF', 'Short Time Fourier Transform STFT', 'Pearson Correlation Coefficients', 'Reservoir Forecasting', 'LiB', 'Kalman', 'Learned Image Compression', 'Hangeul', 'LDPC', 'regular expression', 'VBEM', 'Doppler', 'Widrow', 'CGF', 'Restricted Boltzmann', 'HACA', 'FNR', 'RNAs', 'MDCNN', 'Recurrent Neural Networks', 'FSMs', 'VNF', 'Dialog', 'CDNSA', 'POS', 'Hierarchical', 'LEAST', 'Radial Basis Function Neural Network', 'DSTC2', 'Gaussian', 'SVNSs', 'DTW', 'Cross', 'CRP', 'Likelihood Function', 'Logit Boost', 'Mixture Model DPMM', 'Load', 'Low Frequency Fluctuations', 'Givens', 'WebKB4', 'QALD', 'AES', 'Advent', 'Laser Range', 'CNN', 'SVSS', 'Ackermann', 'ABC', 'XMLeXtensible Markup', 'CFFT', 'Naives Bayes', 'INDEX', 'CONSTITUTION', 'MATLAB', 'CPA', 'KeywordPrediction', 'REDFA', 'OOV', 'GLUE', 'Pirkola', 'Guided Regularized Random Forest', 'fast fourier', 'Pattern Matching', 'Gabor', 'PSVMParallel Support Vector Machinederived', 'hierarchical agglomerative', 'MLP', 'Laplacian', 'Traditional Sampling Algorithm', 'Flags Properties', 'Indonesia', 'Third', 'Menlo Park', 'ReDoS', 'ECG', 'LAN', 'Hierarchical Agglomerative Clustering', 'Moldability', 'Mahalanobis Taguchi System', 'ML', 'Autoencoded Variational Inference For Topic Model AVITM', 'ROI', 'Arabic', 'TWSVM', 'IR4QA', 'CFD', 'VIF', 'Hilbert', 'Sweden', 'RDQCA', 'Systems Analysis', 'SCALCE', 'Java Android', 'Prefix Indexing', 'JeromeDL', 'Content', 'I2P', 'French', 'Gaussian Cox', 'SVD', 'Gibbs', 'VBI', 'Simple Random', 'LBA', 'Monash University', 'GWO', 'FDCT', 'Steepest Ascent Descent', 'Reflective Random Indexing', 'Partial Least Square', 'SBQL', 'cryoDRGN', 'NAFSM', 'rPPG', 'NASA93', 'Total', 'Ristretto', 'CHiME3', 'IMAE', 'QM', 'Deep', 'FFTsFast Fourier Transformers', 'RSM', 'TDNN', 'ARM University Program', 'TreeLogit', 'Word', 'HMM', 'VESSL', 'Tabu Search Algorithm', 'Layerwise Interweaving Convolutional', 'TREs', 'OFDM', 'MCMC', 'Algorithms', 'HCM', 'Shingle', 'Immune', 'RPSODE', 'Deep Autoencoder', 'Hierarchical Adaptive Clustering HAC', 'Accuracy', 'ARM', 'Restricted Boltzmann Machine', 'MGRBM', 'Transfer', 'Novelty', 'TSA Taboo Search Algorithm', 'SSD', 'Random Binary', 'Interlingua English', 'MAlg', 'Image', 'RDF Data Cubes', 'BWE', 'Connectivity', 'Neural Networks', 'C2LSH', 'recursive function', 'Design Language AADL', 'spectral clustering', 'CAE', 'Random Forest', 'QoS', 'UGM', 'Extreme Learning Machines ELM', 'Random Forests', 'Title', 'Data', 'IFSs', 'LDA Latent Dirichlet Allocation', 'Mandarin', 'Wavelet', 'SVMs', 'WiBro', 'Long Short Term Network', 'HACA Han', 'Throughput', 'ADC', 'Receiver', 'Speech', 'JobType FJT', 'Lipschitz', 'DST', 'Kneser Ney', 'Pattern Characteristics', 'rMRF', 'SkyNet', 'UEP', 'Monte Carlo MCMC', 'Cost', 'Random', 'Turkish', 'RAM', 'GCNSI Graph Convolutional Networks', 'FCM', 'Cross Language Information Retrieval', 'Reuters', 'Particle Swarm', 'GRE', 'TILT', 'DBN', 'Boltzmann', 'LEACH Low', 'REM', 'Xinjiang', 'word embedding', 'Data Presentation', 'Hanja', 'Fourier', 'Resource Allocation', 'UMQL', 'Dynamic Movement Primitives', 'RDCQA', 'Permasalahan', 'Affinity', 'Teknik', 'Convolutional Neural Network CNN', 'Channel', 'DFAs', 'Hailstorm', 'DNA', 'mcmc', 'Feature Cluster Grow', 'Keywords', 'Belief Propagation', 'Cuckoo Search', 'RWCP Theoretical Foundation', 'SVM', 'LIBLINEAR', 'Exponentiated', 'Pairwise Conditional Random Forest', 'RFFT', 'DynaMAD', 'ID3', 'Source Identification', 'Stability', 'Clustered', 'Attribute Weighting Method', 'Locality Sensitive Hashing', 'Query Analyzer', 'Log4J', 'InfoMax Autoencoder IMAE', 'Chinese', 'Support Vector Machine Support Vector Machine', 'Modified Random Forest', 'Random Indexing RI', 'DBNs', 'Adam', 'Weighted Random Forest', 'Abstract', 'Relations Frequency', 'Germanic', 'NRCF', 'Pearson', 'Enhanced Genetic Algorithm EGA', 'CORDIC', 'mulRBM', 'Random Rotation Forest', 'Genetic Algorithm', 'OP3', 'MRF', 'Restricted Boltzmann Machine RBM', 'LCCA', 'Long Short Term', 'Korean', 'Hierarchical Pitman Yor Process Language Model', 'Perceptual Linear', 'SoC', 'NOPs', 'Radial Basis Function Network', 'DBpedia', 'PREREQ', 'Direct Relation Frequency', 'Berkeley Segmentation Dataset', 'SOM', 'SSL', 'FFT', 'DCNN', 'IVIFSs', 'Extreme Learning Machines Autoencoder', 'Daphnee Rentfrow', 'Machine', 'Afterward', 'Hal', 'Alamouti', 'SSIM', 'SMF', 'Turbo', 'CD', 'Latent Dirichlet Allocation', 'BBFNN', 'autoencoderVAE', 'Network', 'Delta State University', 'Biogeography', 'Rotation Forest', 'Arduino', 'SMC', 'GRBM', 'PSNR', 'Bayesian', 'CFPRF', 'MDGVRPTW', 'OpenBUGS', 'Boruta Feature', 'Convolutional Neural Network', 'POP', 'BPDF', 'IRT', 'CPU', 'Snort', 'Restricted Boltzmann Machines', 'Relevance', 'Kim', 'Abstract Experimental', 'Stanford University', 'Particle Swarm Optimization PSO', 'radial basis function network', 'Particle Swarm Optimization', 'Distributed Database Systems', 'Expert', 'likelihood function', 'RKHSs', 'Collaborative Filtering', 'Hidden', 'TreeNetreg', 'Regional Homogeneity ReHo', 'Extreme Learning Machine Autoencoder', 'NOP', 'Variational Bayesian', 'RSVM', 'CEAS', 'Deep Neural Network', 'Hopfield', 'Golay', 'Poisson', 'ARS', 'Cooley', 'Mahony', 'System Analysis', 'Computer Science', 'RG', 'RMSProp', 'Sound', 'Google', 'Siamese', 'CRBM', 'RBMs', 'Cox', 'Random Connectivity', 'LASSO', 'Original Research', 'SPHC', 'TREC', 'DMOS', 'LKB', 'Dirichlet', 'Linear', 'Hybrid Radial Basis Kernel', 'IFFT', 'Hybrid Kernel Support Vector Machine', 'Japanese', 'Logistic', 'Middlebury', 'TNDM', 'Context', 'Island Code Transformation', 'Virtex II', 'Maximum Entropy Discriminant', 'DBSCAN', 'Android', 'AUV', 'UAVs', 'Riemannian', 'VSGML', 'Second', 'Structured Sparsity', 'SVM LinSVM', 'SQUARE', 'Hierarchical Dirichlet Process', 'Naive Bayes', 'RPCA', 'AFM', 'Timed Regular Expression Mining', 'AP', 'stochastic gradient descent', 'Cuckoo Search AlgorithmCSA', 'Link Latent Dirichlet Allocation LinkLDA', 'Hierarchical Machine Translation Model', 'reinforcement learning', 'OP2', 'DBM', 'semantic relevance', 'DWT', 'SAS Perl Regular Expression', 'Hierarchical Clustering Agglomerative Hierarchical Clustering', 'GDNSA', 'Pearson Correlation Coefficient', 'Normal Recovery Collaborative', 'linear regression', 'Nehorai', 'Short', 'Support Vector Machine', 'Profit Sharing', 'SMIB', 'Radial', 'SVGD', 'ICI', 'SpatDensReg', 'MEM', 'MCMC Niepert', 'PICR', 'TDCM', 'random forest', 'HRKSVM', 'OWASP Core Rule Set', 'Recurrent Neural Networks RNNs', 'Dynamic', 'Various', 'UMH', 'Minimum', 'Box', 'Instability', 'Fast Fourier Transform', 'User', 'Lib', 'Stochastic Gradient', 'Performance Evaluation', 'Generational Feature', 'Variational Autoencoder', 'Feature Elimination', 'Cubature Kalman Filter', 'MAB', 'NVRAM', 'Bayes', 'Horn', 'IR', 'FPGA', 'DLF', 'LSI', 'Radial Basis Functions', 'Simhash', 'basedonField Programmable', 'Random Forest RF', 'IDS', 'HDN', 'Volterra', 'ANN', 'Petroleum Engineering Department', 'DFT', 'Linear Regression', 'LibSVM', 'TSA', 'Computer', 'HSEG', 'THE', 'AMF']
print(len(array))
for x in array:
    print(x)

807
SDR
Discrete Fourier Transform
MNIST
ACA
UCI Machine
Bloom
KNN
Scopus
Optimization
Gregex
Cubature Kalman
Sciences
ArcFace
CMOS
Hybrid Hierarchical Agglomerative Clustering
PBtree
Minimum Neighborhood Rough Sets
VBNMF
GIS
LGAE
Oracle10g
DFS
WSN
NLMS
LSH
ERD
MatchPy
DCASE2016
TWDA
Laguerre
Image Retrieval CBIR
Cloud
MHPCs
Jenkins
SCEBSS
Text
IMT
SFCscore
IIR
autoencoder
Ontologies
LFP
DNN
Stochastic Meta
Radial Basis Function
EWF
DSP
Suffix
Universitas
PCC
Monte Carlo
WEC
AdaGrad
Softmax Classification
Pairwise Latent Dirichlet Allocation
ASR
University
Java
BOW
SPSS
Softmax
Developed
Partial
FCDA
MTRF
Regularized Deep Autoencoder
SISO
Exponentiated Gumbel
Hybrid Extreme Random Forest
Error
SBADF
LVQ
Architecture Analysis
NEOCIVET
EPSO
adaptive filter
PSF
Marginal Loss
MIML
ISA
MEA
Wiener
Signal
LIB
Deep Neural Networks DNN
FLWOR
Support Vector
Support Vector Machines
CELP
Ant Colony Optimization
Artificial Bee Algorithm
DCT
Polyak
Long
Compared
ELM
HGP
Microsoft
IQMs
RFID
RSTRARela

# Doc2vec tests

In [15]:
import gensim
doc2vec_model = gensim.models.Doc2Vec.load('embedding_models/doc2vec.model')

In [37]:
from gensim.utils import simple_preprocess
tokens = simple_preprocess("Combined with principal component analysis, the total training for ten one against the-rest classifiers on MNIST took just 0.77 hours.")
print(tokens)
new_vector = doc2vec_model.infer_vector(tokens)
sims = doc2vec_model.docvecs.most_similar([new_vector], topn=3)
print(sims)

['combined', 'with', 'principal', 'component', 'analysis', 'the', 'total', 'training', 'for', 'ten', 'one', 'against', 'the', 'rest', 'classifiers', 'on', 'mnist', 'took', 'just', 'hours']
[(38122746, 0.6723868250846863), (16402600, 0.6712477207183838), (39632846, 0.6673611402511597)]


In [23]:
len(doc2vec_model.docvecs)

40139652

In [6]:
doc2vec_model.docvecs[30431524]

array([-0.03852049, -0.0665906 , -0.04628057, -0.06231441, -0.03460921,
       -0.03901168, -0.06382208,  0.08229337,  0.00271638,  0.01581776,
        0.05019219,  0.1256509 , -0.03815013, -0.05365263, -0.05448654,
       -0.01666151, -0.01677482, -0.05219691,  0.04517033,  0.1090557 ,
        0.12321562,  0.08372267, -0.2060628 , -0.01822349, -0.07854325,
        0.01112043,  0.07415384, -0.10915   ,  0.06760772,  0.04235146,
       -0.04535627,  0.03000781, -0.01149387, -0.03809126, -0.06970099,
        0.01360476,  0.03218273,  0.11050344,  0.00869558,  0.04056333,
       -0.0830144 , -0.02412082, -0.03001807, -0.11603937,  0.01732241,
        0.02701879, -0.01442215,  0.05145655, -0.06035341,  0.06090457,
       -0.08889908,  0.00640001,  0.04755795,  0.06097076, -0.15226342,
        0.04331081,  0.0846197 ,  0.12408043, -0.03757742, -0.05635827,
        0.05589816, -0.02187142, -0.00418653, -0.04294994, -0.07213835,
       -0.02162061,  0.08874757, -0.01664794,  0.04523556, -0.05

In [10]:
new_vector

array([-0.4584659 ,  1.242882  , -0.20461333, -0.13547745,  0.17811774,
       -0.7919226 ,  0.16580945,  0.5773504 ,  0.27312937, -0.4790782 ,
        0.66863424,  0.35208142,  0.19789323,  0.15972097, -0.07386668,
       -0.46394625, -0.5921684 , -0.08119006, -0.11479389,  0.73383313,
        0.1443812 ,  0.29893023, -0.5370209 ,  0.09367172, -0.4327738 ,
        0.22623162,  0.71356255, -0.55759346, -0.840005  ,  0.28527948,
       -0.4888846 , -0.7491185 , -0.53943795, -0.24224772, -0.15615904,
       -0.5730955 , -0.14364181,  0.07767376, -0.47759956, -0.66972166,
       -0.4413844 ,  0.8561549 ,  0.08969028,  0.267796  , -0.04675876,
       -0.37878293, -0.9136244 , -0.83220655, -0.00457794,  0.38117346,
       -0.25626647,  0.3553839 , -0.5126471 ,  0.38053483, -0.627039  ,
       -0.5737841 ,  0.18739061, -0.02421715, -0.46818072,  0.10705938,
       -0.34396693, -0.03810226,  0.35623017,  0.39787683,  0.386166  ,
        0.27503106,  0.27898815, -0.10873193, -0.4889278 , -0.01