<h1> USING FASTTEXT FOR TOKEN AND SEQUENCE EMBEDDINGS </h1>

The folder [shared_resources/pretrained_fasttext](../shared_resources/pretrained_fasttext) contains the link to the pretrained fasttext model.

## Example: Loading the `embed_if_32` embedding model

In [None]:
## Make sure the gensim package is installed (e.g. `pip install gensim`)

from gensim.models import FastText
import gensim
 
# Load the pretrained FastText embedding model (not a tokenizer) from the
# shared resources folder.
PATH = '../shared_resources/pretrained_fasttext/embed_if_32.mdl'
embed_model = FastText.load(PATH)

### Similarity

In [None]:
# Print the embedding similarity of '>' against the assignment and the
# equality operator tokens.
for left_token, right_token in (('=', '>'), ('==', '>')):
    similarity = embed_model.wv.similarity(left_token, right_token)
    print(similarity)

### Most similar tokens

In [4]:
# Nearest neighbours of the query token; the result is a list of
# (token, similarity score) pairs.
query_token = 'len(x)'
most_similar = embed_model.wv.most_similar(query_token)
print(most_similar)

[('v_len', 0.8910567760467529), ('len_', 0.880039393901825), ('m5_len', 0.8571930527687073), ('len_w', 0.8483595252037048), ('x_len', 0.8370495438575745), ('lenlen', 0.8341297507286072), ('len', 0.8320168852806091), ('lenr', 0.8259851336479187), ('ar_len', 0.8248469233512878), ('d_len', 0.8217968940734863)]


### Embedding vector

In [5]:
# Look up and print the raw embedding vector of a single token.
keyword = 'for'
vector = embed_model.wv[keyword]
print(vector)

[  6.7529564   -9.964394    -1.8260309  -10.388263    -6.5207434
   1.9055498    4.285392   -11.140352     4.084665    -7.6023407
   8.614147    16.28056      1.5551897    8.734416    -3.3114815
   0.35531428  -6.8794856   -5.0623484  -11.065971    -7.802344
  -0.86469924   6.623688    -7.1245127   -5.279053     1.6626914
  -5.969906     2.29899     10.980888     3.3477707   -7.4630017
  -3.7319052   -0.6246684 ]


## Sequence embedding

In [6]:
from sklearn import preprocessing
import numpy as np

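# Parameters of vectorize_trim_pad:
#   sequences: list of sequences of tokens
#   embd: embedding model
#   embed_dim: embedding vector size
#   seq_length: number of tokens kept per sequence (default 48)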

def vectorize_code(tokens, model):
    """Return the embedding of every token, in input order.

    Args:
        tokens: iterable of tokens to look up.
        model: embedding model; each token is looked up via ``model.wv[token]``.

    Returns:
        A list with one ``model.wv[token]`` entry per input token.
    """
    return [model.wv[token] for token in tokens]

def vectorize_trim_pad(sequences, embd, embed_dim, seq_length = 48):
    """Embed token sequences and force each one to ``seq_length`` positions.

    Sequences longer than ``seq_length`` are truncated; shorter ones are
    right-padded with zero vectors of size ``embed_dim``.

    Args:
        sequences: iterable of token sequences (each must support ``len``
            and slicing).
        embd: embedding model, passed through to ``vectorize_code``.
        embed_dim: size of the zero vectors used for padding.
        seq_length: number of token positions kept per sequence.

    Returns:
        ``np.ndarray`` built from the per-sequence vector lists. Assuming the
        model's vectors have length ``embed_dim``, the shape is
        ``(number of sequences, seq_length, embed_dim)``. An empty
        ``sequences`` yields an empty array of shape
        ``(0, seq_length, embed_dim)``.
    """
    padded_stmts = []
    for seq in sequences:
        # Slicing is a no-op for sequences that are already short enough.
        seq_vec = vectorize_code(seq[:seq_length], embd)
        missing = seq_length - len(seq_vec)
        if missing > 0:
            seq_vec = seq_vec + [np.zeros(embed_dim) for _ in range(missing)]
        padded_stmts.append(seq_vec)

    if not padded_stmts:
        # np.array([]) would collapse to shape (0,); keep the 3-D layout so
        # callers can rely on the trailing dimensions even with no input.
        return np.zeros((0, seq_length, embed_dim))
    return np.array(padded_stmts)

In [7]:
# Demo: embed one tokenized statement, padded with zero vectors up to
# seq_length token positions.
embed_dim = 32
seq_length = 48

sequences = [["if", "x", ">", "0"]]
vectors = vectorize_trim_pad(sequences, embed_model, embed_dim, seq_length=seq_length)

In [8]:
# Display the padded embedding array as the cell output.
vectors

array([[[-21.96269417,  11.33774567,  12.3253231 , ..., -12.78934097,
           6.25265408,   6.54338169],
        [ -1.79927635,  10.91177177,  21.54956436, ...,  -2.67278123,
          11.17793274,  -3.15423632],
        [-31.85717583,  -6.4174633 ,  -1.22788358, ...,  -3.16708946,
          35.54802704,  11.04066658],
        ...,
        [  0.        ,   0.        ,   0.        , ...,   0.        ,
           0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        , ...,   0.        ,
           0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        , ...,   0.        ,
           0.        ,   0.        ]]])