In [None]:
# Install fastText as a python library.
# The repository is cloned as well because the model download cell runs its
# download_model.py script.
!git clone https://github.com/facebookresearch/fastText.git
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install fastText

In [None]:
# download pretrained word embeddings (english, crosslingual) from fasttext
!python fastText/download_model.py en
# -p: do not complain when models/ already exists (re-running this cell)
!mkdir -p models
!mv cc.en.300.bin models/
# -f: do not complain when the archive has already been removed
!rm -f cc.en.300.bin.gz

In [None]:
import numpy as np
import re
import fasttext as ft
# Load the pretrained model from models/, where the download cell moved it.
# The embed() helper below reads this module-level `model`.
model = ft.load_model("models/cc.en.300.bin")

In [None]:
def vectorize(text, tokenization_method="word", combination_method="mean"):
    """Turn a piece of text into a single combined embedding.

    The text is run through four stages: preprocess() -> tokenize() ->
    embed() -> combine_token_vectors().

    Args:
        text: the raw input string.
        tokenization_method: passed to both tokenize() and embed()
            ("word" by default).
        combination_method: passed to combine_token_vectors()
            ("mean" by default).

    Returns:
        Whatever combine_token_vectors() produces for the embedded tokens.
    """
    pieces = tokenize(preprocess(text), tokenization_method)
    piece_vectors = embed(pieces, tokenization_method)
    combined = combine_token_vectors(piece_vectors, combination_method)
    return combined

##### HELPER FUNCTIONS ######

def preprocess(text):
    """Normalize raw text following the fastText classification example.

    Same preprocessing as the fastText example:
    https://github.com/facebookresearch/fastText/blob/master/classification-example.sh#L14
    See the fastText docs for more on preprocessing:
    https://github.com/facebookresearch/fastText/tree/master/python#important-preprocessing-data--encoding-conventions

    Steps, applied in order:
      1. lower-case the text;
      2. surround ' . , ( ) ! ? with spaces so each becomes its own token;
      3. drop double quotes;
      4. turn "<br />" tags, ";" and ":" into a space;
      5. collapse every run of whitespace into a single space and trim the ends.

    Text should be converted to UTF-8 by the caller (not done here).

    Args:
        text: the input string.

    Returns:
        The normalized string.
    """
    # (old, new) pairs; the order is the same as in the referenced script.
    replacements = (
        ("'", " ' "),
        ('"', ""),
        (".", " . "),
        # A line-break tag separates words, so it has to become a space:
        # removing it outright would glue "foo<br />bar" into "foobar".
        ("<br />", " "),
        (",", " , "),
        ("(", " ( "),
        (")", " ) "),
        ("!", " ! "),
        ("?", " ? "),
        (";", " "),
        (":", " "),
    )
    normalized = text.lower()
    for old, new in replacements:
        normalized = normalized.replace(old, new)
    # split()/join squeezes the extra spaces introduced by the padding above.
    return " ".join(normalized.split())

# Regex alternation of the characters that end a sentence. Written as a raw
# string so the backslashes reach the regex engine without relying on Python
# passing unknown string escapes through (which emits a warning).
SENTENCE_DELIMITERS = r"\.|;|\!|\?"
def tokenize(text, method="word"):
    """Split text into the units that will be embedded.

    Args:
        text: the string to split.
        method: "word" delegates to ft.tokenize(text); "sentence" splits the
            text on SENTENCE_DELIMITERS.

    Returns:
        For "word", the result of ft.tokenize(text). For "sentence", a list of
        sentence strings with surrounding whitespace stripped; empty fragments
        (e.g. the one after a trailing delimiter) are dropped so they are not
        embedded as if they were sentences.

    Raises:
        ValueError: if method is neither "word" nor "sentence".
    """
    if method == "word":
        return ft.tokenize(text)
    if method == "sentence":
        fragments = (part.strip() for part in re.split(SENTENCE_DELIMITERS, text))
        return [fragment for fragment in fragments if fragment]
    raise ValueError(
        "unknown tokenization method %r (expected 'word' or 'sentence')" % (method,)
    )

def embed(list_of_strings, method="word"):
    """Embed each string with the module-level fastText `model`.

    Args:
        list_of_strings: the strings to embed (as produced by tokenize()).
        method: "word" embeds each string with model.get_word_vector();
            "sentence" embeds each string with model.get_sentence_vector().

    Returns:
        A list with one vector per input string, in input order. An empty
        input gives an empty list.

    Raises:
        ValueError: if method is neither "word" nor "sentence". Previously an
            unknown method silently produced an empty list.
    """
    # The method is the same for every string, so branch once, not per item.
    if method == "word":
        return [model.get_word_vector(s) for s in list_of_strings]
    if method == "sentence":
        return [model.get_sentence_vector(s) for s in list_of_strings]
    raise ValueError(
        "unknown embedding method %r (expected 'word' or 'sentence')" % (method,)
    )

def combine_token_vectors(vectors, method="mean"):
    """Merge a list of vectors into a single array.

    Args:
        vectors: a non-empty list of vectors (as produced by embed()).
        method: "mean" averages the vectors element-wise (np.mean over
            axis 0); "concatenate" joins them end to end (np.concatenate).

    Returns:
        The combined numpy array.

    Raises:
        ValueError: if method is neither "mean" nor "concatenate", or if
            `vectors` is empty. Without the explicit check an empty input
            would give NaN for "mean" and a numpy error for "concatenate".
    """
    if method not in ("mean", "concatenate"):
        raise ValueError(
            "unknown combination method %r (expected 'mean' or 'concatenate')"
            % (method,)
        )
    if len(vectors) == 0:
        raise ValueError("cannot combine an empty list of vectors")
    if method == "mean":
        return np.mean(vectors, axis=0)
    return np.concatenate(vectors)

In [None]:
# Example: vectorize a short query with the defaults
# (tokenization_method="word", combination_method="mean") and inspect its shape.
vector = vectorize("help me search for something")
vector.shape

(300,)