In [6]:
import nltk
# Fetch the NLTK data packages that the notebook later reads through
# nltk.corpus (`stopwords` and `genesis`).
nltk.download('stopwords')
nltk.download('genesis')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pwierzgala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     /home/pwierzgala/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.


True

# Fasttext

In [8]:
from gensim.models.fasttext import FastText
from nltk.corpus import genesis, stopwords

def preprocess(sentence, stop_words):
    """Clean one tokenized sentence.

    Tokens that are not purely alphabetic are dropped, the remaining tokens
    are lowercased, and any token whose lowercased form is in ``stop_words``
    is removed.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of a single sentence.
    stop_words : container of str
        Lowercase words to filter out.

    Returns
    -------
    list of str
        The surviving lowercased tokens, in their original order.
    """
    cleaned = []
    for token in sentence:
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        cleaned.append(lowered)
    return cleaned


# Stop-word set: NLTK's English list extended with extra words to filter out
# of the King James text, including archaic forms such as "hath" and "unto".
stop_words = set(stopwords.words('english')).union([
    "that", "cannot", "thus", "here", "where", "whether",
    "hath", "unto", "neither", "wherefore", "whither",
    "could", "thence", "mayest", "therein", "wherein",
])

# Tokenized sentences of the corpus, each cleaned with preprocess().
sentences = [
    preprocess(tokens, stop_words)
    for tokens in genesis.sents('english-kjv.txt')
]

# Train FastText on the cleaned sentences. The keyword names (`iter`, `size`)
# are those of the gensim API this notebook was written against; `sg=1`
# selects skip-gram training per the gensim documentation, and `size=10`
# yields the 10-component vectors shown further down.
ft_model = FastText(sentences=sentences, sg=1, iter=50, size=10)

## Model vocabulary

In [9]:
# First five vocabulary entries, in the vocabulary's own iteration order.
list(ft_model.wv.vocab)[:5]

['beginning', 'god', 'created', 'heaven', 'earth']

## Word vector

In [11]:
# Look up the trained vector for a single word; the model was built with
# size=10, so the printed array has 10 components.
print(ft_model.wv["blood"])

[-0.1857974   0.82725245  0.08990229 -0.58054435  0.67336184 -1.4477103
  0.66258544  0.02946591 -1.3454525   0.69255394]


## Most similar words

In [12]:
# Ask for the three closest words to "serpent"; the result is a list of
# (word, similarity score) pairs.
ft_model.wv.most_similar(positive=["serpent"], topn=3)

[('woman', 0.9586037993431091),
 ('surely', 0.9291962385177612),
 ('sorrow', 0.9276148080825806)]

## Word similarity

In [13]:
# Similarity score between the vectors of two words (a single float).
ft_model.wv.similarity(w1="lord", w2="god")

0.92658615

## Mismatched word

In [14]:
# str.split() already returns a list, so it is passed straight through.
ft_model.wv.doesnt_match("serpent tree fruit egypt".split())

'egypt'

## Check if word exists in model

In [15]:
# Membership test against the trained vocabulary mapping; in the recorded
# run "cat" is absent and "cattle" is present.
print("cat" in ft_model.wv.vocab)
print("cattle" in ft_model.wv.vocab)

False
True


## Check if vector exists in model

In [None]:
# Membership is tested on the keyed-vectors object itself rather than on its
# vocab mapping.
# NOTE(review): for FastText this presumably reports whether a vector can be
# produced for the word (e.g. from character n-grams), even when the word is
# out of vocabulary. No output is recorded for this cell, so confirm against
# the installed gensim version.
print("cat" in ft_model.wv)
print("cattle" in ft_model.wv)

## Pretrained vectors

In [None]:
from gensim.models import KeyedVectors

# Create language model from the pretrained vectors file.
# Download pretrained vectors from: https://fasttext.cc/docs/en/english-vectors.html
en_model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')

# Word used for the demo lookups below. It is defined in this cell so the
# cell runs on a fresh kernel instead of relying on a `words` list that is
# only created by a later cell.
sample_word = "king"
sample_vector = en_model[sample_word]

# Printing dimension of a word vector.
print("Dimension of a word vector: {}".format(len(sample_vector)))

# Print vector of a word.
print("Vector components of a word: {}".format(sample_vector))

In [None]:
def print_related(vec, *words):
    """
    Print the words closest to a vector, then the single best match.

    Parameters
    ----------
    vec : ndarray
        Vector representing a word for which similar words will be displayed.
    *words : str
        Words that should be excluded from similar words (typically the
        words the vector was built from).

    Notes
    -----
    Neighbours are looked up in the notebook-level ``en_model``. If every
    returned neighbour is excluded, ``None`` is printed as the best match
    instead of raising ``IndexError``.
    """
    neighbours = en_model.similar_by_vector(vec)
    # Each neighbour is a (word, similarity) pair; keep only the words.
    similar_words = [word for word, _score in neighbours if word not in words]
    print(similar_words)
    print(similar_words[0] if similar_words else None)

# Word analogies via vector arithmetic. Each comment reads "a - b = ? - c",
# so the unknown word is the one closest to a - b + c. The three input words
# are passed to print_related() so they are excluded from the answer.

# king - man = ? - woman
vec = en_model["king"] - en_model["man"] + en_model["woman"]
words = ["king", "man", "woman"]
print_related(vec, *words)

# book - reading = ? - watching
vec = en_model["book"] - en_model["reading"] + en_model["watching"]
words = ["book", "reading", "watching"]
print_related(vec, *words)

# walk - walked = ? - went
vec = en_model["walk"] - en_model["walked"] + en_model["went"]
words = ["walk", "walked", "went"]
print_related(vec, *words)

# paris - france = ? - poland
vec = en_model["paris"] - en_model["france"] + en_model["poland"]
words = ["france", "paris", "poland"]
print_related(vec, *words)