In [1]:
!pip install -U ffast

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
from ffast import load

### Base model (WordNet)

In [3]:
# Load the tokeniser with no model name given. The recorded output shows NLTK
# checking its stopwords and wordnet packages during this call.
tokeniser = load()

[nltk_data] Downloading package stopwords to /Users/mohammedterry-
[nltk_data]     jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mohammedterry-
[nltk_data]     jack/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Token IDs

In [4]:
# Encode a sentence, then display the integer id assigned to each token.
outputs = tokeniser.encode("this is a test")
outputs.ids

  log_a = np.log(np.array(a, dtype=dtype))


[117797, 117730, 117659, 105137]

In [5]:
# Map a list of ids back to tokens; str() renders them as the sentence text.
# NOTE(review): the last id here is 105134, while encode() above returned
# 105137 for "test" — confirm whether the different id is intentional.
outputs = tokeniser.decode([117797, 117730, 117659, 105134])
str(outputs)

'this is a test'

### Other Information

In [6]:
# Show the full repr of the decoded tokens: one record per token with its
# text, morphology, phonology, id, tag, related-word sets, definition and example.
outputs


        text = this
        morphology = this
        phonology = 0S
        id = 117797
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = is
        morphology = is
        phonology = IS
        id = 117730
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = a
        morphology = a
        phonology = A
        id = 117659
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = test
        morphology = test
        phonology = TST
        id = 105134
        tag = Verb
        similar = set()
        opposite = set()
        related = set()
        sema

### Compound words (e.g. 'big shot')

In [7]:
# The recorded output shows "big shot" kept as one Noun token rather than
# being split into "big" and "shot".
tokeniser.encode("the big shot")

  for synset in acyclic_breadth_first(self, rel, depth):



        text = the
        morphology = the
        phonology = 0
        id = 117788
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = big shot
        morphology = big shot
        phonology = BK XT
        id = 11031
        tag = Noun
        similar = {'big_fish', 'big_gun', 'head_honcho', 'big_cheese', 'big_enchilada', 'big_wheel', 'big_shot', 'big_deal'}
        opposite = set()
        related = {'knocker', 'colloquialism', 'supremo'}
        semantics = {'living_thing.n.01', 'entity.n.01', 'adult.n.01', 'physical_entity.n.01', 'organism.n.01', 'important_person.n.01', 'object.n.01', 'causal_agent.n.01', 'person.n.01', 'whole.n.02'}
        definition = an important influential person
        example = he thinks he's a big shot; she's a big deal in local politics; the Qaeda commander is a very big fish
        

### Filtering tokens (e.g. for Entity Extraction)

In [8]:
# Sample sentence reused by the filtering cells below; per their outputs it
# mixes stop words, a compound ("space station") and an unknown word ("redfox").
outputs = tokeniser.encode("i will fly to the nasa space station now redfox")
str(outputs)

'i will fly to the nasa space station now redfox'

In [9]:
# One string per token; multi-word tokens such as "space station" stay together.
[str(token) for token in outputs.tokens]

['i', 'will', 'fly', 'to', 'the', 'nasa', 'space station', 'now', 'redfox']

In [10]:
# Drop tokens the model does not recognise ("redfox" is gone in the output).
str(outputs.skip_unknowns())

'i will fly to the nasa space station now'

In [11]:
# Drop stop words; the output keeps the remaining words, including "redfox".
str(outputs.skip_stopwords())

'fly nasa space station redfox'

In [12]:
# Keep only the noun tokens ("nasa" and "space station" in the output).
str(outputs.nouns())

'nasa space station'

In [13]:
# Keep only the verb tokens ("fly" in the output).
str(outputs.verbs())

'fly'

In [14]:
# For this sentence the result matches skip_stopwords() above, with the
# unknown word "redfox" included.
str(outputs.entities())

'fly nasa space station redfox'

### Disambiguation (e.g. "fast")

In [15]:
# Index 0 is the first token, "fast"; in this context the output shows the
# Noun sense "abstaining from food".
tokeniser.encode("fast from food in Ramadan")[0]


        text = fast
        morphology = fast
        phonology = FST
        id = 38980
        tag = Noun
        similar = {'fasting'}
        opposite = set()
        related = {'diet', 'Ramadan', 'hunger_strike', 'dieting'}
        semantics = {'act.n.02', 'entity.n.01', 'event.n.01', 'control.n.05', 'abstraction.n.06', 'abstinence.n.02', 'psychological_feature.n.01', 'self-denial.n.02', 'activity.n.01'}
        definition = abstaining from food
        example = 
        

In [16]:
# Index -1 is the last token, again "fast"; here the output shows an
# Adjective sense with a different id from the Noun sense above.
tokeniser.encode("i'm going too fast")[-1]


        text = fast
        morphology = fast
        phonology = FST
        id = 41354
        tag = Adjective
        similar = {'quick', 'flying'}
        opposite = set()
        related = {'hurried'}
        semantics = set()
        definition = hurried and brief
        example = paid a flying visit; took a flying glance at the book; a quick inspection; a fast visit
        

### Paraphrasing Permutations

In [17]:
# Print every paraphrase of the sentence. In the recorded output each variant
# swaps "big shot" for one of its `similar` entries, followed by the original.
for variant in tokeniser.encode("he is a big shot").paraphrase():
    print(variant)

he is a big_fish 
he is a big_gun 
he is a head_honcho 
he is a big_cheese 
he is a big_enchilada 
he is a big_wheel 
he is a big_shot 
he is a big_deal 
he is a big shot


### Fast Token Comparisons 

In [18]:
# Take the first (only) token for "tuna" and display it; it is the query
# token for the comparison in the next cell.
tuna = tokeniser.encode("tuna")[0]
tuna


        text = tuna
        morphology = tuna
        phonology = TN
        id = 108939
        tag = Noun
        similar = {'Anguilla_sucklandii'}
        opposite = set()
        related = set()
        semantics = {'soft-finned_fish.n.01', 'bony_fish.n.01', 'living_thing.n.01', 'aquatic_vertebrate.n.01', 'entity.n.01', 'teleost_fish.n.01', 'whole.n.02', 'physical_entity.n.01', 'vertebrate.n.01', 'animal.n.01', 'fish.n.01', 'organism.n.01', 'object.n.01', 'chordate.n.01', 'eel.n.02'}
        definition = New Zealand eel
        example = 
        

In [19]:
# Slice out all tokens of the sentence (presumably a list of token objects —
# TODO confirm) and pick the one closest to `tuna`; the recorded result is
# the compound "fish and chips".
tokens = tokeniser.encode("i ate fish and chips on the weekend")[:]
tuna.most_similar(tokens)


        text = fish and chips
        morphology = fish and chip
        phonology = FX ANT XPS
        id = 40416
        tag = Noun
        similar = {'fish_and_chips'}
        opposite = set()
        related = set()
        semantics = {'matter.n.03', 'entity.n.01', 'physical_entity.n.01', 'dish.n.02', 'nutriment.n.01', 'substance.n.07', 'food.n.01'}
        definition = fried fish and french-fried potatoes
        example = 
        

### Lightweight Contextual Sentence Embeddings (dot similarity)

In [20]:
# Encode a sentence and display the shape of its sentence-level vector
# (one-dimensional in the recorded output).
outputs = tokeniser.encode("this is a test")
outputs.vector.shape

(1178620,)

In [21]:
# Compare `outputs` ("this is a test") against three candidate sentences and
# keep the closest one; the recorded result is "this is an exam".
best = outputs.most_similar([
    tokeniser.encode("test this is"),
    tokeniser.encode("this is an exam"),
    tokeniser.encode("bla bla bla food")
])
str(best)

'this is an exam'

In [22]:
# Same comparison used as a small retrieval example: pick the candidate
# sentence that best matches the question.
best = tokeniser.encode("How big is London").most_similar([
    tokeniser.encode("London has 9,787,426 inhabitants at the 2011 census"),
    tokeniser.encode("London is known for its financial district"),
])
str(best)

  for synset in acyclic_breadth_first(self, rel, depth):


'London has 9,787,426 inhabitants at the 2011 census'

### Alternatively: use the Poincaré model

In [23]:
# Switch to the "poincare" model. This rebinds `tokeniser`, so every cell
# below uses this model instead of the one loaded earlier with load().
tokeniser = load("poincare")

In [24]:
# "asdasDS" is a made-up word: the output shows morphology <Unknown> for it.
# Token reprs from this model list only text, morphology, phonology and id.
outputs = tokeniser.encode("this is a asdasDS")
outputs

  log_a = np.log(np.array(a, dtype=dtype))



        text = this
        morphology = this
        phonology = 0S
        id = 20
        

        text = is
        morphology = is
        phonology = IS
        id = 6
        

        text = a
        morphology = a
        phonology = A
        id = 5
        

        text = asdasDS
        morphology = <Unknown>
        phonology = ASTSTS
        id = 50001
        

In [25]:
# Drop the unrecognised token; only "this is a" remains in the output.
str(outputs.skip_unknowns())

'this is a'

In [26]:
# Decode the ids shown in the output above; the id of the unknown word
# (50001) comes back as the placeholder "<Unknown>", not the original text.
str(tokeniser.decode([20,6,5,50001]))

'this is a <Unknown>'

In [27]:
# Re-create the "tuna" query token with the poincare tokeniser (this rebinds
# `tuna`, which previously held the token from the first model).
tuna = tokeniser.encode("tuna")[0]
tuna


        text = tuna
        morphology = tuna
        phonology = TN
        id = 18955
        

In [28]:
# With this model `semantics` exposes a shape (a 1-D vector in the output),
# whereas the first model's token repr displayed it as a set of names.
tuna.semantics.shape

(100,)

In [29]:
# Tokens to compare against `tuna` in the next cell.
other = tokeniser.encode("i ate fish and chips on the weekend")[:]

In [30]:
# Closest token to `tuna`; the recorded output is the single word "fish",
# not the compound "fish and chips" returned with the first model.
tuna.most_similar(other)


        text = fish
        morphology = fish
        phonology = FX
        id = 1671
        

In [31]:
# Repeat the sentence comparison with the poincare tokeniser; the recorded
# result is again "this is an exam".
best = tokeniser.encode("this is a test").most_similar([
    tokeniser.encode("test this is"),
    tokeniser.encode("this is an exam"),
    tokeniser.encode("bla bla bla food")
])
str(best)

'this is an exam'

In [32]:
# Shape of the selected sentence's vector under this model.
best.vector.shape

(1240,)