In [1]:
from fast import load

In [2]:
# Build the shared tokeniser by calling load() with no arguments; every later cell reuses this object.
tokeniser = load()

### Token IDs

In [3]:
# Encode a sentence, then display the token IDs exposed on the result's `ids` attribute.
outputs = tokeniser.encode("this is a test")
outputs.ids

  log_a = np.log(np.array(a, dtype=dtype))


[117797, 117730, 117659, 105134]

In [5]:
# Round trip: decode the same four IDs that the encode cell reported.
# The cell ends in an assignment, so nothing is displayed here.
outputs = tokeniser.decode([117797, 117730, 117659, 105134])

In [6]:
# str() on the decoded result gives back the plain sentence text.
str(outputs)

'this is a test'

### Other Information

In [7]:
# Bare expression: display the result's repr, which prints one listing of fields per token.
outputs


        text = this
        morphology = this
        phonology = 0S
        id = 117797
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = is
        morphology = is
        phonology = IS
        id = 117730
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = a
        morphology = a
        phonology = A
        id = 117659
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = test
        morphology = test
        phonology = TST
        id = 105134
        tag = Verb
        similar = set()
        opposite = set()
        related = set()
        sema

### Compound words (e.g. 'big shot')

In [8]:
# Not assigned, so the repr is displayed directly; in the recorded output
# 'big shot' comes back as a single token rather than two.
tokeniser.encode("the big shot")

  for synset in acyclic_breadth_first(self, rel, depth):



        text = the
        morphology = the
        phonology = 0
        id = 117788
        tag = <StopWord>
        similar = set()
        opposite = set()
        related = set()
        semantics = set()
        definition = None
        example = None
        

        text = big shot
        morphology = big shot
        phonology = BK XT
        id = 11031
        tag = Noun
        similar = {'big_cheese', 'big_deal', 'big_wheel', 'big_fish', 'big_enchilada', 'head_honcho', 'big_gun', 'big_shot'}
        opposite = set()
        related = {'knocker', 'colloquialism', 'supremo'}
        semantics = {'causal_agent.n.01', 'important_person.n.01', 'living_thing.n.01', 'organism.n.01', 'entity.n.01', 'whole.n.02', 'person.n.01', 'physical_entity.n.01', 'object.n.01', 'adult.n.01'}
        definition = an important influential person
        example = he thinks he's a big shot; she's a big deal in local politics; the Qaeda commander is a very big fish
        

### Filtering tokens (e.g. for Entity Extraction)

In [9]:
# Sample sentence reused by the filtering cells that follow; str() shows it unfiltered.
outputs = tokeniser.encode("i am flying onto the nasa space station now redfox")
str(outputs)

'i am flying onto the nasa space station now redfox'

In [10]:
# View without unknown tokens; the recorded output drops 'onto' and 'redfox'.
str(outputs.skip_unknowns())

'i am flying the nasa space station now'

In [11]:
# View without stop words; the recorded output drops 'i', 'am', 'the' and 'now'.
str(outputs.skip_stopwords())

'flying onto nasa space station redfox'

In [12]:
# Keep only the tokens selected by nouns(); here that is 'nasa space station'.
str(outputs.nouns())

'nasa space station'

In [13]:
# Keep only the tokens selected by verbs(); here that is 'flying'.
str(outputs.verbs())

'flying'

In [14]:
# Keep only the tokens selected by entities(). In the recorded output this equals
# the verbs() result followed by the nouns() result — presumably entities are
# content words; TODO confirm the exact selection rule.
str(outputs.entities())

'flying nasa space station'

### Disambiguation (e.g. "fast")

In [16]:
# Two sentences that use "fast" in different senses (to abstain from food vs. quickly).
# NOTE(review): the reprs recorded in this notebook for outputs1[-3] and outputs2[-1]
# are identical (same id, tag and definition) — verify that the two senses are
# actually being distinguished before relying on this demo.
outputs1 = tokeniser.encode("i like to fast from food")
outputs2 = tokeniser.encode("i'm going too fast")

In [10]:
# Third token from the end, which is the word "fast" in the first sentence.
outputs1[-3]


        text = fast
        morphology = fast
        phonology = FST
        id = 38987
        tag = Adjective
        similar = set()
        opposite = set()
        related = {'causative'}
        semantics = set()
        definition = (of a photographic lens or emulsion) causing a shortening of exposure time
        example = a fast lens
        

In [11]:
# Last token, which is the word "fast" in the second sentence.
outputs2[-1]


        text = fast
        morphology = fast
        phonology = FST
        id = 38987
        tag = Adjective
        similar = set()
        opposite = set()
        related = {'causative'}
        semantics = set()
        definition = (of a photographic lens or emulsion) causing a shortening of exposure time
        example = a fast lens
        

### Paraphrasing Permutations

In [21]:
# Iterate over whatever paraphrase() yields and print each variant on its own line.
for variant in tokeniser.encode("i smell a rat").paraphrase():
    print(variant)

i taste a rat
i smack a rat
i smell a scab 
i smell a blackleg 
i smell a strikebreaker 
i smell a rat


### Fast Token Comparisons 

In [12]:
# Take the first token of the encoded single word and display its repr.
fish = tokeniser.encode("fish")[0]
fish

  for synset in acyclic_breadth_first(self, rel, depth):



        text = fish
        morphology = fish
        phonology = FX
        id = 40414
        tag = Noun
        similar = {'Pisces', 'Fish'}
        opposite = set()
        related = {'star_divination', 'astrology'}
        semantics = {'causal_agent.n.01', 'living_thing.n.01', 'organism.n.01', 'entity.n.01', 'whole.n.02', 'person.n.01', 'physical_entity.n.01', 'object.n.01'}
        definition = (astrology) a person who is born while the sun is in Pisces
        example = 
        

In [16]:
# `[:]` takes a full slice of the encoded result — presumably to get a plain
# sequence of tokens; TODO confirm what slicing returns.
tokens = tokeniser.encode("i ate a salmon pie on the weekend")[:]
# Ask the `fish` token to pick its closest match among those tokens.
# NOTE(review): the recorded winner is 'salmon', but its repr shows the colour sense
# while `fish` shows the astrology sense — confirm which senses are being compared.
fish.most_similar(tokens)


        text = salmon
        morphology = salmon
        phonology = SLMN
        id = 90581
        tag = Noun
        similar = set()
        opposite = set()
        related = set()
        semantics = {'color.n.01', 'chromatic_color.n.01', 'entity.n.01', 'property.n.02', 'attribute.n.02', 'abstraction.n.06', 'visual_property.n.01'}
        definition = a pale pinkish orange color
        example = 
        

### Lightweight Contextual Sentence Embeddings (dot similarity)

In [22]:
# Re-encode the test sentence and display its `vector` attribute (printed as an array below).
outputs = tokeniser.encode("this is a test")
outputs.vector

array([1.90929743, 1.        , 0.90962228, ..., 4.03114547, 1.94133054,
       3.11597023])

In [25]:
# Ask the encoded sentence to choose among three candidate encodings:
# a reordering of the same words, a paraphrase, and an unrelated sentence.
best = outputs.most_similar([
    tokeniser.encode("test this is"),
    tokeniser.encode("this is an exam"),
    tokeniser.encode("bla bla bla food")
])
# str() shows the text of the chosen candidate.
str(best)

'this is an exam'

In [26]:
# Question-to-sentence matching: pick which of two candidate sentences the
# question's encoding is most similar to, then show the winner's text.
best = tokeniser.encode("How big is London").most_similar([
    tokeniser.encode("London has 9,787,426 inhabitants at the 2011 census"),
    tokeniser.encode("London is known for its financial district"),
])
str(best)

  for synset in acyclic_breadth_first(self, rel, depth):


'London has 9,787,426 inhabitants at the 2011 census'