In [1]:
import spacy
from scipy import spatial
from collections import Counter

In [2]:
# Install the medium or large model package so we can have word vectors
# !python -m spacy download en_core_web_md

In [3]:
# Load the medium English model ("en_core_web_md"). The medium (or large)
# package is used so that word vectors are available to the later cells.
nlp = spacy.load("en_core_web_md")

In [4]:
# Path of the .ttl file whose contents will be analysed with nlp
filename_input = './files/Infrastructure.ttl'

# Read the whole file into memory as a single UTF-8 decoded string
with open(filename_input, mode='r', encoding='utf8') as ttl_file:
    text = ttl_file.read()

**Goal:** To find, using cosine similarity, words in the nlp vocabulary that are semantically similar to the 10 most common words in a .ttl file.

In [5]:
# Drop every line that starts with '#', '@' or '.' and keep all the
# remaining lines, in their original order, in a list.
# (Presumably these prefixes mark Turtle comments, directives and
# terminators - confirm against the input file.)
skipped_prefixes = ('#', '@', '.')
lines = [row for row in text.splitlines() if not row.startswith(skipped_prefixes)]

In [6]:
# Reserved vocabulary / namespace terms (SKOS plus related ones such as
# rdf, rdfs, owl, xsd, dcterms and vaem). Tokens equal to any of these are
# excluded from the word counts.
skos = [
    'skos', 'Class', 'example', 'dataElementLabel', 'acronym', 'broader',
    'narrower', 'vaem', 'prefix', 'dcterms', 'owl', 'termSubject',
    'prefLabel', 'bibliographicCitation', 'imports', 'versionInfo',
    'string', 'rdfs', 'label', 'subPropertyOf', 'subClassOf',
    'anySimpleType', 'code', 'definition', 'dtype', 'topConceptOf',
    'hasTopConcept', 'rdf', 'type', 'unionOf', 'intersectionOf',
    'historyNote', 'xsd',
]

In [7]:
# Filter all the words (tokens) in each line. A token is counted only if it
# is alphabetic, is not a stop word, is not out-of-vocabulary for the model
# and is not one of the reserved terms in `skos`. Everything else is discarded.
#
# NOTE: counting uses the raw token text, so it is case-sensitive
# (e.g. 'Infrastructure' and 'infrastructure' are separate entries).

# Build the lookup set once: membership tests on a set are O(1), whereas
# testing against the list rescans it for every token.
reserved_terms = set(skos)

cnt = Counter()
# nlp.pipe() processes the lines as a batched stream and yields one Doc per
# input line, in order; this avoids the per-call overhead of nlp(line).
for doc in nlp.pipe(lines):
    for token in doc:
        if (token.is_alpha
                and not token.is_stop
                and not token.is_oov
                and token.text not in reserved_terms):
            cnt[token.text] += 1

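**Note**: It may be a better choice to store the token's `.lemma_` in the counter instead of its raw text, so that inflected forms of the same word (e.g. "System" and "Systems") are counted together. This has not been decided yet.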

In [8]:
# Display the 10 most frequent tokens in the counter as (token, count)
# pairs, ordered from the most to the least frequent
cnt.most_common(10)

[('Systems', 123),
 ('Infrastructure', 45),
 ('Airspace', 43),
 ('Atmosphere', 38),
 ('Transportation', 32),
 ('Air', 19),
 ('Activity', 13),
 ('System', 13),
 ('infrastructure', 13),
 ('Facilities', 12)]

In [9]:
# Keep just the words (dropping their counts) of the 10 most frequent tokens
most_common_words = [word for word, _count in cnt.most_common(10)]

In [10]:
def cosine_similarity(word1, word2):
    '''Calculates the cosine similarity of two word vectors.

    Both words are looked up in ``nlp.vocab`` and the similarity is computed
    as ``1 - cosine distance`` of their ``.vector`` attributes.

    Args:
        word1: text of the first word.
        word2: text of the second word.

    Returns:
        The cosine similarity of the two vectors. If either vector is all
        zeros (presumably a word without an entry in the vector table) the
        cosine is undefined, so 0.0 is returned instead of NaN.
    '''
    w1 = nlp.vocab[word1].vector
    w2 = nlp.vocab[word2].vector
    # An all-zero vector has zero norm: the cosine distance would divide by
    # zero and produce NaN (with a RuntimeWarning), and NaN scores break any
    # later sorting by similarity. Treat that case as "no similarity".
    if not w1.any() or not w2.any():
        return 0.0
    return 1 - spatial.distance.cosine(w1, w2)

In [11]:
# Score the entries of nlp.vocab by their cosine similarity to one of the
# most common words. Only entries that are alphabetic, lowercase and have a
# vector are scored; each result is stored as a (word, similarity) pair.
# For testing purposes the reference word 'common' is fixed to 'Airspace'.
common = 'Airspace'

computed_similarities = [
    (lexeme.text, cosine_similarity(lexeme.text, common))
    for lexeme in nlp.vocab
    if lexeme.is_alpha and lexeme.is_lower and lexeme.has_vector
]

In [12]:
# Order the (word, similarity) pairs from the highest to the lowest
# similarity; pairs with equal scores keep their original relative order
computed_similarities = sorted(
    computed_similarities, key=lambda pair: pair[1], reverse=True)

# Print the 10 words with the highest similarity
print([word for word, _score in computed_similarities[:10]])

['airframes', 'airworthy', 'turboprop', 'monoplane', 'airfield', 'airspace', 'refuelling', 'airframe', 'aicraft', 'airborne']
