In [10]:
import spacy
import os

In [11]:
def check_word_validity(word):
    if word.is_alpha and word.text.lower() not in nlp.Defaults.stop_words:
        return True
    else:
        return False


def extract_sample_from_file(file_name, character_count, start_position=0):
    possible_encodings = ['utf-8', 'latin-1', 'windows-1252']
    unprocessed_text = ''
    for encoding in possible_encodings:
        try:
            with open(file_name, 'r', encoding=encoding) as f:
                f.seek(start_position)
                text = f.read(character_count)
                unprocessed_text += text
            doc = nlp(text)
            # Check and remove the first token if it's not a valid word
            if check_word_validity(doc[0]):
                print(doc[0])
                doc = doc[1:]

            # Check and remove the last token if it's not a valid word
            if check_word_validity(doc[-1]):
                doc = doc[:-1]
            return doc, unprocessed_text
        except UnicodeDecodeError:
            continue

In [12]:
nlp = spacy.load('en_core_web_md')
#  You read a small  extract of the file sherlock_homes.txt
sherlock_homes_sample, unprocessed_sherlock_homes_sample = extract_sample_from_file(os.path.join("../data","sherlock_homes.txt"), character_count=50000)
# take a small extraact from the file social_new_orleans.txt
social_new_orleans_sample, unprocessed_social_new_orleans_sample = extract_sample_from_file(os.path.join("../data","social_new_orleans.txt"), character_count=50000)
# take a small extraact from the file the_lindsays.txt
the_lindsays_sample, unprocessed_the_lindsays_sample = extract_sample_from_file(os.path.join("../data","the_lindsays.txt"), character_count=50000)

In [13]:
# print the number of tokens in each sample
print("Sherlock Homes sample contains {} tokens".format(len(sherlock_homes_sample)))
print("Social New Orleans sample contains {} tokens".format(len(social_new_orleans_sample)))
print("The Lindsays sample contains {} tokens".format(len(the_lindsays_sample)))


Sherlock Homes sample contains 11659 tokens
Social New Orleans sample contains 10809 tokens
The Lindsays sample contains 11368 tokens


In [14]:
# compare the similarity of the sherlock_homes_sample with the social_new_orleans_sample
print("Similarity between sherlock_homes_sample and social_new_orleans_sample")
print(sherlock_homes_sample.similarity(social_new_orleans_sample))



Similarity between sherlock_homes_sample and social_new_orleans_sample
0.9631454517552418


In [15]:
# compare the similarity of the sherlock_homes_sample with the the_lindsays_sample
print("Similarity between sherlock_homes_sample and the_lindsays_sample")
print(sherlock_homes_sample.similarity(the_lindsays_sample))

Similarity between sherlock_homes_sample and the_lindsays_sample
0.9683116425927687


In [16]:
# compare the similarity of the social_new_orleans_sample with the the_lindsays_sample
print("Similarity between social_new_orleans_sample and the_lindsays_sample")
print(social_new_orleans_sample.similarity(the_lindsays_sample))

Similarity between social_new_orleans_sample and the_lindsays_sample
0.9913071096044772


In [17]:
# Compare also the first 100 tokens of each document with each other.
first_100_tokens_sherlock_homes_sample = sherlock_homes_sample[:100]
first_100_tokens_social_new_orleans_sample = social_new_orleans_sample[:100]
print("Similarity between first_100_tokens_sherlock_homes_sample and first_100_tokens_social_new_orleans_sample")
print(first_100_tokens_sherlock_homes_sample.similarity(first_100_tokens_social_new_orleans_sample))

Similarity between first_100_tokens_sherlock_homes_sample and first_100_tokens_social_new_orleans_sample
0.9264501929283142


In [18]:
first_100_tokens_sherlock_homes_sample = sherlock_homes_sample[:100]
first_100_tokens_the_lindsays_sample = the_lindsays_sample[:100]
print("Similarity between first_100_tokens_sherlock_homes_sample and first_100_tokens_the_lindsays_sample")
print(first_100_tokens_sherlock_homes_sample.similarity(first_100_tokens_the_lindsays_sample))

Similarity between first_100_tokens_sherlock_homes_sample and first_100_tokens_the_lindsays_sample
0.9284713268280029


In [19]:
first_100_tokens_social_new_orleans_sample = social_new_orleans_sample[:100]
first_100_tokens_the_lindsays_sample = the_lindsays_sample[:100]
print("Similarity between first_100_tokens_social_new_orleans_sample and first_100_tokens_the_lindsays_sample")
print(first_100_tokens_social_new_orleans_sample.similarity(first_100_tokens_the_lindsays_sample))

Similarity between first_100_tokens_social_new_orleans_sample and first_100_tokens_the_lindsays_sample
0.9858769178390503


In [20]:
def get_token_index_from_character_index(doc, character_index):
    for token in doc:
        if character_index >= token.idx and character_index < token.idx + len(token):
            return token.i
    return None

In [21]:
import spacy
from spacy.tokens import  Span

# Load a blank English model
nlp = spacy.blank("en")

# Add a new entity label for 'PERSON'
# ner = nlp.add_pipe('ner')
# ner.add_label("PERSON")

name = "Paul Chiteri Ashioya"

# Create a Doc object with the name
doc = nlp.make_doc(f"My name is not {name}")

# Create a Span for the 'PERSON' entity
start_index = doc.text.find(name)

token_index = get_token_index_from_character_index(doc, start_index)

person_span = Span(doc, token_index, (token_index + len(name.split())), label="PERSON")

# Set the 'PERSON' entity on the Doc
doc.ents = [person_span]

# Print the Person entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "PERSON"])


[('Paul Chiteri Ashioya', 'PERSON')]
