In [42]:
import spacy
import os 

nlp = spacy.load("en_core_web_sm")

In [43]:
def check_word_validity(word):
    if word.is_alpha and word.text.lower() not in nlp.Defaults.stop_words:
        return True
    else:
        return False


def extract_sample_from_file(file_name, character_count, start_position=0):
    possible_encodings = ['utf-8', 'latin-1', 'windows-1252']
    unprocessed_text = ''
    for encoding in possible_encodings:
        try:
            with open(file_name, 'r', encoding=encoding) as f:
                f.seek(start_position)
                text = f.read(character_count)
                unprocessed_text += text
            doc = nlp(text)
            # Check and remove the first token if it's not a valid word
            if check_word_validity(doc[0]):
                print("removing first token: ", doc[0])
                doc = doc[1:]

            # Check and remove the last token if it's not a valid word
            if check_word_validity(doc[-1]):
                print("removing last token: ", doc[-1])
                doc = doc[:-1]
            return doc, unprocessed_text
        except UnicodeDecodeError:
            continue

In [44]:



sherlock_homes_sample, unprocessed_sherlock_homes_sample = extract_sample_from_file(os.path.join("data","sherlock_homes.txt"), character_count=502, start_position=10000)
social_new_orleans_sample, unprocessed_social_new_orleans_sample = extract_sample_from_file(os.path.join("data","social_new_orleans.txt"), character_count=502, start_position=10000)
the_lindsays_sample, unprocessed_the_lindsays_sample  = extract_sample_from_file(os.path.join("data","the_lindsays.txt"), character_count=502, start_position=10000)

labeled_unprocessed_documents = {
    'sherlock_homes_sample': unprocessed_sherlock_homes_sample,
    'social_new_orleans_sample': unprocessed_social_new_orleans_sample,
    'the_lindsays_sample': unprocessed_the_lindsays_sample
}


removing first token:  G
removing first token:  utral
removing last token:  watered


In [45]:
sherlock_homes_sample

" with a small "t" woven into the texture of the paper.

"What do you make of that?" asked Holmes.

"The name of the maker, no doubt; or his monogram, rather."

"Not at all. The 'G' with the small 't' stands for 'Gesellschaft,' which is the German for 'Company.' It is a customary contraction like our 'Co.' 'P,' of course, stands for 'Papier.' Now for the 'Eg.' Let us glance at our Continental Gazetteer." He took down a heavy brown volume from his shelves. "Eglow, Eglonitz--here we are, Egria. It

In [46]:
social_new_orleans_sample

ground, for we walked, and
it was not considered far.

The Farmers’ and Traders’ Bank was on Canal Street, and the family of Mr.
Bell, the cashier, lived over the bank. There were children there and a
governess, who went fishing with us. We rarely caught anything and had no
use for it when we did.

Sometimes I was permitted to go to market with John, way down to the
old French Market. We had to start early, before the shops on Chartres
Street were open, and the boys busy with scoops

In [47]:
the_lindsays_sample

he railway
station, I noticed a stout dog-cart standing at the corner of a
by-road, under a tall, straggling thorn hedge. The youth who was seated
in it made a sign to the coachman to stop, and I was made aware that
the dog-cart had been sent for me. I got down, and as I bade good-night
to the cross-questioning farmer, I observed a grim smile of triumph on
his firmly compressed lips. He evidently knew the dog-cart, and would
now be able to trace the mysterious stranger.

I and my portmanteau were 

In [48]:
from gensim.models import LsiModel, LdaModel
from gensim import corpora

corpus = [sherlock_homes_sample, social_new_orleans_sample, the_lindsays_sample]

dictionary = corpora.Dictionary([doc.text.lower().split() for doc in corpus])

corpus_bow = [dictionary.doc2bow(doc.text.lower().split()) for doc in corpus]





In [49]:
lsa_model = LsiModel(corpus_bow, num_topics=5, id2word=dictionary)

# print all topics from the LSA model
for topic in lsa_model.print_topics():
    print(topic)


    

(0, '0.604*"the" + 0.312*"and" + 0.261*"a" + 0.251*"to" + 0.222*"i" + 0.189*"of" + 0.188*"for" + 0.156*"was" + 0.147*"we" + 0.141*"it"')
(1, '-0.319*"and" + -0.247*"to" + 0.239*"of" + 0.229*"for" + -0.192*"i" + 0.163*"is" + 0.163*"our" + 0.163*"stands" + 0.163*"small" + -0.159*"was"')
(2, '-0.370*"i" + 0.289*"we" + -0.274*"a" + 0.226*"with" + -0.147*"dog-cart" + -0.147*"made" + 0.141*"there" + -0.139*"he" + 0.128*"and" + 0.097*"for"')


In [50]:
lda_model = LdaModel(corpus_bow, num_topics=5)

# print all topics from the LDA model
for topic in lda_model.print_topics():
    print(topic)

(0, '0.054*"59" + 0.037*"68" + 0.029*"14" + 0.027*"113" + 0.022*"93" + 0.019*"63" + 0.016*"118" + 0.016*"29" + 0.016*"49" + 0.016*"65"')
(1, '0.017*"59" + 0.010*"65" + 0.010*"49" + 0.010*"29" + 0.009*"14" + 0.009*"40" + 0.008*"63" + 0.008*"51" + 0.008*"36" + 0.008*"26"')
(2, '0.060*"59" + 0.029*"68" + 0.028*"93" + 0.027*"14" + 0.025*"113" + 0.022*"49" + 0.018*"29" + 0.017*"118" + 0.014*"40" + 0.012*"63"')
(3, '0.023*"59" + 0.017*"49" + 0.015*"29" + 0.013*"14" + 0.011*"55" + 0.011*"39" + 0.011*"18" + 0.011*"36" + 0.010*"40" + 0.010*"65"')
(4, '0.053*"59" + 0.024*"29" + 0.019*"49" + 0.019*"68" + 0.018*"113" + 0.017*"14" + 0.017*"65" + 0.016*"40" + 0.013*"63" + 0.012*"118"')


In [51]:
from gensim import models

tfidf = models.TfidfModel(corpus_bow)



In [52]:
from gensim import similarities


index = similarities.SparseMatrixSimilarity(tfidf[corpus_bow], num_features=len(dictionary))

In [53]:
query = "straggling thorn hedge"

processed_query = nlp(query)
query_bow = dictionary.doc2bow(processed_query.text.split())

# Transform the query into the same vector space as the documents for Top2Vec
query_vector_gensim = lsa_model[query_bow] 





In [54]:
# Calculate similarity between the query and each document for LSA
similarity_scores_lsa = index[query_bow]

most_similar_index_lsa = similarity_scores_lsa.argmax()

most_similar_sample = list(labeled_unprocessed_documents.keys())[most_similar_index_lsa]
most_similar_document_lsa = labeled_unprocessed_documents[most_similar_sample]

print("Most Similar Document (LSA):")
print(f"Sample: {most_similar_sample}")
print(f"Document: {most_similar_document_lsa}")

Most Similar Document (LSA):
Sample: sherlock_homes_sample
Document: G" with a small "t" woven into the texture of the paper.

"What do you make of that?" asked Holmes.

"The name of the maker, no doubt; or his monogram, rather."

"Not at all. The 'G' with the small 't' stands for 'Gesellschaft,' which is the German for 'Company.' It is a customary contraction like our 'Co.' 'P,' of course, stands for 'Papier.' Now for the 'Eg.' Let us glance at our Continental Gazetteer." He took down a heavy brown volume from his shelves. "Eglow, Eglonitz--here we are, Egria. It 
