In [49]:
import spacy
import os 
import gensim

nlp = spacy.load("en_core_web_sm")

In [50]:
def check_word_validity(word):
    if word.is_alpha and word.text.lower() not in nlp.Defaults.stop_words:
        return True
    else:
        return False


def extract_sample_from_file(file_name, character_count, start_position=0):
    possible_encodings = ['utf-8', 'latin-1', 'windows-1252']
    unprocessed_text = ''
    for encoding in possible_encodings:
        try:
            with open(file_name, 'r', encoding=encoding) as f:
                f.seek(start_position)
                text = f.read(character_count)
                unprocessed_text += text
            doc = nlp(text)
            # Check and remove the first token if it's not a valid word
            if check_word_validity(doc[0]):
                print("removing first token: ", doc[0])
                doc = doc[1:]

            # Check and remove the last token if it's not a valid word
            if check_word_validity(doc[-1]):
                print("removing last token: ", doc[-1])
                doc = doc[:-1]
            return doc, unprocessed_text
        except UnicodeDecodeError:
            continue

In [51]:



sherlock_homes_sample, unprocessed_sherlock_homes_sample = extract_sample_from_file(os.path.join("data","sherlock_homes.txt"), character_count=502, start_position=10000)
social_new_orleans_sample, unprocessed_social_new_orleans_sample = extract_sample_from_file(os.path.join("data","social_new_orleans.txt"), character_count=502, start_position=10000)
the_lindsays_sample, unprocessed_the_lindsays_sample  = extract_sample_from_file(os.path.join("data","the_lindsays.txt"), character_count=502, start_position=10000)

labeled_unprocessed_documents = {
    'sherlock_homes_sample': unprocessed_sherlock_homes_sample,
    'social_new_orleans_sample': unprocessed_social_new_orleans_sample,
    'the_lindsays_sample': unprocessed_the_lindsays_sample
}


removing first token:  G
removing last token:  mon
removing first token:  n
removing last token:  uncertai


In [52]:
sherlock_homes_sample

" with a small "t" woven into the texture of the paper.

"What do you make of that?" asked Holmes.

"The name of the maker, no doubt; or his monogram, rather."

"Not at all. The 'G' with the small 't' stands for 'Gesellschaft,' which is the German for 'Company.' It is a customary contraction like our 'Co.' 'P,' of course, stands for 'Papier.' Now for the 'Eg.' Let us glance at our Continental Gazetteer." He took down a heavy brown volume from his shelves. "Eglow, Eglonitz--here we are, Egria. It

In [53]:
social_new_orleans_sample

or it when we did.

Sometimes I was permitted to go to market with John, way down to the
old French Market. We had to start early, before the shops on Chartres
Street were open, and the boys busy with scoops watered the roadway from
brimming gutters. John and I hurried past. Once at market we rushed from
stall to stall, filling our basket, John forgetting nothing that had been
ordered, and always carefully remembering one most important item, the
saving of at least a picayune out of the market

In [54]:
the_lindsays_sample

, and as I bade good-night
to the cross-questioning farmer, I observed a grim smile of triumph on
his firmly compressed lips. He evidently knew the dog-cart, and would
now be able to trace the mysterious stranger.

I and my portmanteau were finally left on the side of the road, and
the young man in the dog-cart civilly turned the vehicle round (with
some difficulty on account of the narrow road), and drew up beside me,
to save my carrying my luggage a dozen yards. At first I was a little

In [55]:
from gensim.models import LsiModel, LdaModel
from gensim import corpora

corpus = [sherlock_homes_sample, social_new_orleans_sample, the_lindsays_sample]




In [56]:
from gensim import models
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

data_words = [[token.text for token in doc] for doc in corpus]

# Build the bigram and trigram models
bigram = models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])


def remove_stopwords(texts):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words)

# lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


['"', 'with', 'a', 'small', '"', 't', '"', 'woven', 'into', 'the', 'texture', 'of', 'the', 'paper', '.', '\n\n', '"', 'What', 'do', 'you', 'make', 'of', 'that', '?', '"', 'asked', 'Holmes', '.', '\n\n', '"', 'The', 'name', 'of', 'the', 'maker', ',', 'no', 'doubt', ';', 'or', 'his', 'monogram', ',', 'rather', '.', '"', '\n\n', '"', 'Not', 'at', 'all', '.', 'The', "'", 'G', "'", 'with', 'the', 'small', "'", 't', "'", 'stands', 'for', "'", 'Gesellschaft', ',', "'", 'which', 'is', 'the', 'German', 'for', "'", 'Company', '.', "'", 'It', 'is', 'a', 'customary', 'contraction', 'like', 'our', "'", 'Co.', "'", "'", 'P', ',', "'", 'of', 'course', ',', 'stands', 'for', "'", 'Papier', '.', "'", 'Now', 'for', 'the', "'", 'Eg', '.', "'", 'Let', 'us', 'glance', 'at', 'our', 'Continental', 'Gazetteer', '.', '"', 'He', 'took', 'down', 'a', 'heavy', 'brown', 'volume', 'from', 'his', 'shelves', '.', '"', 'Eglow', ',', 'Eglonitz', '--', 'here', 'we', 'are', ',', 'Egria', '.', 'It']


In [57]:


dictionary = corpora.Dictionary(data_lemmatized)
corpus_bow = [dictionary.doc2bow(doc) for doc in data_lemmatized]


lsa_model = LsiModel(corpus_bow, num_topics=5, id2word=dictionary)

# print all topics from the LSA model
for topic in lsa_model.print_topics():
    print(topic)


    

(0, '0.273*"cart" + 0.273*"road" + 0.273*"dog" + 0.150*"now" + 0.137*"compress" + 0.137*"account" + 0.137*"carry" + 0.137*"good" + 0.137*"lip" + 0.137*"dozen"')
(1, '0.425*"market" + 0.168*"at" + 0.142*"carefully" + 0.142*"shop" + 0.142*"stall" + 0.142*"way" + 0.142*"french" + 0.142*"item" + 0.142*"important" + 0.142*"permit"')
(2, '0.299*"stand" + 0.299*"small" + 0.299*"t" + 0.150*"customary" + 0.150*"volume" + 0.150*"all" + 0.150*"let" + 0.150*"here" + 0.150*"maker" + 0.150*"paper"')


In [58]:
lda_model = LdaModel(corpus_bow, num_topics=5)

# print all topics from the LDA model
for topic in lda_model.print_topics():
    print(topic)

(0, '0.029*"104" + 0.023*"82" + 0.022*"77" + 0.017*"21" + 0.016*"92" + 0.016*"103" + 0.016*"84" + 0.016*"90" + 0.015*"73" + 0.015*"87"')
(1, '0.023*"28" + 0.021*"27" + 0.018*"53" + 0.017*"29" + 0.017*"2" + 0.015*"30" + 0.014*"0" + 0.014*"6" + 0.014*"26" + 0.014*"31"')
(2, '0.026*"77" + 0.025*"82" + 0.017*"104" + 0.017*"76" + 0.016*"88" + 0.015*"111" + 0.015*"89" + 0.015*"98" + 0.015*"114" + 0.015*"93"')
(3, '0.011*"29" + 0.010*"28" + 0.010*"27" + 0.010*"12" + 0.010*"6" + 0.010*"21" + 0.010*"1" + 0.010*"23" + 0.010*"32" + 0.010*"11"')
(4, '0.035*"53" + 0.020*"2" + 0.016*"29" + 0.014*"55" + 0.014*"27" + 0.014*"66" + 0.014*"67" + 0.014*"47" + 0.014*"64" + 0.014*"68"')


In [59]:
from gensim import models

tfidf = models.TfidfModel(corpus_bow)



In [60]:
from gensim import similarities


index = similarities.SparseMatrixSimilarity(tfidf[corpus_bow], num_features=len(dictionary))

In [61]:
query = "straggling thorn hedge"

processed_query = nlp(query)
query_bow = dictionary.doc2bow(processed_query.text.split())

# Transform the query into the same vector space as the documents for Top2Vec
query_vector_gensim = lsa_model[query_bow] 





In [62]:
# Calculate similarity between the query and each document for LSA
similarity_scores_lsa = index[query_bow]

most_similar_index_lsa = similarity_scores_lsa.argmax()

most_similar_sample = list(labeled_unprocessed_documents.keys())[most_similar_index_lsa]
most_similar_document_lsa = labeled_unprocessed_documents[most_similar_sample]

print("Most Similar Document (LSA):")
print(f"Sample: {most_similar_sample}")
print(f"Document: {most_similar_document_lsa}")

Most Similar Document (LSA):
Sample: sherlock_homes_sample
Document: G" with a small "t" woven into the texture of the paper.

"What do you make of that?" asked Holmes.

"The name of the maker, no doubt; or his monogram, rather."

"Not at all. The 'G' with the small 't' stands for 'Gesellschaft,' which is the German for 'Company.' It is a customary contraction like our 'Co.' 'P,' of course, stands for 'Papier.' Now for the 'Eg.' Let us glance at our Continental Gazetteer." He took down a heavy brown volume from his shelves. "Eglow, Eglonitz--here we are, Egria. It 
