## Libraries

In [3]:
import numpy as np
from gensim import corpora, models
import spacy
from spacy.lang.en import stop_words
from tqdm import tqdm

# !python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

## Preprocess text

In [15]:
# https://spacy.io/usage/linguistic-features


def preprocess_text(file_path, nlp, stop_word_removal=True, non_alpha_removal=True, lemmatization=True, lowercasing=True, additional_stop_words=[]):
    stop_words_to_use = list(stop_words.STOP_WORDS)
    stop_words_to_use += additional_stop_words

    documents = []
    with open(file_path, "r", encoding="latin") as f:
        for blog in tqdm(f):
            doc = nlp(blog.strip()) # Convert to spaCy doc

            if len(doc) > 1:
                if non_alpha_removal: # Remove non alpha characters
                    doc = [token for token in doc if token.is_alpha]

                if lemmatization: # Lemmatize words
                    doc = [token.lemma_ for token in doc]

                if stop_word_removal: # Remove stop words
                    doc = [token for token in doc if token not in stop_words_to_use]

                if lowercasing: # Lowercase words
                    doc = [token.lower() for token in doc]

                documents.append(doc)

    # https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    dictionary.save("dictionary2.dict")
    corpora.MmCorpus.serialize("corpus2.mm", corpus)

additional_stop_words = ["ride", "day", "bike", "road", "get", "go", "mile", "km", "metre", "like", "way", "good", "come", "look", "nice", "think", "trip", "know", "see", "great", "today"]
preprocess_text("blogs.txt", nlp, additional_stop_words=additional_stop_words)


2196it [02:27, 14.93it/s]


True


## Explore dictionary

In [16]:
# https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html

dictionary = corpora.Dictionary.load("dictionary2.dict")
corpus = corpora.MmCorpus("corpus2.mm")

word_freq = {k: v for k, v in sorted(dictionary.cfs.items(), key=lambda item: item[1], reverse=True)}
for id in list(word_freq.keys())[:100]:
    print(dictionary[id])

i
time
stop
town
find
little
head
hill
start
pass
night
turn
leave
place
park
long
lot
people
hour
climb
rain
feel
big
right
eat
want
wind
tell
bit
thing
camp
pretty
decide
water
stay
route
river
end
new
home
use
close
work
dinner
old
meet
lunch
try
morning
rest
area
city
car
walk
rt
tent
need
talk
ask
run
store
bad
trail
hot
arrive
far
couple
guy
food
small
south
set
mountain
wait
year
minute
flat
spend
room
sleep
shower
early
campground
traffic
lake
tour
point
sit
north
away
continue
finally
state
drive
tomorrow
cross
house
later
tire
west


## Modelling

In [21]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# # https://radimrehurek.com/gensim/models/ldamodel.html
lda = models.LdaModel(corpus_tfidf, num_topics=10)

topics = lda.get_topics()
for topic in range(10):
    topic_probs = topics[topic, :]
    print(f"Topic {topic}: {', '.join([dictionary[i] for i in np.argsort(topic_probs)[-10:]])}")



Topic 0: town, wind, park, time, pass, city, climb, rain, rt, i
Topic 1: area, route, trail, town, train, climb, time, park, total, i
Topic 2: stop, camp, lake, pass, rain, rt, town, hill, campground, i
Topic 3: stop, turn, night, thank, tent, hill, town, rain, rt, i
Topic 4: town, route, park, wind, pass, height, river, little, climb, i
Topic 5: park, hill, time, pass, campground, find, place, town, rt, i
Topic 6: river, town, pass, stop, little, rain, head, night, total, i
Topic 7: time, town, place, little, tent, pass, hill, stop, rt, i
Topic 8: pretty, pj, hill, time, guy, river, tomorrow, town, rain, i
Topic 9: climb, rain, water, rt, head, town, park, time, trail, i
