## Libraries

In [8]:
import numpy as np
import pandas as pd
from gensim import corpora, models
import spacy
from spacy.lang.en import stop_words
from tqdm import tqdm
from transformers import pipeline

tqdm.pandas()

# !python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

## Preprocess text

### Filter out non-English documents

In [6]:
# https://huggingface.co/papluca/xlm-roberta-base-language-detection
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)

df_blogs = pd.DataFrame(columns=["blog", "lang"])
with open("blogs.txt", "r", encoding="latin") as f:
  i = 0
  for blog in tqdm(f):
    blog = blog.strip()
    lang = pipe(blog, truncation=True)

    df_blogs.loc[i, "blog"] = blog
    df_blogs.loc[i, "lang"] = lang[0]["label"]

    i += 1

df_blogs = df_blogs[df_blogs["lang"] == "en"]
df_blogs.to_csv("blogs.csv")

Device set to use cpu


### Apply standard preprocessing

In [None]:
# https://spacy.io/usage/linguistic-features


def preprocess_text(file_path, nlp, stop_word_removal=True, non_alpha_removal=True, lemmatization=True, lowercasing=True, additional_stop_words=[]):
    stop_words_to_use = list(stop_words.STOP_WORDS)
    stop_words_to_use += additional_stop_words

    df_blogs = pd.read_csv(file_path)
    documents = []

    for i, row in tqdm(df_blogs.iterrows()):
        doc = nlp(row["blog"]) # Convert to spaCy doc

        if len(doc) > 1:
            if non_alpha_removal: # Remove non alpha characters
                doc = [token for token in doc if token.is_alpha]

            if lemmatization: # Lemmatize words
                doc = [token.lemma_ for token in doc]

            if stop_word_removal: # Remove stop words
                doc = [token for token in doc if token not in stop_words_to_use]

            if lowercasing: # Lowercase words
                doc = [token.lower() for token in doc]

            documents.append(doc)

    # https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html
    dictionary = corpora.Dictionary(documents)
    dictionary.filter_extremes(no_below=2) # Filter out tokens appearing only once
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    dictionary.save("dictionary.dict")
    corpora.MmCorpus.serialize("corpus.mm", corpus)

additional_stop_words = ["ride", "day", "bike", "road", "get", "go", "mile", "km", "metre", "like", "way", "good", "come", "look", "nice", "think", "trip", "know", "see", "great", "today"]
preprocess_text("blogs.csv", nlp, additional_stop_words=additional_stop_words)


290it [00:28,  7.81it/s]

## Explore dictionary

In [None]:
# https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html
dictionary = corpora.Dictionary.load("dictionary.dict")
corpus = corpora.MmCorpus("corpus.mm")

word_freq = {k: v for k, v in sorted(dictionary.cfs.items(), key=lambda item: item[1], reverse=True)}
for id in list(word_freq.keys())[:100]:
    print(dictionary[id])

i
time
stop
town
find
little
head
hill
start
pass
night
turn
leave
place
park
long
lot
people
hour
climb
rain
feel
big
right
eat
want
wind
tell
bit
thing
camp
pretty
decide
water
stay
route
river
end
new
home
use
close
work
dinner
old
meet
lunch
try
morning
rest
area
city
car
walk
rt
tent
need
talk
ask
run
store
bad
trail
hot
arrive
far
couple
guy
food
small
south
set
mountain
wait
year
minute
flat
spend
room
sleep
shower
early
campground
traffic
lake
tour
point
sit
north
away
continue
finally
state
drive
tomorrow
cross
house
later
tire
west


## Modelling

In [None]:
import logging
import re

# Allow logging
logging.basicConfig(filename='gensim.log', format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)


def clear_logfile():
    # To empty the log file
    with open("gensim.log", "w"):
        pass


def parse_logfile():
    """Parse gensim.log to extract the log-likelihood scores.

    Returns:
        A list of log-likelihood scores.
    """
    matcher = re.compile(r'(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity')
    likelihoods = []
    with open('gensim.log') as source:
        for line in source:
            match = matcher.search(line)
            if match:
                likelihoods.append(float(match.group(1)))
    return likelihoods

In [None]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# # https://radimrehurek.com/gensim/models/ldamodel.html
clear_logfile()
lda = models.LdaModel(corpus_tfidf, num_topics=10)
likelihoods = parse_logfile()

# Plot likelihoods for convergence
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(likelihoods)
ax.set(title="LDA Convergence", xlabel="Iteration", ylabel="Log-Likelihood")

# Print most important words per topic
topics = lda.get_topics()
for topic in range(10):
    topic_probs = topics[topic, :]
    print(f"Topic {topic}: {', '.join([dictionary[i] for i in np.argsort(topic_probs)[-10:]])}")



Topic 0: town, wind, park, time, pass, city, climb, rain, rt, i
Topic 1: area, route, trail, town, train, climb, time, park, total, i
Topic 2: stop, camp, lake, pass, rain, rt, town, hill, campground, i
Topic 3: stop, turn, night, thank, tent, hill, town, rain, rt, i
Topic 4: town, route, park, wind, pass, height, river, little, climb, i
Topic 5: park, hill, time, pass, campground, find, place, town, rt, i
Topic 6: river, town, pass, stop, little, rain, head, night, total, i
Topic 7: time, town, place, little, tent, pass, hill, stop, rt, i
Topic 8: pretty, pj, hill, time, guy, river, tomorrow, town, rain, i
Topic 9: climb, rain, water, rt, head, town, park, time, trail, i
