## Set up

In [31]:
from pathlib import Path
from pprint import pprint

import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [32]:
# Load the cleaned dataset; later cells read its "cleaned_text" column.
data = pd.read_csv(filepath_or_buffer="data/cleaned_data1.csv")

  data = pd.read_csv("data/cleaned_data1.csv")


## Preprocessing

Tokenizer

In [33]:
# Break every document into word tokens (runs of \w characters),
# which drops punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')
data["tokenized"] = data["cleaned_text"].apply(tokenizer.tokenize)

Lemmatizer & POS filtering

In [34]:
#https://www.h2kinfosys.com/blog/part-of-speech-tagging-chunking-with-nltk/
def get_wordnet_pos(tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Only the first letter of the tag matters: 'J' (JJ, JJR, JJS) maps to
    adjectives, 'N' to nouns and 'R' to adverbs. Any other tag, including
    an empty one, yields None so the caller can filter the token out.
    """
    initial = tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    # Everything else (verbs, numerals, determiners, ...) is not mapped.
    return None
    
# One lemmatizer instance is shared by every call below.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """POS-tag the tokens and lemmatize the ones with a WordNet-mappable tag.

    Tokens whose tag get_wordnet_pos maps to None are dropped, so this
    step also acts as the part-of-speech filter.
    """
    lemmas = []
    for word, tag in pos_tag(tokens):
        wn_pos = get_wordnet_pos(tag)
        if wn_pos:
            lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return lemmas

data["lemmatized"] = data["tokenized"].apply(lemmatize_tokens)

Stop words

In [45]:
#Custom defined list of general English stop words: function words, single
#letters, contractions written without apostrophes, and chat filler.
#Entries are repeated in places; that is harmless for membership tests.
english = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", 
    "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "cannot", "could", "did", 
    "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", 
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "it", 
    "its", "itself", "let", "me", "more", "most", "must", "my", "myself", "no", "nor", "not", "of", "off", "on", 
    "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "some", "such", 
    "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", 
    "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which", 
    "while", "who", "whom", "why", "with", "would", "you", "your", "yours", "yourself", "yourselves", "will", "ll", 
    "re", "ve", "d", "s", "m", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", 
    "s", "t", "u", "v", "w", "x", "y", "z", "many", "us", "ok", "hows", "ive", "ill", "im", "cant", "topics", "topic",
    "discuss", "thoughts", "yo", "thats", "whats", "lets", "nothing", "oh", "omg", 
         "things", "stuff", "yall", "haha", "yes", "no", "wo", "like", 'good', 
         'work', 'got', 'going', 'dont', 'really', 'want', 'make', 'think', 
         # "lot" and "great" must be separate entries: without the comma Python
         # concatenates adjacent string literals into the single word "lotgreat".
         'know', 'feel', 'people', 'life', "getting", "lot", "great", "i", "me", 
         "my", "myself", "we", "our", "ours", "ourselves", 
        "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", 
        "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
        "they", "them", "their", "theirs","themselves", "what", "which", "who", 
        "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
        "were", "be", "been", "being", "have", "has", "had", "having", "do", 
        "does", "did", "doing", "will", "would", "should", "can", "could", "may",
        "might", "must", "shall", "ought", "about", "above", "across", "after", 
        "against", "along", "amid", "among", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "between", "beyond", "but", "by", 
        "concerning", "considering", "despite", "down", "during", "except", "for",
        "from", "in", "inside", "into", "like", "near", "next", "notwithstanding",
        "of", "off", "on", "onto", "opposite", "out", "outside", "over", "past",
        "regarding", "round", "since", "than", "through", "throughout", "till", 
        "to", "toward", "towards", "under", "underneath", "unlike", "until", "up",
        "upon", "versus", "via", "with", "within", "without", "cant", "cannot", 
        "couldve", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt", 
        "havent", "hed", "hell", "hes", "howd", "howll", "hows", "id", "ill", 
        "im", "ive", "isnt", "itd", "itll", "its", "lets", "mightve", "mustve", 
        "mustnt", "shant", "shed", "shell", "shes", "shouldve", "shouldnt", 
        "thatll", "thats", "thered", "therell", "therere", "theres", "theyd", 
        "theyll", "theyre", "theyve", "wed", "well", "were", "weve", "werent", 
        "whatd", "whatll", "whatre", "whats", "whatve", "whend", "whenll", 
        "whens", "whered", "wherell", "wheres", "whichd", "whichll", "whichre", 
        "whichs", "whod", "wholl", "whore", "whos", "whove", "whyd", "whyll", 
        "whys", "wont", "wouldve", "wouldnt", "youd", "youll", "youre", "youve",
        "f", "m", "because", "go", "lot", "get", "still", "way", "something", "much",
        "thing", "someone", "person", "anything", "goes", "ok", "so", "just", "mostly", 
        "put", "also", "lots", "yet", "ha", "etc", "wasnt", "yeah", "okay", "lol",
        "gt", "cuz", 'id']

# Platform and chat boilerplate: greetings, subreddit/moderation terms and
# profanity. Some entries are repeated (e.g. "aww", "hello").
reddit = ["welcome", "hi", "hello", "sub", "reddit", "thanks", "thank", "maybe",
          "wo30", "mods", "mod", "moderators", "subreddit", "btw", "aw", "aww", 
          "aww", "hey", "hello", "join", "joined", "post", "op", "fuck", "shit"]

# Generic vocabulary for this corpus: variants of the subreddit name, time
# expressions, vague evaluative words and abstract nouns.
# NOTE(review): presumably chosen by inspecting earlier topic output - confirm.
# Several entries are repeated within this list and across the other lists.
topic_specific = ["self", "improvement", "selfimprovement", "rselfimprovement", 
    'even', 'time', 'bad', 'best', 'never', 'well', 'hard', 'always', 'help', 'first',
    'back', 'problem', 'right', 'try', 'need', 'actually', 'everything', 'long', 'sure', 
    'care', 'look', 'everyone', 'enough', 'else', 'great', 'point', 'kind', 
    'advice', 'now', 'year', 'old', 'month', 'young', 'age', 'last', 'later', 
    'ago', 'late', 'future', 'finally', 'early', 'bc', 'decade', 
    'tbh', 'spent', 'almost', 'ur', 'others', 'experience', 'change', 'world', 'different', 
    'important', 'positive', 'negative', 'value', 'question', 'moment', 'often', 'way', 
    'instead', 'situation', 'rather', 'understand', 'personal', 'practice', 'start', 
    'week', 'small', 'hour', 'use', 'easy', 'minute', 'break', 'daily', 'night', 'today', 
    'everyday', 'list', 'morning', 'task', 'spend',
    'everybody', 'everyone', 'someone', 'somebody', 'anything', 'everything', 'nothing', 'something',
    'good', 'great', 'fine', 'better', 'best', 'okay', 'alright', 'just', 'only', 'simply', 'actually', 
    'careful', 'attentive', 'caring', 'concerned', 
    'positive', 'negative', 
    'important', 'necessary', 'essential', 'key', 'significant', 'irrelevant', 
    'unimportant', 'worthwhile', 'change', 'alter', 'modify', 'adjust', 'shift', 'transform', 
    'problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'struggle', 'complication', 'solution', 
    'answer', 'remedy', 'plan', 'idea', 'approach', 'method', 'strategy', 'option', 'choice', 'decision', 
    'situation', 'circumstance', 'condition', 'event', 'experience', 'case', 'example', 'scenario', 
     'case', 'step', 'action', 'move', 'process', 'procedure', 'approach', 'method', 'result', 'outcome', 'consequence', 
    'conclusion', 'end', 'finish', 'start', 'begin', 'initiate', 'open', 'launch', 'beginning', 'commencement',
    'day', 'one', 'happy', 'little', 'big', 'probably', 'reason', 'able', 'away', 'sometimes', 
    'ever', 'matter', 'real', 'bit', 'already', 'anyone', 'especially', 'definitely', 
    'normal', 'whole', 'comment', 'honestly', 'completely', 'literally', 'sorry', 'eventually', 
    'day', 'part']

# Full stop-word list: plain concatenation of the three lists above, so
# duplicates are kept.
stop_words = english + reddit + topic_specific

In [46]:
#Remove stopwords
# Build a hashed lookup once. Testing each token against the stop-word list
# itself scans several hundred entries per token; a frozenset gives the same
# answer in constant time.
stop_word_lookup = frozenset(stop_words)

def remove_stopwords(tokens):
    """Return the tokens that are not stop words, in their original order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        A new list without any token found in ``stop_words``; an empty
        input gives an empty list.
    """
    return [token for token in tokens if token not in stop_word_lookup]

# Apply function to column
data["filtered"] = data["lemmatized"].apply(remove_stopwords)

Bigrams

In [47]:
# Learn bigrams from the stop-word-filtered tokens; min_count=20 is passed to
# Phrases as the minimum frequency for a pair to be considered.
bigram = Phrases(data["filtered"], min_count=20)
# Phraser is the lighter-weight version of the trained Phrases model and is
# what gets applied to each document below.
bigram_phraser = Phraser(bigram)

def add_bigrams(tokens):
    """Return the tokens followed by every bigram detected in them.

    The original unigrams are kept; tokens of the phrased document that
    contain an underscore are appended after them.
    """
    # NOTE(review): any token that already contains '_' before phrasing is
    # appended a second time here - confirm whether that is intended.
    detected = [phrased for phrased in bigram_phraser[tokens] if '_' in phrased]
    return tokens + detected

data["ngrams"] = data["filtered"].apply(add_bigrams)

Filtering of too rare or too common words

In [48]:
# gensim works on plain Python lists, so pull the n-gram column out of the
# DataFrame as a list of token lists.
docs = data["ngrams"].to_list()

# Map every distinct token in the documents to an integer id.
dictionary = Dictionary(documents=docs)

# Prune the vocabulary in place: drop tokens found in fewer than 20 documents
# or in more than 80% of them, then keep only the 1500 most frequent.
dictionary.filter_extremes(
    no_below=20,
    no_above=0.8,
    keep_n=1500,
)


Vectorize

In [49]:
# Turn each document into its bag-of-words vector using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, docs))

Save to disk

In [50]:
# Save corpus and dictionary together
#with open('corpus_and_dict.pkl', 'wb') as f:
    #pickle.dump((corpus, dictionary), f)


## Training

### Test ?

In [51]:
#Set hyperparameters
num_topics = 12
chunksize = 2500 
passes = 25
iterations = 75 
eval_every = None  # passed through to LdaModel; None skips perplexity evaluation during training
minimum_probability = 0.05  
alpha = 'auto'
eta = 'auto'

# Single place to name the output file for this run.  #CHANGE
model_path = Path("lda_models") / "model33"

# Make an index to word dictionary.
# The id -> token mapping is built lazily; indexing the dictionary once
# forces it to be populated before id2token is read.
temp = dictionary[0]  
id2word = dictionary.id2token

#Run  #CHANGE FIRST LINE
model = LdaModel( 
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha=alpha,
    eta=eta,
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state = 21,
    minimum_probability=minimum_probability
)

# Save straight after training, creating the target directory if needed, so
# that a failure during evaluation or a missing folder cannot throw away
# the trained model.
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(str(model_path))

#Evaluate
top_topics = model.top_topics(corpus)  #CHANGE
pprint(top_topics)
cm = CoherenceModel(model=model, corpus=corpus, texts=docs, coherence='c_v', dictionary=dictionary) #CHANGE
print(cm.get_coherence())  

[([(0.01673114, 'relationship'),
   (0.01002331, 'place'),
   (0.009720163, 'less'),
   (0.009174838, 'wrong'),
   (0.008237542, 'man'),
   (0.008065588, 'love'),
   (0.007889548, 'pretty'),
   (0.007883921, 'least'),
   (0.0076255067, 'luck'),
   (0.007124511, 'human'),
   (0.006962437, 'true'),
   (0.006673605, 'fact'),
   (0.006646849, 'far'),
   (0.005936727, 'sense'),
   (0.0058509265, 'high'),
   (0.005833315, 'strong'),
   (0.0058320663, 'possible'),
   (0.0055662915, 'effort'),
   (0.005368156, 'worth'),
   (0.005330639, 'journey')],
  -2.6326552685957427),
 ([(0.07407926, 'body'),
   (0.05337779, 'gym'),
   (0.04117402, 'weight'),
   (0.035161953, 'food'),
   (0.03400926, 'exercise'),
   (0.03127595, 'healthy'),
   (0.026448587, 'sleep'),
   (0.01984232, 'water'),
   (0.01888247, 'diet'),
   (0.018713903, 'bed'),
   (0.017847773, 'walk'),
   (0.017391158, 'routine'),
   (0.017247967, 'hair'),
   (0.015878394, 'muscle'),
   (0.015202962, 'fat'),
   (0.014135354, 'workout'),
   

### LDAVis

In [52]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Recompute the per-topic word lists for the model being visualised.
top_topics = model.top_topics(corpus)

# Build the visualisation data from the model, corpus and dictionary;
# the final expression renders it as the cell output.
lda_viz = gensimvis.prepare(topic_model=model, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(lda_viz)



### Check topic probability distribution