In [1]:
from bs4 import BeautifulSoup
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import nltk
import numpy as np
import pandas as pd
import re
import wandb

In [2]:
# Read the data
df_comments1 = pd.read_pickle('./pickle_dataframes/comments1.pkl')
df_comments2 = pd.read_pickle('./pickle_dataframes/comments2.pkl')
df_comments = pd.concat([df_comments1,df_comments2])
df_comments.reset_index(drop=True, inplace=True)

df_posts1 = pd.read_pickle('./pickle_dataframes/posts1.pkl')
df_posts2 = pd.read_pickle('./pickle_dataframes/posts2.pkl')
df_posts3 = pd.read_pickle('./pickle_dataframes/posts3.pkl')
df_posts = pd.concat([df_posts1, df_posts2, df_posts3])
df_posts.reset_index(drop=True, inplace=True)

df_postlinks = pd.read_pickle('./pickle_dataframes/posts_links.pkl')
df_tags = pd.read_pickle('./pickle_dataframes/tags.pkl')
df_users = pd.read_pickle('./pickle_dataframes/users.pkl')

### Take a look at our DFs

In [3]:
df_comments.head()

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
0,1,1,9,Is it fair to inquire about the disadvantages ...,2012-12-04 22:00:00.933,28
1,3,1,3,"I could have reformulated the question, but at...",2012-12-04 22:02:37.737,18
2,7,2,2,Source on this? I don't see how it could possi...,2012-12-04 22:10:10.070,45
3,13,2,1,@Nick122 In a parliamentary system like the No...,2012-12-04 22:14:33.463,43
4,15,2,0,"Yes, but you will give a negative vote by voti...",2012-12-04 22:16:29.437,45


In [4]:
df_posts.head()

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount
0,1,1,-1,5,2012-12-04 21:40:29.743,42,8309,<p>We all know the situation could arise in th...,18,2019-06-29 09:18:38.430,What are the disadvantages of first-past-the-p...,<election><voting-systems><first-past-the-post>,3,3
1,2,1,-1,19,2012-12-04 21:53:18.800,26,7832,<p>I've heard that mathematically it can be sh...,21,2017-05-03 13:53:26.063,Why can't voting be fair if there are more tha...,<voting><political-theory><voting-systems>,4,3
2,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1
3,5,2,1,-1,2012-12-04 21:58:39.037,47,-1,<p>Simple plurality voting has very little in ...,8,2012-12-04 22:04:42.767,Comment: N/A,Comment: N/A,-1,1
4,6,1,-1,28,2012-12-04 21:58:47.500,46,68096,<p>Living in a country where mandatory voting ...,18,2019-02-03 17:38:05.237,What are the advantages/disadvantages of a man...,<voting><voting-systems>,8,5


### Filtering Posts

In [5]:
questions_df = df_posts[df_posts['PostTypeId'] == 1]

# questions_sample_df = questions_df.sample(frac=0.25)
questions_df = questions_df.sample(frac=0.1)

### Preprocess text

In [6]:
# Modify preprocess_text function
def preprocess_text(text, remove_stopwords=False, use_lemmatize=True, use_stemmer=False):
    # Decode HTML entities
    text = html.unescape(text)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    words = text.split()
    if remove_stopwords:
        words = [word for word in words if word not in stopwords.words('english')]
    if use_lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    elif use_stemmer:  # Apply stemming only if use_stemmer is True
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    text = ' '.join(words)
    
    return text

### WandB Timeeee

In [7]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms223730[0m ([33mdeeplearning-02456[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Define apply_lda_and_log function with run_name parameter
def apply_lda_and_log(df, remove_stopwords, use_lemmatize, use_stemmer, tags_weighting, run_name):
    # Start a new WandB run with the specified name
    wandb.init(project="stackexchange_politics", name=run_name)
    
    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA for different numbers of topics
    for n_topics in [5, 10, 15, 20]:
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Log Coherence and Perplexity Score
        wandb.log({"coherence_score": coherence_lda, "perplexity_score": lda.perplexity(tfidf_matrix)})
        
        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # Create a WandB Table with top words data
        columns = ["Topic"] + [f"Word {i+1}" for i in range(10)]
        top_words_table = wandb.Table(data=top_words_data, columns=columns)
        
        # Log the table to WandB
        wandb.log({f"n_topics_{n_topics}_cleaned_{str(remove_stopwords)}_lemmatize_{str(use_lemmatize)}_weight_{tags_weighting}": top_words_table})

    
    # Close WandB run
    wandb.finish()

# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log(df, remove_stopwords, use_lemmatize, use_stemmer, tags_weighting, run_name):
    # Start a new WandB run with the specified name
    wandb.init(project="stackexchange_politics", entity="s223730", name=run_name)
    # Make sure the script runs in the correct WandB project
    print(wandb.run.project_name())

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    for n_topics in [5, 10, 15, 20]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Log Coherence and Perplexity Score
        wandb.log({"coherence_score": coherence_lda, "perplexity_score": lda.perplexity(tfidf_matrix)})
        
        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # Create a WandB Table with top words data
        columns = ["Topic"] + [f"Word {i+1}" for i in range(10)]
        top_words_table = wandb.Table(data=top_words_data, columns=columns)
        
        # Log the table to WandB
        wandb.log({f"n_topics_{n_topics}_cleaned_{str(remove_stopwords)}_lemmatize_{str(use_lemmatize)}_weight_{tags_weighting}": top_words_table})

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)
        nmf_H = nmf_model.components_

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)

        nmf_top_words_table = wandb.Table(data=nmf_top_words_data, columns=columns)
        wandb.log({f"nmf_n_topics_{n_topics}": nmf_top_words_table})

    
    # Close WandB run
    wandb.finish()

# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log(df, remove_stopwords, use_lemmatize, use_stemmer, tags_weighting, run_name, ngram_range=(1, 1)):
    # Start a new WandB run with the specified name
    wandb.init(project="stackexchange_politics", entity="s223730", name=run_name)
    # Make sure the script runs in the correct WandB project
    print(wandb.run.project_name())

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF
    # ngram_range=(1, 2) for bi-grams, (1, 3) for tri-grams, and (2, 2) for only bi-grams
    tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    for n_topics in [5, 10, 15, 20]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Log Coherence and Perplexity Score
        wandb.log({"coherence_score": coherence_lda, "perplexity_score": lda.perplexity(tfidf_matrix)})
        
        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # Create a WandB Table with top words data
        columns = ["Topic"] + [f"Word {i+1}" for i in range(10)]
        top_words_table = wandb.Table(data=top_words_data, columns=columns)
        
        # Log the table to WandB
        wandb.log({f"n_topics_{n_topics}_cleaned_{str(remove_stopwords)}_lemmatize_{str(use_lemmatize)}_weight_{tags_weighting}": top_words_table})

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)
        nmf_H = nmf_model.components_

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)

        nmf_top_words_table = wandb.Table(data=nmf_top_words_data, columns=columns)
        wandb.log({f"nmf_n_topics_{n_topics}": nmf_top_words_table})

    # Close WandB run
    wandb.finish()

In [8]:
# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log(df, remove_stopwords, use_lemmatize, use_stemmer, tags_weighting, run_name, ngram_range=(1, 1), max_features=1000):
    # Start a new WandB run with the specified name
    wandb.init(project="stackexchange_politics", entity="s223730", name=run_name)

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF with the specified max_features
    # ngram_range=(1, 2) for bi-grams, (1, 3) for tri-grams, and (2, 2) for only bi-grams
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    for n_topics in [5, 10, 15, 20]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Log Coherence and Perplexity Score
        wandb.log({"coherence_score": coherence_lda, "perplexity_score": lda.perplexity(tfidf_matrix)})
        
        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # Create a WandB Table with top words data
        columns = ["Topic"] + [f"Word {i+1}" for i in range(10)]
        top_words_table = wandb.Table(data=top_words_data, columns=columns)
        
        # Log the table to WandB
        wandb.log({f"n_topics_{n_topics}_cleaned_{str(remove_stopwords)}_lemmatize_{str(use_lemmatize)}_weight_{tags_weighting}": top_words_table})

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)
        nmf_H = nmf_model.components_

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)

        nmf_top_words_table = wandb.Table(data=nmf_top_words_data, columns=columns)
        wandb.log({f"nmf_n_topics_{n_topics}": nmf_top_words_table})

    # Close WandB run
    wandb.finish()

In [None]:
# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log__(df, remove_stopwords, use_lemmatize, use_stemmer, tags_weighting, run_name, ngram_range=(1, 1), max_features=1000):
    # Start a new WandB run with the specified name
    wandb.init(project="stackexchange_politics", entity="s223730", name=run_name)

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF with the specified max_features
    # ngram_range=(1, 2) for bi-grams, (1, 3) for tri-grams, and (2, 2) for only bi-grams
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    for n_topics in [5, 10, 15, 20]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Extract Topic Distributions for LDA
        lda_topic_distributions = lda.transform(tfidf_matrix)

        # Normalize LDA Topic Distributions
        lda_normalized = np.array(lda_topic_distributions) / np.sum(lda_topic_distributions, axis=1)[:, None]

        # Log LDA normalized distributions
        wandb.log({"lda_normalized_distributions": wandb.Table(data=lda_normalized.tolist())})

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Log Coherence and Perplexity Score
        wandb.log({"coherence_score": coherence_lda, "perplexity_score": lda.perplexity(tfidf_matrix)})
        
        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # Create a WandB Table with top words data
        columns = ["Topic"] + [f"Word {i+1}" for i in range(10)]
        top_words_table = wandb.Table(data=top_words_data, columns=columns)
        
        # Log the table to WandB
        wandb.log({f"n_topics_{n_topics}_cleaned_{str(remove_stopwords)}_lemmatize_{str(use_lemmatize)}_weight_{tags_weighting}": top_words_table})

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)

        # Normalize NMF Topic Distributions (nmf_W is already the topic distribution matrix)
        nmf_normalized = np.array(nmf_W) / np.sum(nmf_W, axis=1)[:, None]

        # Log NMF normalized distributions
        wandb.log({"nmf_normalized_distributions": wandb.Table(data=nmf_normalized.tolist())})

        nmf_H = nmf_model.components_

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)

        nmf_top_words_table = wandb.Table(data=nmf_top_words_data, columns=columns)
        wandb.log({f"nmf_n_topics_{n_topics}": nmf_top_words_table})

    # Close WandB run
    wandb.finish()

### Notes on Coherence and Perplexity

- **Coherence**: This measures how well the topics are defined. A higher coherence score generally indicates that the topics are more interpretable and distinct. Look for configurations with the highest coherence scores.

- **Perplexity**: This is a measure of how well the model predicts a sample. In general, lower perplexity indicates a better model. However, perplexity can sometimes be misleading, especially if the model is overfitting.

### Locality Sensitive Hashing (LSH)
- **Application**: LSH is typically used for similarity searches in high-dimensional data. In your case, it can help identify documents (posts or comments) that are similar in terms of their topic distributions.



### Running different LDA configurations

#### Baseline

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                             remove_stopwords=False, 
                             use_lemmatize=False, 
                             use_stemmer=False,
                             tags_weighting=1, 
                             run_name="MaxFeatures_500",
                             ngram_range=(1, 1),
                             max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                             remove_stopwords=False, 
                             use_lemmatize=False, 
                             use_stemmer=False,
                             tags_weighting=1, 
                             run_name="MaxFeatures_1000",
                             ngram_range=(1, 1),
                             max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                             remove_stopwords=False, 
                             use_lemmatize=False, 
                             use_stemmer=False,
                             tags_weighting=1, 
                             run_name="MaxFeatures_2000",
                             ngram_range=(1, 1),
                             max_features=2000)

#### Baseline removed stopwords

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords tags weight = 2

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords tags weight = 5

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=1000)                  

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords, lemmatized

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords, lemmatized tags weight = 2

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords, lemmatized tags weight = 5

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords, stemmed

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed",
                  ngram_range=(1, 1),
                  max_features=2000)

#### Removed stopwords, stemmed tags weight = 2

In [None]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_TagsWeight2",
                  ngram_range=(1, 1),
                  max_features=2000)

#### **Removed stopwords, stemmed tags weight = 5**

In [9]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_TagsWeight5",
                  ngram_range=(1, 1),
                  max_features=2000)

[34m[1mwandb[0m: Currently logged in as: [33ms223730[0m. Use [1m`wandb login --relogin`[0m to force relogin


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.031 MB of 0.034 MB uploaded\r'), FloatProgress(value=0.9336071377991786, max=1.0…



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,1277.37206


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▄▆█

0,1
coherence_score,0.35496
perplexity_score,2478.33239


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011131948155363919, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▄▇█

0,1
coherence_score,0.35496
perplexity_score,4224.38424


### The same but with Unigrams and Bigrams (1,2)

#### Baseline removed stopwords

In [10]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Bigram",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Bigram",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Bigram",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01114388981109692, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,1339.88289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01116766018870597, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,2520.75587


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011156426855531107, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,4763.16391


#### Removed stopwords tags weight = 2

In [11]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011178220833405956, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,1339.88289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011174998611224712, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.036 MB uploaded\r'), FloatProgress(value=0.9290314101721645, max=1.0…



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,2520.75587


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167671755538322, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.037 MB uploaded\r'), FloatProgress(value=0.9298363886223974, max=1.0…



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,4763.16391


#### Removed stopwords tags weight = 5

In [12]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011173683799764451, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.033 MB of 0.036 MB uploaded\r'), FloatProgress(value=0.9309706994075544, max=1.0…



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,1339.88289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167756022122275, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,2520.75587


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167643977872406, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,4763.16391


#### Removed stopwords, lemmatized

In [13]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_Bigram",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_Bigram",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_Bigram",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011131453699716885, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,1277.64546


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011175367588617115, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,2537.2021


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011117386111355801, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,4840.09945


#### Removed stopwords, lemmatized tags weight = 2

In [14]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011173947222414427, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.033 MB of 0.036 MB uploaded\r'), FloatProgress(value=0.9312300982044901, max=1.0…



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,1277.64546


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167731944523338, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,2537.2021


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011164463888659763, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,4840.09945


#### Removed stopwords, lemmatized tags weight = 5

In [15]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114268055340897, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.033 MB of 0.036 MB uploaded\r'), FloatProgress(value=0.9312300982044901, max=1.0…



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,1277.64546


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011170307866672778, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,2537.2021


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011118076388892303, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,4840.09945


#### Removed stopwords, stemmed

In [16]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_Bigram",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_Bigram",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_Bigram",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011172239344321294, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.034 MB uploaded\r'), FloatProgress(value=0.9332940945654307, max=1.0…



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,1280.16521


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011171810188938657, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.034 MB uploaded\r'), FloatProgress(value=0.9318810222036028, max=1.0…



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,2497.71731


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168671300078535, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.035 MB of 0.035 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▅█

0,1
coherence_score,0.35496
perplexity_score,4711.58136


#### Removed stopwords, stemmed tags weight = 2

In [17]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_Bigram_TagsWeight2",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167197688741403, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.034 MB uploaded\r'), FloatProgress(value=0.9332940945654307, max=1.0…

0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,1280.16521


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168210188690056, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,2497.71731


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0111675120333934, max=1.0))…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.9313349695281692, max=1.0…



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▅█

0,1
coherence_score,0.35496
perplexity_score,4711.58136


#### Removed stopwords, stemmed tags weight = 5

In [18]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_Bigram_TagsWeight5",
                  ngram_range=(1, 2),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01116763194440864, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,1280.16521


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167352322243258, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,2497.71731


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011169167588943513, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.9313349695281692, max=1.0…



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▅█

0,1
coherence_score,0.35496
perplexity_score,4711.58136


### The same but with Unigrams, Bigrams, and Trigrams (1,3)

#### Baseline removed stopwords

In [19]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Trigram",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Trigram",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Trigram",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167625933497523, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,1339.88289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011170689811115153, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,2621.50508


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01118198982213572, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,5064.96906


#### Removed stopwords tags weight = 2

In [20]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167881477740593, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,1339.88289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168753711131608, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.036 MB uploaded\r'), FloatProgress(value=0.9288443800237185, max=1.0…



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,2621.50508


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167764811155697, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,5064.96906


#### Removed stopwords tags weight = 5

In [21]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011187447688684592, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▃▆█

0,1
coherence_score,0.33588
perplexity_score,1339.88289


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011176567589023358, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,2621.50508


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168355555532293, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.33588
perplexity_score,5064.96906


#### Removed stopwords, lemmatized

In [22]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_Trigram",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_Trigram",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_Trigram",
                  ngram_range=(1, 3),
                  max_features=2000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,1277.64546


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167333333287389, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,2537.2021


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01116793982218951, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.036 MB uploaded\r'), FloatProgress(value=0.930161810600299, max=1.0)…



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,4705.70965


#### Removed stopwords, lemmatized tags weight = 2

In [23]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01116872823331505, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,1277.64546


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011222992133116349, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,2537.2021


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011129863888749645, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,4705.70965


#### Removed stopwords, lemmatized tags weight = 5

In [24]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Lemmatized_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Lemmatized_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=True, 
                  use_stemmer=False,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Lemmatized_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167729633355824, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,1277.64546


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011225884722403458, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,2537.2021


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168752322232144, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,██▅▁
perplexity_score,▁▄▆█

0,1
coherence_score,0.31121
perplexity_score,4705.70965


#### Removed stopwords, stemmed

In [25]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_Trigram",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_Trigram",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=1, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_Trigram",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168510644347407, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.034 MB uploaded\r'), FloatProgress(value=0.933220957999719, max=1.0)…



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▅█

0,1
coherence_score,0.35496
perplexity_score,1287.06658


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011125706011080183, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,2497.71731


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167733333422802, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.931306522994035, max=1.0)…

0,1
coherence_score,█▁▂▆
perplexity_score,▁▄▅█

0,1
coherence_score,0.35496
perplexity_score,4706.77275


#### Removed stopwords, stemmed tags weight = 2

In [26]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=2, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_Trigram_TagsWeight2",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01115793981104313, max=1.0)…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▅█

0,1
coherence_score,0.35496
perplexity_score,1287.06658


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011137275466656623, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,2497.71731


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112298155462162, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.035 MB of 0.035 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▄▅█

0,1
coherence_score,0.35496
perplexity_score,4706.77275


#### Removed stopwords, stemmed tags weight = 5

In [27]:
apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_500_StopwordsRemoved_Stemmed_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=500)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_1000_StopwordsRemoved_Stemmed_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=1000)

apply_topic_modeling_and_log(df_posts[df_posts['PostTypeId'] == 1], 
                  remove_stopwords=True, 
                  use_lemmatize=False, 
                  use_stemmer=True,
                  tags_weighting=5, 
                  run_name="MaxFeatures_2000_StopwordsRemoved_Stemmed_Trigram_TagsWeight5",
                  ngram_range=(1, 3),
                  max_features=2000)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167656022007578, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.032 MB of 0.034 MB uploaded\r'), FloatProgress(value=0.933220957999719, max=1.0)…

0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▅█

0,1
coherence_score,0.35496
perplexity_score,1287.06658


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011126051844459854, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▃▆█

0,1
coherence_score,0.35496
perplexity_score,2497.71731


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168635188692457, max=1.0…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize, use_stemmer))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

VBox(children=(Label(value='0.035 MB of 0.035 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
coherence_score,█▁▂▆
perplexity_score,▁▄▅█

0,1
coherence_score,0.35496
perplexity_score,4706.77275


### Suggestions for Improvements
- Adjust StopWords?
- **Hyperparameter Tuning**: Tune the parameters of the LDA model,
    - learning decay
    - batch size

### Topic Modelling


- Should only Post-Level have topic assigned to them?
    - Then Sub-Posts are assigned the same topic as Post
    - Comments are assigned the same topic as Post

In [None]:
questions_df.to_pickle('questions_cleaned_text.pkl')

### Sentiment Analysis
Apply Sentiment Analysis on:
- Post Level
- Sub Post Level
- Comment Level

### Community Detection

**User-Post-Topic Matrix**: 
- Create a matrix where rows represent users and columns represent topics. 
- Each cell contains the count of posts/comments a user has made in a particular topic.
    - Post Level: where `PostTypeId` == 1 AND `ParentId` == -1
    - Sub Post Level: where `PostTypeId` == 1 AND `ParentId` != -1
    - Comment Level: where `PostTypeId` == 2
- **Include Post Statistics**
    - AcceptedAnswerId
    - Score
    - ViewCount
    - AnswerCount
    - CommentCount
- **Include Comment Statistics**
    - Score

**Clustering Algorithms**
- K-Means: Use the user-topic matrix to cluster users. Determine the optimal number of clusters (communities) using the Elbow method or Silhouette score.

- Hierarchical Clustering: Useful for understanding the data structure and forming hierarchical communities. Dendrograms can visualize the community structure.

- DBSCAN: Good for datasets with noise and clusters of varying densities.

**Market Basket Analysis**
- Association Rules and Apriori Algorithm: 
    - Treat each user's set of topics as a 'basket'. 
    - Identify strong rules where the presence of one topic implies the presence of another in a user's posts
    - This can highlight topic-based communities.
- Frequent Itemsets: 
    - Identify sets of topics that frequently occur together in users' posts.

**Locality Sensitive Hashing (LSH)**
- LSH for Dimension Reduction: 
    - If the user-topic matrix is very sparse and high-dimensional, LSH can reduce dimensions while preserving the similarity structure. This can make subsequent clustering more effective.

**Advanced Techniques**
- PCY Algorithm: If you're dealing with very large data, this algorithm efficiently finds frequent itemsets, useful in subsequent association rule mining.

### Evaluating Communities

**Davies-Bouldin Index**: Evaluate the quality of clusters. 
- Lower Davies-Bouldin index values signify better clustering.