In [1]:
from bs4 import BeautifulSoup
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import itertools
import nltk
import numpy as np
import pandas as pd
import pickle 
import re



In [2]:
# Load and merge dataframes
comments = pd.concat([pd.read_pickle('./pickle_dataframes/comments1.pkl'),
                      pd.read_pickle('./pickle_dataframes/comments2.pkl')]).reset_index(drop=True)

posts = pd.concat([pd.read_pickle('./pickle_dataframes/posts1.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts2.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts3.pkl')]).reset_index(drop=True)

users = pd.read_pickle('./pickle_dataframes/users.pkl')
postlinks = pd.read_pickle('./pickle_dataframes/posts_links.pkl')
tags = pd.read_pickle('./pickle_dataframes/tags.pkl')

questions = posts[posts.PostTypeId==1]
answers = posts[posts['PostTypeId'] == 2]

In [3]:
#comments = comments.sample(frac=0.1, random_state=0)
#posts = posts.sample(frac=0.1, random_state=0)

From running various tests we found that the topic modelling method that yielded the best highest coherence score and the lowest perplexity score was:   

In [4]:
# Modify preprocess_text function
def preprocess_text(text, remove_stopwords=False, use_lemmatize=True):
    # Decode HTML entities
    text = html.unescape(text)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    words = text.split()
    if remove_stopwords:
        words = [word for word in words if word not in stopwords.words('english')]
    if use_lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    text = ' '.join(words)
    
    return text

In [5]:
# Define apply_lda_and_log function with run_name parameter
def apply_topic_modeling_and_log(df, remove_stopwords, use_lemmatize, tags_weighting, run_name, ngram_range=(1, 1), max_features=1000):

    # Initialize dictionaries to store topic distributions
    lda_distributions = {}
    nmf_distributions = {}

    # Preprocess Title, Body, and Tags
    df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
    df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
    df['Tags'] = df['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))


    # Combine Title, Body, and Tags with specified weight for Tags
    # We Keep the original order (title, body, tags) as it reflects the natural flow of information
    df['CombinedText'] = df['Title'] + ' ' + df['Body'] + ' ' + (df['Tags'] * tags_weighting)

    # Create a Dictionary and Corpus needed for Topic Modeling
    words = [doc.split() for doc in df['CombinedText']]
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]

    # Apply TF-IDF with the specified max_features
    # ngram_range=(1, 2) for bi-grams, (1, 3) for tri-grams, and (2, 2) for only bi-grams
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['CombinedText'])

    # Apply LDA and NMF for different numbers of topics
    # Prepare a structured dictionary to store results with n_topics as part of the key
    all_topics_results = {}
    for n_topics in [10, 15, 20, 25]:
        
        # LDA
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(tfidf_matrix)

        # Extract Topic Distributions for LDA
        lda_topic_distributions = lda.transform(tfidf_matrix)

        # Normalize LDA Topic Distributions
        lda_normalized = np.array(lda_topic_distributions) / np.sum(lda_topic_distributions, axis=1)[:, None]

        # Calculate Coherence Score
        lda_gensim = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=0)
        coherence_model_lda = CoherenceModel(model=lda_gensim, texts=words, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Calculate LDA Perplexity
        lda_perplexity = lda.perplexity(tfidf_matrix)

        # Extract and log the top words for each topic as a table
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_words_data = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            top_words_data.append([f"Topic {topic_idx}"] + top_words)

        # NMF
        nmf_model = NMF(n_components=n_topics, random_state=0)
        nmf_W = nmf_model.fit_transform(tfidf_matrix)

        # Normalize NMF Topic Distributions (nmf_W is already the topic distribution matrix)
        nmf_normalized = np.array(nmf_W) / np.sum(nmf_W, axis=1)[:, None]

        nmf_H = nmf_model.components_

        # Calculate NMF Reconstruction Error
        nmf_reconstruction_error = np.linalg.norm(tfidf_matrix - nmf_W.dot(nmf_H))

        # Log the top words for each topic for NMF
        nmf_top_words_data = []
        for topic_idx, topic in enumerate(nmf_H):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            nmf_top_words_data.append([f"Topic {topic_idx}"] + top_words)


        # Store the results including perplexity and reconstruction error
        all_topics_results[f"{run_name}_n_topics_{n_topics}"] = {
            'lda_normalized': lda_normalized,
            'nmf_normalized': nmf_normalized,
            'lda_coherence': coherence_lda,
            'lda_perplexity': lda_perplexity,
            'nmf_reconstruction_error': nmf_reconstruction_error,
            'lda_top_words': top_words_data,
            'nmf_top_words': nmf_top_words_data
        }
        


    # Return the topic distributions
    return all_topics_results

### You want to jump straight to the topic modelled DataFrames, don't you?

Here you go: 

In [106]:
# Test various combinations
use_lemmatize_options = [True]
tags_weighting_options = [1, 2, 5]
ngram_range_options = [(1, 1), (1, 2), (1, 3)]
max_features_options = [1000]
remove_stopwords = True 

all_results = {}

for use_lemmatize, tags_weighting, ngram_range, max_features in itertools.product(use_lemmatize_options, tags_weighting_options, ngram_range_options, max_features_options):
    run_name = f"Run_remove_{remove_stopwords}_lemmatize_{use_lemmatize}_weight_{tags_weighting}_ngram_{ngram_range}_maxfeat_{max_features}"

    topics_results = apply_topic_modeling_and_log(
        questions,
        remove_stopwords,
        use_lemmatize,
        tags_weighting, 
        run_name, 
        ngram_range, 
        max_features
    )
    print(run_name)
    all_results.update(topics_results)

# save dictionary to person_data.pkl file
with open('./pickle_dataframes/all_results.pkl', 'wb') as fp:
    pickle.dump(all_results, fp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_1_ngram_(1, 1)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_1_ngram_(1, 2)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_1_ngram_(1, 3)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_2_ngram_(1, 1)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_2_ngram_(1, 2)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_5_ngram_(1, 1)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_5_ngram_(1, 2)_maxfeat_1000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Body'] = df['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tags'] = df['

Run_remove_True_lemmatize_True_weight_5_ngram_(1, 3)_maxfeat_1000


### Find best topic modelling technique + parameters

In [107]:
all_results = pd.read_pickle('./pickle_dataframes/all_results.pkl')

import heapq

# Initialize min-heaps to track the top 5 best scores and parameters for LDA and NMF
top_5_lda = []
top_5_nmf = []

# Iterate through all results
for run_name, results in all_results.items():
    # Extract LDA and NMF scores
    lda_score = (results['lda_coherence'], -results['lda_perplexity'])  # Negative perplexity for min-heap
    nmf_score = -results['nmf_reconstruction_error']  # Negative error for min-heap

    # Update top 5 LDA
    if len(top_5_lda) < 5 or lda_score > top_5_lda[0][0]:
        if len(top_5_lda) == 5:
            heapq.heappop(top_5_lda)
        heapq.heappush(top_5_lda, (lda_score, run_name))

    # Update top 5 NMF
    if len(top_5_nmf) < 5 or nmf_score > top_5_nmf[0][0]:
        if len(top_5_nmf) == 5:
            heapq.heappop(top_5_nmf)
        heapq.heappush(top_5_nmf, (nmf_score, run_name))

# Output top 5 LDA
print("Top 5 LDA:")
for score, params in sorted(top_5_lda, reverse=True):
    print(f"Parameters: {params}, Coherence: {score[0]}, Perplexity: {-score[1]}")

# Output top 5 NMF
print("\nTop 5 NMF:")
for score, params in sorted(top_5_nmf, reverse=True):
    print(f"Parameters: {params}, Reconstruction Error: {-score}")

Top 5 LDA:
Parameters: Run_remove_True_lemmatize_True_weight_5_ngram_(1, 1)_maxfeat_1000_n_topics_25, Coherence: 0.35796155297262766, Perplexity: 2915.358769222285
Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 1)_maxfeat_1000_n_topics_25, Coherence: 0.35796155297262766, Perplexity: 2915.358769222285
Parameters: Run_remove_True_lemmatize_True_weight_5_ngram_(1, 3)_maxfeat_1000_n_topics_25, Coherence: 0.35796155297262766, Perplexity: 2955.353878801326
Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000_n_topics_25, Coherence: 0.35796155297262766, Perplexity: 2955.353878801326
Parameters: Run_remove_True_lemmatize_True_weight_1_ngram_(1, 3)_maxfeat_1000_n_topics_25, Coherence: 0.35796155297262766, Perplexity: 2955.353878801326

Top 5 NMF:
Parameters: Run_remove_True_lemmatize_True_weight_5_ngram_(1, 3)_maxfeat_1000_n_topics_25, Reconstruction Error: 115.23001833114516
Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000_n_

### Run the best topic model
- Parameters: Run_remove_True_lemmatize_True_weight_2_ngram_(1, 3)_maxfeat_1000_n_topics_20, Coherence: 0.4828223717706496, Perplexity: 1705.171976733514


In [108]:
remove_stopwords = True
use_lemmatize = True 
tags_weighting = 5
ngram_range = (1, 2)
max_features = 1000
n_topics = 25

# Apply preprocessing to each column
questions['Title'] = questions['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
questions['Body'] = questions['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
questions['Tags'] = questions['Tags'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))

# Combine Title, Body, and Tags
questions['CombinedText'] = questions['Title'] + ' ' + questions['Body'] + ' ' + (questions['Tags'] * tags_weighting)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Title'] = questions['Title'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions['Body'] = questions['Body'].apply(lambda x: preprocess_text(x, remove_stopwords, use_lemmatize))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [111]:
# Apply TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
tfidf_matrix = tfidf_vectorizer.fit_transform(questions['CombinedText'])

# Apply LDA
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(tfidf_matrix)

# Assign topics to questions
topic_assignments = lda.transform(tfidf_matrix)

### Assign Topic to every question

In [258]:
questions['Topic'] = topic_assignments.argmax(axis=1)
questions.shape

(16002, 16)

### Assign Topic to every answer

In [259]:
topics_df = questions[['Id', 'Topic']]

# Merge to assign topics from questions to their answers
answers_with_topics = answers.merge(topics_df, left_on='ParentId', right_on='Id', how='left')

# Rename the 'Topic' column to something like 'InheritedTopic' to avoid confusion
answers_with_topics.rename(columns={'Topic': 'AnswerTopic'}, inplace=True)
answers_with_topics.shape

(36090, 18)

### Assign Topic to every comment

In [31]:
# How many comments do we have?
comments.shape

(184672, 6)

In [3]:
questions = pd.read_pickle('./pickle_dataframes/questions_with_topics.pkl')
answers = pd.read_pickle('./pickle_dataframes/answers_with_topics.pkl')
topics_df = questions[['Id', 'Topic']]

In [4]:
# Merge to assign topics from questions to their answers
answers_with_topics = answers.merge(questions[['Id', 'Topic']], left_on='ParentId', right_on='Id', how='left')
answers_with_topics.rename(columns={'Topic': 'AnswerTopic'}, inplace=True)

# Create sets for faster lookup
unique_question_ids = set(questions['Id'].unique())
unique_answer_ids = set(answers_with_topics['Id_x'].unique())

In [8]:
# Assign topics to comments
# Check if the comment is associated with a question
question_comments = comments[comments['PostId'].isin(unique_question_ids)]
question_comments = question_comments.merge(questions[['Id', 'Topic']], left_on='PostId', right_on='Id', how='left')

# Check if the comment is associated with an answer
answer_comments = comments[comments['PostId'].isin(unique_answer_ids)]
answer_comments = answer_comments.merge(answers_with_topics[['Id_x', 'AnswerTopic']], left_on='PostId', right_on='Id_x', how='left')

In [47]:
# Merge and select relevant columns for question comments
question_comments_with_topics = question_comments.merge(questions[['Id', 'Topic']], left_on='PostId', right_on='Id', how='left')
question_comments_with_topics = question_comments_with_topics[['Id_x', 'PostId', 'Score', 'Text', 'CreationDate', 'UserId', 'Topic_x']]
question_comments_with_topics.rename(columns={'Id_x': 'Id', 'Topic_x': 'CommentTopic'}, inplace=True)

In [48]:
# Merge and select relevant columns for answer comments
answer_comments_with_topics = answer_comments.merge(answers_with_topics[['Id_x', 'AnswerTopic']], left_on='PostId', right_on='Id_x', how='left')
answer_comments_with_topics = answer_comments_with_topics[['Id', 'PostId', 'Score', 'Text', 'CreationDate', 'UserId', 'AnswerTopic_x']]
answer_comments_with_topics = answer_comments_with_topics.iloc[:, :7]
answer_comments_with_topics.rename(columns={'Id_x': 'Id', 'AnswerTopic_x': 'CommentTopic'}, inplace=True)

In [52]:
# Concatenate question and answer comments
all_comments_with_topics = pd.concat([question_comments_with_topics, answer_comments_with_topics], ignore_index=True)
all_comments_with_topics.head()

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,CommentTopic
0,1,1,9,Is it fair to inquire about the disadvantages ...,2012-12-04 22:00:00.933,28,7
1,3,1,3,"I could have reformulated the question, but at...",2012-12-04 22:02:37.737,18,7
2,7,2,2,Source on this? I don't see how it could possi...,2012-12-04 22:10:10.070,45,21
3,13,2,1,@Nick122 In a parliamentary system like the No...,2012-12-04 22:14:33.463,43,21
4,15,2,0,"Yes, but you will give a negative vote by voti...",2012-12-04 22:16:29.437,45,21


In [None]:
all_comments_with_topics.CommentTopic.value_counts().sum()

# Only 50 columns did not have either a corresponding question or answer
# Let's see which comments it was
comments_without_question_or_answer = comments[~comments['PostId'].isin(unique_question_ids) & ~comments['PostId'].isin(unique_answer_ids)]
comments_without_question_or_answer.PostId.unique()

In [45]:
# Printing this we see that the posts do not exist and that's why their comments could not inherit a topic
posts[posts['Id'].isin(comments_without_question_or_answer.PostId.unique())]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount


### Save the Results

In [51]:
# Save the posts dataframe with topic assignments
#questions.to_pickle('./pickle_dataframes/questions_with_topics.pkl')
#answers_with_topics.to_pickle('./pickle_dataframes/answers_with_topics.pkl')
#all_comments_with_topics.to_pickle('./pickle_dataframes/comments_with_topics.pkl')