In [None]:
from bs4 import BeautifulSoup
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import itertools
import nltk
import numpy as np
import pandas as pd
import re
import wandb

In [None]:
# Load the five tables from their pickle files, in this order:
# comments, posts, post links, tags, users.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df_c, df_p, df_pl, df_t, df_u = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './pickle_dataframes/comments_typecasted.pkl',
        './pickle_dataframes/posts_typecasted.pkl',
        './pickle_dataframes/post_links_typecasted.pkl',
        './pickle_dataframes/tags_typecasted.pkl',
        './pickle_dataframes/users_typecasted.pkl',
    )
)

In [None]:
# Work on copies so the frames loaded from disk (df_c, df_p, df_u) are left as loaded.
# Post links and tags are not copied at this point; enable if needed:
#   df_post_links = df_pl.copy()
#   df_tags = df_t.copy()
df_comments, df_posts, df_users = df_c.copy(), df_p.copy(), df_u.copy()

### Filtering Posts

In [None]:
# Most recent CreationDate found in each table.
max_date_comments = df_comments.loc[:, "CreationDate"].max()
max_date_posts = df_posts.loc[:, "CreationDate"].max()

# Reference date for the n-year filter: the later of the two.
# The posts date is chosen only when it is strictly later; otherwise the comments date is kept.
if max_date_posts > max_date_comments:
    max_date = max_date_posts
else:
    max_date = max_date_comments
print(max_date)

### Topic model: only posts from active users

In [None]:
# Drop rows whose author id is -1 (presumably a placeholder for an unknown user; confirm against the data dump).
df_comments = df_comments.loc[df_comments['UserId'].ne(-1)]
df_posts = df_posts.loc[df_posts['OwnerUserId'].ne(-1)]

# Split the remaining posts by PostTypeId: 1 goes to questions_df, 2 goes to answers_df.
questions_df = df_posts.loc[df_posts['PostTypeId'].eq(1)]
answers_df = df_posts.loc[df_posts['PostTypeId'].eq(2)]

# Optional size checks (disabled):
# print("Number of comments ", df_comments.shape[0])
# print("Number of posts ", df_posts.shape[0])
# print("Number of questions ", questions_df.shape[0])
# print("Number of answers ", answers_df.shape[0])

# Number of distinct user ids that appear as a question author, an answer author,
# or a commenter. A missing id, if present, counts as one value.
unique_user_count = pd.concat(
    [questions_df["OwnerUserId"], answers_df["OwnerUserId"], df_comments["UserId"]]
).nunique(dropna=False)
# print("Number of Unique Users: ", unique_user_count)

# Per-user row counts: posts keyed by OwnerUserId, comments keyed by UserId.
user_posts_count = df_posts.groupby('OwnerUserId').size().rename('PostCount')
user_comments_count = df_comments.groupby('UserId').size().rename('CommentCount')

# Attach both counts to the user table (left join on the user Id), treat users with
# no posts or no comments as 0, then add the combined total as TotalActivity.
user_data = (
    df_users
    .merge(user_posts_count, how='left', left_on='Id', right_index=True)
    .merge(user_comments_count, how='left', left_on='Id', right_index=True)
    .fillna({'PostCount': 0, 'CommentCount': 0})
    .assign(TotalActivity=lambda users: users['PostCount'] + users['CommentCount'])
)

# Report how many distinct users exist before the activity threshold is applied.
print("Users before Activity Threshold: " , user_data['Id'].nunique())

# Active users are those with strictly more than 200 posts and comments combined.
active_users = user_data.loc[user_data['TotalActivity'].gt(200)]
active_user_ids = set(active_users['Id'])
print("Users after Activity Threshold: " ,len(active_user_ids))

# Questions written by active users.
filtered_questions_df = questions_df.loc[questions_df['OwnerUserId'].isin(active_user_ids)]

# Ids of every post (of any PostTypeId) written by an active user.
active_user_post_ids = set(df_posts.loc[df_posts['OwnerUserId'].isin(active_user_ids), 'Id'])

# Keep a comment if an active user wrote it or it is attached to an active user's post;
# rows that are exact duplicates are then removed.
filtered_comments = df_comments.loc[
    df_comments['UserId'].isin(active_user_ids)
    | df_comments['PostId'].isin(active_user_post_ids)
].drop_duplicates()

# Number of distinct user ids across the filtered questions and filtered comments.
unique_active_user_count = pd.concat(
    [filtered_questions_df["OwnerUserId"], filtered_comments["UserId"]]
).nunique(dropna=False)

# Optional size checks (disabled):
# print("Number of comments ", filtered_comments.shape[0])
# print("Number of posts ", len(active_user_post_ids))
# print("Number of Unique Users: ", unique_active_user_count)
