In [1]:
from bs4 import BeautifulSoup
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import itertools
import nltk
import numpy as np
import pandas as pd
import re
import wandb

In [4]:
# Load the five "*_typecasted.pkl" DataFrames in one pass.
# The paths below are listed in the same order as the names they are unpacked into.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
(
    df_comments,
    df_posts,
    df_post_links,
    df_tags,
    df_users,
) = map(
    pd.read_pickle,
    (
        './pickle_dataframes/comments_typecasted.pkl',
        './pickle_dataframes/posts_typecasted.pkl',
        './pickle_dataframes/post_links_typecasted.pkl',
        './pickle_dataframes/tags_typecasted.pkl',
        './pickle_dataframes/users_typecasted.pkl',
    ),
)

### Filtering Posts

In [5]:
# Latest "CreationDate" found in the comments table and in the posts table, respectively.
max_date_comments, max_date_posts = (
    frame["CreationDate"].max() for frame in (df_comments, df_posts)
)

# The later of the two serves as the reference point for the n-year filter.
max_date = max(max_date_comments, max_date_posts)
print(max_date)

2023-09-03 09:33:40.880000


### Active users

In [6]:
# A user is "active" when their posts + comments total is strictly greater than this value.
ACTIVITY_THRESHOLD = 200

# Drop rows whose author id is -1 (UserId for comments, OwnerUserId for posts).
# NOTE: this rebinds df_comments / df_posts, so every count below is taken after this step.
df_comments = df_comments[df_comments['UserId'] != -1]
df_posts = df_posts[df_posts['OwnerUserId'] != -1]

# Split posts by type: PostTypeId == 1 goes to questions_df, PostTypeId == 2 to answers_df.
questions_df = df_posts[df_posts['PostTypeId'] == 1]
answers_df = df_posts[df_posts['PostTypeId'] == 2]

# Number of posts / comments authored per user, indexed by the user id.
user_posts_count = df_posts.groupby('OwnerUserId').size().rename('PostCount')
user_comments_count = df_comments.groupby('UserId').size().rename('CommentCount')

# Left-join both counts onto the user table; users absent from a count get 0 for it.
user_data = (
    df_users
    .merge(user_posts_count, left_on='Id', right_index=True, how='left')
    .merge(user_comments_count, left_on='Id', right_index=True, how='left')
    .fillna({'PostCount': 0, 'CommentCount': 0})
)

# Total activity = posts + comments; keep only users above the threshold.
user_data['TotalActivity'] = user_data['PostCount'] + user_data['CommentCount']
active_users = user_data[user_data['TotalActivity'] > ACTIVITY_THRESHOLD]
active_user_ids = set(active_users['Id'])

# Questions asked by active users.
filtered_questions = questions_df[questions_df['OwnerUserId'].isin(active_user_ids)]

# Ids of every post (any PostTypeId) written by an active user.
active_user_post_ids = set(
    df_posts.loc[df_posts['OwnerUserId'].isin(active_user_ids), 'Id']
)

# Comments that were written by an active user OR attached to an active user's post.
# drop_duplicates() then removes rows that are identical across all columns.
filtered_comments = df_comments[
    df_comments['UserId'].isin(active_user_ids)
    | df_comments['PostId'].isin(active_user_post_ids)
].drop_duplicates()

# Report the effect of the filtering. "before filtering" here means after the
# -1 author rows were removed above, but before the active-user filter.
print(f"Number of comments before filtering: {df_comments.shape[0]}")
print(f"Number of questions before filtering: {questions_df.shape[0]}")
print(f"Number of comments after filtering: {filtered_comments.shape[0]}")
print(f"Number of questions after filtering: {filtered_questions.shape[0]}")
print(f"Number of users before Activity Threshold: {len(set(user_data['Id']))}")
print(f"Number of users after Activity Threshold: {len(active_user_ids)}")

Number of comments  47505654
Number of questions  2567938
