In [2]:
from bs4 import BeautifulSoup
from datetime import timedelta
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim.corpora as corpora
import gensim
import html
import itertools
import nltk
import numpy as np
import pandas as pd
import re
import wandb

In [8]:
# Load the five "typecasted" tables from their pickles (presumably written by
# an earlier preprocessing step -- confirm). Paths are relative to the
# notebook's working directory. pd.read_pickle must only be pointed at trusted
# files, since unpickling can execute arbitrary code.
df_comments, df_posts, df_post_links, df_tags, df_users = map(
    pd.read_pickle,
    (
        './pickle_dataframes/comments_typecasted.pkl',
        './pickle_dataframes/posts_typecasted.pkl',
        './pickle_dataframes/post_links_typecasted.pkl',
        './pickle_dataframes/tags_typecasted.pkl',
        './pickle_dataframes/users_typecasted.pkl',
    ),
)

### Filtering Posts

In [9]:
# Newest CreationDate in each table; the later of the two is the reference
# point for the n-year look-back filter.
max_date_comments, max_date_posts = (
    frame["CreationDate"].max() for frame in (df_comments, df_posts)
)
max_date = max(max_date_comments, max_date_posts)
print(max_date)

2023-09-03 09:33:40.880000


### Active users

In [10]:
# Report how many comments, questions and users remain for a given look-back window and activity threshold

def filter_data(years_back, activity_threshold):
    """Print how a look-back window and an activity threshold shrink the data.

    Reads the module-level ``df_comments``, ``df_posts`` and ``df_users``
    frames without modifying them. The function only prints the resulting
    counts; it returns ``None``.

    Parameters
    ----------
    years_back : int or float
        Length of the window, counted backwards from the newest
        ``CreationDate`` found in either comments or posts. A year is taken
        as exactly 365 days, so leap days are not accounted for.
    activity_threshold : int or float
        Minimum number of posts plus comments inside the window (inclusive)
        for a user to be considered active.

    Printed counts
    --------------
    * comments / questions inside the window (rows with user id -1 removed),
    * comments written by an active user or attached to a post owned by an
      active user, and questions owned by an active user,
    * number of distinct user ids in ``df_users`` and number of active users.
    """
    # The newest timestamp across both tables anchors the look-back window.
    max_date = max(df_comments["CreationDate"].max(), df_posts["CreationDate"].max())
    min_date = max_date - timedelta(days=365 * years_back)

    # Keep rows inside the window and drop rows whose user id is -1
    # (presumably a placeholder for a missing/system account -- confirm).
    recent_comments = df_comments[
        (df_comments['UserId'] != -1) & (df_comments['CreationDate'] >= min_date)
    ]
    recent_posts = df_posts[
        (df_posts['OwnerUserId'] != -1) & (df_posts['CreationDate'] >= min_date)
    ]

    # Only questions (PostTypeId == 1) are reported; posts of every type
    # still count towards a user's activity below.
    questions_df = recent_posts[recent_posts['PostTypeId'] == 1]

    # Per-user activity inside the window.
    user_posts_count = recent_posts.groupby('OwnerUserId').size().rename('PostCount')
    user_comments_count = recent_comments.groupby('UserId').size().rename('CommentCount')

    # Only the Id column of df_users is needed afterwards, so the remaining
    # user columns are left out of the merges. Users without any post or
    # comment in the window get a count of 0 instead of NaN.
    user_data = (
        df_users[['Id']]
        .merge(user_posts_count, left_on='Id', right_index=True, how='left')
        .merge(user_comments_count, left_on='Id', right_index=True, how='left')
        .fillna({'PostCount': 0, 'CommentCount': 0})
        .assign(TotalActivity=lambda users: users['PostCount'] + users['CommentCount'])
    )
    active_user_ids = set(
        user_data.loc[user_data['TotalActivity'] >= activity_threshold, 'Id']
    )

    # Questions owned by active users.
    filtered_questions = questions_df[questions_df['OwnerUserId'].isin(active_user_ids)]

    # Comments are kept when an active user wrote them OR when they sit on a
    # post (of any type) owned by an active user.
    active_user_post_ids = set(
        recent_posts.loc[recent_posts['OwnerUserId'].isin(active_user_ids), 'Id']
    )
    written_by_active = recent_comments['UserId'].isin(active_user_ids)
    on_active_post = recent_comments['PostId'].isin(active_user_post_ids)
    filtered_comments = recent_comments[written_by_active | on_active_post].drop_duplicates()

    # Report the counts.
    print(f"Years back: {years_back}, Activity threshold: {activity_threshold}")
    print(f"Number of comments before filtering: {recent_comments.shape[0]}")
    print(f"Number of questions before filtering: {questions_df.shape[0]}")
    print(f"Number of comments after filtering: {filtered_comments.shape[0]}")
    print(f"Number of questions after filtering: {filtered_questions.shape[0]}")
    print(f"Number of users before Activity Threshold: {len(set(user_data['Id']))}")
    print(f"Number of users after Activity Threshold: {len(active_user_ids)}\n")

In [11]:
# Scenario 1: 10-year window, users with at least 100 posts + comments
filter_data(years_back=10, activity_threshold=100)

Years back: 10, Activity threshold: 100
Number of comments before filtering: 37504942
Number of questions before filtering: 2320501
Number of comments after filtering: 29251269
Number of questions after filtering: 474171
Number of users before Activity Threshold: 1116805
Number of users after Activity Threshold: 52337



In [12]:
# Scenario 2: 5-year window, users with at least 100 posts + comments
filter_data(years_back=5, activity_threshold=100)

Years back: 5, Activity threshold: 100
Number of comments before filtering: 16458151
Number of questions before filtering: 1405906
Number of comments after filtering: 11461308
Number of questions after filtering: 177242
Number of users before Activity Threshold: 1116805
Number of users after Activity Threshold: 20851



In [13]:
# Scenario 3: 10-year window, users with at least 200 posts + comments
filter_data(years_back=10, activity_threshold=200)

Years back: 10, Activity threshold: 200
Number of comments before filtering: 37504942
Number of questions before filtering: 2320501
Number of comments after filtering: 25879360
Number of questions after filtering: 276931
Number of users before Activity Threshold: 1116805
Number of users after Activity Threshold: 25901



In [14]:
# Scenario 4: 5-year window, users with at least 200 posts + comments
filter_data(years_back=5, activity_threshold=200)

Years back: 5, Activity threshold: 200
Number of comments before filtering: 16458151
Number of questions before filtering: 1405906
Number of comments after filtering: 10096665
Number of questions after filtering: 92049
Number of users before Activity Threshold: 1116805
Number of users after Activity Threshold: 10087



In [15]:
# Scenario 5: 10-year window, users with at least 250 posts + comments
filter_data(years_back=10, activity_threshold=250)

Years back: 10, Activity threshold: 250
Number of comments before filtering: 37504942
Number of questions before filtering: 2320501
Number of comments after filtering: 24737413
Number of questions after filtering: 225863
Number of users before Activity Threshold: 1116805
Number of users after Activity Threshold: 20447



In [16]:
# Scenario 6: 5-year window, users with at least 250 posts + comments
filter_data(years_back=5, activity_threshold=250)

In [None]:
# Scenario 7: 3-year window, users with at least 200 posts + comments
filter_data(years_back=3, activity_threshold=200)

In [None]:
# Scenario 8: 3-year window, users with at least 200 posts + comments
# NOTE(review): these are the same arguments as the preceding call (3 years,
# threshold 200) -- possibly a copy-paste slip; confirm the intended values.
filter_data(years_back=3, activity_threshold=200)