# Analyzing Posts and Comments from r/feminism
J. Nathan Matias, Dec 2024

In support of this r/feminism study: https://osf.io/xu258

Goals:
- for the time period covered by the spring 2018 survey
- create a dataset of comments and posts
- report on:
  - the number of newcomers per post (if possible)
  - the number (or proportion?) of comments removed per post
  - the number (or proportion?) of comments per post

In [180]:
%matplotlib inline
import inspect, os, sys, copy, pytz, re, glob, csv, uuid, datetime, jsonlines, time, yaml, math
from bloom_filter import BloomFilter

# Airbrake settings are written to the environment before the project modules
# further down are imported. NOTE(review): presumably those modules read these
# variables at import time -- confirm. Replace both values before running and
# avoid committing a real API key to the notebook.
os.environ['AIRBRAKE_API_KEY'] = "ca826dbd1a4594241c239bba825edd9f" ## EDIT BEFORE USING
os.environ['AIRBRAKE_PROJECT_ID'] = "-1" ## EDIT BEFORE USING

import simplejson as json
import pandas as pd
from dateutil import parser
import datetime
import matplotlib.pyplot as plt   # Matplotlib for plotting
import matplotlib.dates as md
from collections import Counter, defaultdict
import csv, glob, heapq
# Shared UTC tzinfo used to localize timestamps throughout the notebook
utc = pytz.UTC

# Select the CivilServant environment and make the project importable
ENV = "production"
os.environ['CS_ENV'] = 'production'
BASE_DIR = "/cs/civilservant-jupyter"
sys.path.append(BASE_DIR)

# Database credentials live in <BASE_DIR>/config/<ENV>.json
config_path = os.path.join(BASE_DIR, "config") + "/{env}.json".format(env=ENV)
with open(config_path, "r") as config_file:
    DBCONFIG = json.load(config_file)

### LOAD SQLALCHEMY
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import text, and_, or_
import sqlalchemy.orm.session
import utils.common


# SQLAlchemy engine and session, built from the loaded DBCONFIG values
connection_url = "mysql://{user}:{password}@{host}/{database}".format(
    user=DBCONFIG['user'],
    password=DBCONFIG['password'],
    host=DBCONFIG['host'],
    database=DBCONFIG['database'],
)
db_engine = create_engine(connection_url)
DBSession = sessionmaker(bind=db_engine)
db_session = DBSession()


## LOAD MYSQLDB
import MySQLdb
# Second, direct MySQLdb connection to the same database; the DictCursor lets
# later cells address result columns by name (e.g. row['created_utc']).
mysql_settings = {
    "host": DBCONFIG['host'],
    "user": DBCONFIG['user'],
    "passwd": DBCONFIG['password'],
    "db": DBCONFIG['database'],
}
db = MySQLdb.connect(**mysql_settings)
cursor = db.cursor(MySQLdb.cursors.DictCursor)

# ### LOAD PRAW
# import reddit.connection
# conn = reddit.connection.Connect(base_dir=BASE_DIR)
# r = conn.connect()

from app.models import *

### FILTER OUT DEPRECATION WARNINGS ASSOCIATED WITH DECORATORS
# https://github.com/ipython/ipython/issues/9242
# Only DeprecationWarnings whose message matches the regex below are silenced;
# every other warning is still reported.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')

In [11]:
# Subreddit under study; the id is stored without Reddit's "t5_" prefix
subreddit_id = "2qr7i"
subreddit_name = "feminism"
data_dir = "./"

# Observation window for posts, mod actions and front pages
earliest_date = parser.parse("2018-01-1")
latest_date   = parser.parse("2018-04-17")

# Comments are loaded starting six 30-day months before the window opens
newcomer_begin_date = earliest_date - datetime.timedelta(days=6 * 30)

## Load Moderator Actions

In [9]:
# Load every moderator action for the subreddit, from the start of the
# observation window until one week after it closes, oldest first.
recent_mod_actions = []

start = time.time()

mod_action_query = text("""
SELECT action_data FROM mod_actions 
    WHERE subreddit_id="{0}" AND 
          created_utc >= "{1}" AND
          created_utc <= "{2}"
    ORDER BY created_utc;
""".format(subreddit_id,
           earliest_date,
           latest_date +  datetime.timedelta(days=7)))

for record in db_engine.execute(mod_action_query):
    parsed_action = json.loads(record['action_data'])
    # Attach a timezone-aware 'created' built from the epoch 'created_utc' field
    created_naive = datetime.datetime.utcfromtimestamp(parsed_action['created_utc'])
    parsed_action['created'] = utc.localize(created_naive)
    recent_mod_actions.append(parsed_action)

end = time.time()

elapsed_seconds = int(end - start)
print("{0} moderator actions loaded in {1} seconds".format(len(recent_mod_actions), elapsed_seconds))

15600 moderator actions loaded in 3103 seconds


## Load Comments

In [22]:
# Load all comments in the subreddit from the start of the newcomer look-back
# period through the end of the observation window.
newcomer_comment_query = """
SELECT * FROM comments 
    WHERE subreddit_id="{0}" AND 
          created_utc >= "{1}" AND
          created_utc <= "{2}";
""".format(subreddit_id,
           newcomer_begin_date,
           latest_date)

print(newcomer_comment_query)


def _naive_utc(moment):
    """Return *moment* as a naive datetime expressed in UTC.

    Naive inputs are returned unchanged (assumed to already be UTC -- TODO
    confirm against the database column); aware inputs are converted to UTC
    and stripped of their tzinfo.
    """
    if moment.tzinfo is None:
        return moment
    return moment.astimezone(utc).replace(tzinfo=None)


all_accounts = set()
all_comments = []
count_dates = defaultdict(int)  # comments per day offset from newcomer_begin_date
counter = 0

start = time.time()

cursor.execute(newcomer_comment_query)
for row in cursor.fetchall():
    if(isinstance(row['created_utc'], float)):
        # Epoch seconds: convert to a timezone-aware UTC datetime
        row['created'] = utc.localize(datetime.datetime.utcfromtimestamp(row['created_utc']))
    else:
        # Already a datetime object from the database driver: keep as-is
        row['created'] = row['created_utc']

    # Subtracting a naive datetime from an aware one raises TypeError, so
    # normalise both sides before computing the day offset.
    day_diff = (_naive_utc(row['created']) - _naive_utc(newcomer_begin_date)).days

    all_accounts.add(row['user_id'])
    all_comments.append(row)
    count_dates[day_diff] += 1
    counter += 1

end = time.time()

print("Loaded {0} comments in {1} seconds".format(counter, int(end-start)))


SELECT * FROM comments 
    WHERE subreddit_id="2qr7i" AND 
          created_utc >= "2017-07-05 00:00:00" AND
          created_utc <= "2018-04-17 00:00:00";

Loaded 7145 comments in 516 seconds


## Load Posts

In [15]:
# Load every post made to the subreddit during the observation window,
# oldest first.
all_posts = []

start = time.time()

post_query = text("""
SELECT post_data FROM posts 
    WHERE subreddit_id="{0}" AND 
          created >= "{1}" AND
          created <= "{2}"
    ORDER BY created;
""".format(subreddit_id,
           earliest_date,
           latest_date))

for record in db_engine.execute(post_query):
    post = json.loads(record['post_data'])
    # Replace the epoch 'created' value with a timezone-aware UTC datetime
    created_naive = datetime.datetime.utcfromtimestamp(post['created'])
    post['created'] = utc.localize(created_naive)
    all_posts.append(post)

end = time.time()

elapsed_seconds = int(end - start)
print("{0} posts actions loaded in {1} seconds".format(len(all_posts), elapsed_seconds))

912 posts actions loaded in 0 seconds


## Load Front Pages

In [169]:
# Load every front-page snapshot recorded during the observation window,
# oldest first. The query is not restricted to one subreddit.
all_front_pages = []

start = time.time()


front_page_query = text("""
SELECT created_at, page_type, page_data FROM front_pages WHERE 
          created_at >= "{0}" AND
          created_at <= "{1}"
    ORDER BY created_at ASC;
""".format(earliest_date,
           latest_date))
print(front_page_query)

for snapshot in db_engine.execute(front_page_query):
    # 'post_ranking' holds the decoded page_data listing for this snapshot
    all_front_pages.append({
        'post_ranking': json.loads(snapshot['page_data']),
        'created': snapshot['created_at'],
        'page_type': snapshot['page_type'],
    })

end = time.time()

elapsed_seconds = int(end - start)
print("{0} front pages loaded in {1} seconds".format(len(all_front_pages), elapsed_seconds))


SELECT created_at, page_type, page_data FROM front_pages WHERE 
          created_at >= "2018-01-01 00:00:00" AND
          created_at <= "2018-04-17 00:00:00"
    ORDER BY created_at ASC;

72126 front pages loaded in 28 seconds


# Merge Posts, Comments, and Mod Actions
Cover the period from January through April 2018

## Step one: sort all comments

In [43]:
# Reference instant (1970-01-01T00:00:00 UTC) for turning datetimes into
# integer sort keys.
EPOCH = datetime.datetime.fromtimestamp(0, tz=datetime.timezone.utc)


class CommentHeapObj(object):
    """Orderable wrapper around a record that has a 'created' datetime.

    Attributes:
        index: whole seconds between EPOCH and the record's 'created' value.
        val: the wrapped record itself.
    """

    def __init__(self, comment):
        created = comment['created']
        # Naive datetimes are treated as UTC; aware ones are used as they are.
        if created.tzinfo is None:
            created = created.replace(tzinfo=datetime.timezone.utc)
        self.index = int((created - EPOCH).total_seconds())
        self.val = comment

    def __lt__(self, other):
        return self.index < other.index


def heapsort(comments):
    """Return the records in *comments* ordered by ascending 'created' time.

    The sort is stable: records created within the same second keep their
    input order. This matters because mod actions are later replayed in
    sequence and the last action wins. The name is kept for existing callers.
    """
    wrapped = sorted(CommentHeapObj(comment) for comment in comments)
    return [item.val for item in wrapped]

In [51]:
# Order both collections by their 'created' time so that, when mod actions
# are applied to each item later on, they are processed chronologically.
all_comments = heapsort(all_comments)
recent_mod_actions = heapsort(recent_mod_actions)

## Step Two: apply mod actions to comments

In [252]:
# Group moderator actions by the comment or post they target.
# Keys are Reddit ids with the "t1_" / "t3_" fullname prefix stripped.
mod_comment_actions = defaultdict(list)
mod_post_actions    = defaultdict(list)

approved_count = 0
removed_count = 0
total_coments_removed_at_least_once = []
comments_with_mod_actions = set()
posts_with_mod_actions    = set()

for action in recent_mod_actions:
    action_type = action['action']

    if action_type in ("removecomment", "approvecomment"):
        comment_id = action['target_fullname'].replace("t1_", "")
        mod_comment_actions[comment_id].append(action)
        comments_with_mod_actions.add(comment_id)

    if action_type in ("removelink", "approvelink"):
        post_id = action['target_fullname'].replace("t3_", "")
        mod_post_actions[post_id].append(action)
        posts_with_mod_actions.add(post_id)

# Summary counts; the first figure covers comment actions only
comment_action_total = sum(len(actions) for actions in mod_comment_actions.values())
comments_acted_on_repeatedly = [a for a in mod_comment_actions.values() if len(a) > 1]
posts_acted_on_repeatedly = [a for a in mod_post_actions.values() if len(a) > 1]

print("{0} Total moderation actions".format(comment_action_total))
print("{0} Comments with moderation actions".format(len(mod_comment_actions)))
print("{0} Comments with more than one mod action".format(len(comments_acted_on_repeatedly)))
print("")
print("{0} Posts with moderation actions".format(len(mod_post_actions)))
print("{0} Posts with more than one mod action".format(len(posts_acted_on_repeatedly)))
print("")

7197 Total moderation actions
4631 Comments with moderation actions
2330 Comments with more than one mod action

2602 Posts with moderation actions
1694 Posts with more than one mod action



In [72]:
#list(mod_comment_actions.values())[-1]

In [219]:
## now iterate through all comments and apply mod actions

# Replay each comment's mod actions in order to decide whether it ended up
# visible, then bucket the comments by the post they belong to.
comments_by_post = defaultdict(list)
counter = 0  # number of 'removecomment' actions seen on observed comments

for comment in all_comments:
    visible = True
    if comment['id'] in comments_with_mod_actions:
        for action in mod_comment_actions[comment['id']]:
            action_type = action['action']
            if action_type == 'removecomment':
                visible = False
                counter += 1
            elif action_type == 'approvecomment':
                visible = True
    comment['visible'] = visible
    comments_by_post[comment['post_id']].append(comment)

In [276]:
## now iterate through "ghost" comments that received mod actions
## but did not appear in the comments dataset, likely because they
## were excised by the Reddit platform entirely

# Build stand-in records for "ghost" comments: ones that received mod actions
# but are absent from the comments dataset. Each record is reconstructed from
# the fields of the comment's last mod action.
observed_comment_ids = {observed['id'] for observed in all_comments}
modaction_comments_by_post = defaultdict(list)

counter = 0
for comment_id, mod_actions in mod_comment_actions.items():
    if comment_id in observed_comment_ids:
        continue

    # The last remove/approve action decides the final visibility
    for action in mod_actions:
        action_type = action['action']
        if action_type == 'removecomment':
            visible = False
        elif action_type == 'approvecomment':
            visible = True

    ## now extract the post ID: the first path segment after the subreddit prefix
    permalink_tail = action['target_permalink'].replace("/r/Feminism/comments/","")
    post_id = re.sub(r'\/.*', '', permalink_tail)

    modaction_comments_by_post[post_id].append({
        "id": action['target_fullname'].replace("t1_",""),
        "author": action['target_author'],
        "visible": visible,
        "body": action['target_body'],
        "post_id": post_id
    })
    counter += 1

ghost_summary = "Found {0} comments not in the observed set, attached to {1} posts."
print(ghost_summary.format(counter, len(modaction_comments_by_post)))

      

Found 3109 comments not in the observed set, attached to 711 posts.


In [285]:
## How many posts are not in the post dataset
## and how many posts have comments that aren't
## in the comments dataset?
# Split the posts referenced by ghost comments into those present in, and
# those absent from, the loaded post dataset.
all_post_ids = {loaded['id'] for loaded in all_posts}
Counter(candidate in all_post_ids for candidate in modaction_comments_by_post)

ghost_post_ids = list(modaction_comments_by_post.keys())
ghost_posts_missing = [pid for pid in ghost_post_ids if pid not in all_post_ids]
ghost_posts_present = [pid for pid in ghost_post_ids if pid in all_post_ids]

print("Observed removed comments from {0} posts that ARE NOT in the dataset.".format(
    len(ghost_posts_missing)))

print("Observed removed comments from {0} posts that ARE in the dataset.".format(
    len(ghost_posts_present)))

Observed removed comments from 451 posts that ARE NOT in the dataset.
Observed removed comments from 260 posts that ARE in the dataset.


## Step Three: Merge Comments Into Posts and Apply Mod Actions

Note that we omit posts that are not observed in the dataset. These posts most likely **do** exist but were published long before the observation period, and are consequently out of scope: for example, someone might have come back months or years later to add spam to an old post, which then led to a content removal. It is also possible that some of these posts were auto-removed too quickly to be observed, but we are not able to confirm this with the available data.

In [289]:
# For every loaded post: replay its mod actions to get final visibility and
# removal time, then attach comment totals. Comments known only from mod
# actions (absent from the comments dataset) are added to the totals as well.
counter = 0  # loaded posts that have at least one mod action
posts_with_mod_actions_in_list = []

total_unobserved_comments_applied = 0

for post in all_posts:
    post_id = post['id']
    visible, time_removed = True, None

    if post_id in posts_with_mod_actions:
        counter += 1
        posts_with_mod_actions_in_list.append(post_id)
        for mod_action in mod_post_actions[post_id]:
            action_type = mod_action['action']
            if action_type == "removelink":
                visible, time_removed = False, mod_action['created']
            elif action_type == "approvelink":
                visible, time_removed = True, None

    observed_comments = comments_by_post[post_id]
    post['num_comments'] = len(observed_comments)
    post['num_comments_removed'] = sum(1 for seen in observed_comments if not seen['visible'])
    post['visible'] = visible
    post['time_removed'] = time_removed

    # .get() avoids creating empty entries in the defaultdict
    for ghost in modaction_comments_by_post.get(post_id, []):
        total_unobserved_comments_applied += 1
        post['num_comments'] += 1
        if not ghost['visible']:
            post['num_comments_removed'] += 1

applied_summary = "Applied {0} additional unobserved comments from mod actions"
print(applied_summary.format(total_unobserved_comments_applied))

Applied 969 additional unobserved comments from mod actions


In [290]:
# post_id_found = [x['id'] in list(comments_by_post.keys()) for x in all_posts]
# Counter(post_id_found)

In [291]:
# x = list(comments_by_post.keys())
# all_posts[0]['id'] in x

In [292]:
# all_posts[0]['id']

In [321]:
# Count the 'removecomment' mod actions that were loaded. Because the list is
# built from x['target_body'], a removal record lacking that key raises KeyError.
len([x['target_body'] for x in recent_mod_actions if x['action']=='removecomment'])

5916

In [294]:
# Keep only posts created strictly before 1 April 2018 (UTC).
# NOTE(review): the CSV export cell iterates all_posts rather than
# posts_for_analysis -- confirm which set is meant to be exported.
end_date = utc.localize(parser.parse("2018-04-01"))
posts_for_analysis = [
    candidate
    for candidate in all_posts
    if candidate['created'] < end_date
]
selection_summary = "Selected {0} out of {1} posts"
print(selection_summary.format(len(posts_for_analysis), len(all_posts)))

Selected 679 out of 912 posts


## Step Four: Apply Maximum Rank Position Analysis to posts

In [295]:
## note: lower number means a higher rank
## and there are four algorithms monitored
def max_rank_record():
    """Return a fresh rank record with one empty slot per monitored algorithm.

    The keys 1-4 are presumably the front-page ``page_type`` values (confirm
    against the front_pages table); ``None`` means no rank has been recorded.
    """
    return dict.fromkeys((1, 2, 3, 4))
# Best (lowest) zero-based front-page position reached by each post from this
# subreddit, tracked separately for each page_type.
max_rank_per_post = defaultdict(max_rank_record)

for rankings in all_front_pages:
    algorithm_key = rankings['page_type']
    # enumerate() supplies the position within the page listing; the previous
    # hand-rolled counter was reset on every iteration, so every rank was 0.
    for position, post_ranking in enumerate(rankings['post_ranking']):
        if post_ranking['subreddit_id'].replace("t5_","") != subreddit_id:
            continue
        post_id = post_ranking['id']
        rank_record = max_rank_per_post[post_id]
        best_so_far = rank_record[algorithm_key]
        if best_so_far is None or best_so_far > position:
            rank_record[algorithm_key] = position

print("produced rank records for {0} r/feminism posts".format(len(max_rank_per_post)))

produced rank records for 1 r/feminism posts


In [296]:
# this post does not appear in the all_posts list so we omit doing the merge
# for post_id, rank_record in max_rank_per_post.items():
#     if post_id in 

# Output post records to CSV

In [310]:
# Reduce each post to the fields that are exported; every listed key must be
# present on the post, otherwise a KeyError is raised.
included_keys = ['subreddit_id', 'created', 'visible', 'num_comments', 'num_comments_removed', 'permalink', 'title']

all_posts_anonymized = [
    {field: source_post[field] for field in included_keys}
    for source_post in all_posts
]

In [312]:
#all_posts[0]
#Counter([x['num_comments'] for x in all_posts_anonymized])

In [313]:
# Write the per-post aggregates to a CSV named after today's UTC date
date_prefix = datetime.datetime.utcnow().strftime("%Y-%m-%d")
output_path = date_prefix + "-post-aggregation-spring-2018.csv"
posts_frame = pd.DataFrame(all_posts_anonymized)
posts_frame.to_csv(output_path)