In [None]:
# Load modules
import os
import pandas as pd
import requests
import json
import csv
import time
from dateutil.relativedelta import relativedelta
import datetime as dt
from psaw import PushshiftAPI # https://github.com/dmarx/psaw
from glob import glob


# Submissions

In [None]:
# Read the processed submissions CSV into a DataFrame
submissions_csv = "data/processed/submissions/TheRedPill-submissions-2020-06-01.csv"
df_subs = pd.read_csv(submissions_csv)


In [None]:
# Clean up: drop the post text and the old index column, then derive a calendar
# date from the epoch-seconds timestamp.
# errors='ignore' keeps this cell re-runnable: df_subs is reassigned here, so on a
# second run the two columns are already gone and a plain drop would raise KeyError.
df_subs = df_subs.drop(columns=['Unnamed: 0', 'selftext'], errors='ignore')
df_subs['created_date'] = pd.to_datetime(df_subs['created_utc'], unit="s").dt.date

## Quick stats

In [None]:
# Headline numbers: how many posts there are and the date range they span
created_dates = df_subs['created_date']
num_posts = df_subs.shape[0]
earliest_post, latest_post = min(created_dates), max(created_dates)

In [None]:
# Report the post count and the first/last post dates
print(f"# of posts: {num_posts!s}")
print(f"earliest post: {earliest_post!s}")
print(f"latest post collected: {latest_post!s}")

## Authors

In [None]:
# Looking at authors: the 25 most prolific posters and the number of distinct authors.
# Only the final expression of a cell is echoed, so the top-25 table is printed
# explicitly; a bare `top25_authors` in the middle of the cell shows nothing.
top25_authors = df_subs.author.value_counts().nlargest(25)
print(top25_authors)

unique_authors = set(df_subs.author)
len(unique_authors)





In [None]:
# Get cumsum to see who's contributing.
# One row per author, ordered from fewest to most posts. Naming the index and the
# value column explicitly keeps the layout (author_name, author) identical on every
# pandas version: from pandas 2.0 on, value_counts() names its result "count"
# instead of reusing the source column name, so a plain DataFrame(...) conversion
# would not produce the 'author' column that the lines below read.
df_authors = (
    df_subs.author.value_counts()
    .sort_values()
    .rename_axis('author_name')
    .reset_index(name='author')
)
df_authors['cum_sum'] = df_authors['author'].cumsum()
df_authors['cum_perc'] = df_authors['cum_sum'] / df_authors['author'].sum()

# Deleted prevalence: share of all posts whose author is "[deleted]".
deleted_count = df_authors.loc[df_authors['author_name'] == "[deleted]", 'author'].to_numpy()
# Guard the lookup: with no "[deleted]" author the array is empty and [0] would fail.
# NOTE: the value is a fraction rounded to 2 decimals, so a percentage derived from
# it has whole-percent resolution.
deleted_perc = round(deleted_count[0] / len(df_subs), 2) if deleted_count.size else 0.0




In [None]:
# Authors whose cumulative share of posts lies above the 50% mark
# (df_authors is ordered by ascending post count, so these are the heaviest posters)
is_upper_half = df_authors['cum_perc'] > 0.5
top_contributors = df_authors[is_upper_half]


In [None]:
# Sizes of the author groups above the 50% / 60% cumulative marks, plus the
# share of posts attributed to "[deleted]" expressed as a percentage
n_top_half = len(df_authors[df_authors['cum_perc'] > 0.5])
n_top_forty = len(df_authors[df_authors['cum_perc'] > 0.60])
deleted_share_pct = round(deleted_perc*100, 2)

print(f"this many users make up 50% of posts: {n_top_half!s}")
print(f"this many users make up 40% of posts: {n_top_forty!s}")
print(f"deleted posts are this % of all posts: {deleted_share_pct!s}")




## Coverage

In [None]:
import matplotlib.pyplot as plt

# Plot submissions per day.
# An explicit figure/axes pair guarantees that the file written below is exactly
# the chart drawn here, not whatever figure pyplot happens to consider current.
fig, ax = plt.subplots()
plot_submissions = df_subs.groupby('created_date').id.count().plot(ax=ax)
# Label the chart so the saved image can be read on its own.
ax.set(title="TheRedPill submissions per day", xlabel="date", ylabel="# of posts")
fig.savefig('trp_posts.png', bbox_inches='tight')

In [None]:
# Explore to see what happened at the end of 2018:
# daily post counts as a flat two-column table (date, count)
grouped_by_date = (
    df_subs.groupby('created_date').id.count()
    .rename_axis('date')
    .reset_index(name='count')
)

In [None]:
# See if there are dates where there were no posts
from datetime import date, timedelta

d = grouped_by_date['date'].tolist()
first_day, last_day = min(d), max(d)
# Every calendar day from the first to the last post; the +1 makes the range
# include the final day as well.
date_set = set(first_day + timedelta(days=x) for x in range((last_day - first_day).days + 1))
# Expected days that never occur in the data. Subtracting in the other direction
# (observed minus expected) can only return days outside the calendar range, so it
# would never reveal a gap.
missing_dates = date_set.difference(d)
sorted(missing_dates)

In [None]:
# Get dates where count = 1
only_one_post = grouped_by_date['count'] == 1
grouped_by_date.loc[only_one_post, 'date']

In [None]:
# Dates with exactly one submission (same selection as the cell above)
grouped_by_date.loc[grouped_by_date['count'].eq(1), 'date']

In [2]:
import requests

url = "https://api.pushshift.io/reddit/search/submission"

# Ask Pushshift for the per-subreddit aggregation of TheRedPill submissions between
# 2018-09-29 and 2019-02-01 (UTC) -- the window eyeballed from the graph.
# size=0 presumably suppresses the individual posts so only "aggs" matters -- TODO confirm.
response = requests.get(
    url,
    params={
        "subreddit": "TheRedPill",
        "size": 0,
        "aggs": "subreddit",
        "before": 1548979200,  # 2019-02-01 00:00:00 UTC
        "after": 1538179200,   # 2018-09-29 00:00:00 UTC
    },
    timeout=30,  # requests has no default timeout and could otherwise wait forever
)
# Fail loudly on an HTTP error instead of a confusing JSON/KeyError further down.
response.raise_for_status()
test = response.json()["aggs"]
# 62 posts from Sept 29 2018 to Feb 2019 -- so something with reddit was happening here
# is there a way to look manually?


In [7]:
# Submission count reported by the first bucket of the subreddit aggregation
subreddit_buckets = test["subreddit"]
subreddit_buckets[0]["doc_count"]

62

# Comments

In [None]:
# Read the processed comments CSV into a DataFrame
comments_csv = "data/processed/comments/TheRedPill-comments-2020-06-01.csv"
df_comments = pd.read_csv(comments_csv)


In [None]:
# Clean up: drop the old index column and derive a calendar date from the
# epoch-seconds timestamp.
# errors='ignore' keeps this cell re-runnable: df_comments is reassigned here, so on
# a second run the column is already gone and a plain drop would raise KeyError.
df_comments = df_comments.drop(columns=['Unnamed: 0'], errors='ignore')
df_comments['created_date'] = pd.to_datetime(df_comments['created_utc'], unit="s").dt.date

In [None]:
# View the comments in chronological order (display only; df_comments is not reassigned)
df_comments.sort_values(by='created_date')


In [None]:
# Headline numbers: how many comments there are and the date range they span
comment_dates = df_comments['created_date']
num_comments = df_comments.shape[0]
earliest_comments, latest_comment = min(comment_dates), max(comment_dates)

In [None]:
# Report the comment count and the first/last comment dates
print(f"# of comments: {num_comments!s}")
print(f"earliest comment: {earliest_comments!s}")
print(f"latest comment collected: {latest_comment!s}")

## Authors

In [None]:
# Looking at authors: the 25 most prolific commenters and the number of distinct authors.
# Only the final expression of a cell is echoed, so the top-25 table is printed
# explicitly; a bare `top25_authors` in the middle of the cell shows nothing.
top25_authors = df_comments.author.value_counts().nlargest(25)
print(top25_authors)

unique_authors = set(df_comments.author)
len(unique_authors)

In [206]:
# Get cumsum to see who's contributing
comment_counts = df_comments.author.value_counts().sort_values()

# Turn the author index into a regular 'author_name' column
df_authors = pd.DataFrame(comment_counts).rename_axis('author_name').reset_index()

# Call the tally column 'count', order the rows by it, then accumulate
df_authors = df_authors.rename(columns={"author": "count"}).sort_values(['count'])
df_authors['cum_sum'] = df_authors['count'].cumsum()
df_authors['cum_perc'] = df_authors['cum_sum'] / df_authors['count'].sum()

# Get count of deleted comments and express it as a percentage of all comments
is_deleted = df_authors['author_name'] == "[deleted]"
deleted_comments = df_authors.loc[is_deleted, 'count'].to_numpy()[0]
perc_deleted = (deleted_comments / num_comments) * 100

In [1]:
# Distribution of comments per author. A histogram needs a numeric column, so this
# bins the per-author 'count' rather than 'author_name' (one distinct string per row).
# Log-scaled y axis: counts run from 1 up to the tens of thousands, so the tail
# would otherwise be invisible.
df_authors['count'].hist(bins=50, log=True);

NameError: name 'df_authors' is not defined

In [205]:
# Inspect the per-author table: comment count, running total and running share
df_authors

Unnamed: 0,author_name,count,cum_sum,cum_perc
0,randallrandyrand,1,1,3.303038e-07
26439,monkeytoes77,1,2,6.606076e-07
26440,650nano,1,3,9.909114e-07
26441,bernedindigo,1,4,1.321215e-06
26442,suchacrisis,1,5,1.651519e-06
...,...,...,...,...
115481,MattyAnon,9688,2382369,7.869055e-01
115482,vengefully_yours,10262,2392631,7.902951e-01
115483,NeoreactionSafe,12479,2405110,7.944169e-01
115484,AutoModerator,15911,2421021,7.996724e-01


In [None]:
# Top 50% of commenters: rows whose cumulative share of comments is above 0.5
above_half = df_authors['cum_perc'] > 0.5
top_contributors = df_authors[above_half]
top_contributors.shape[0]

In [None]:
# Only the final expression of a cell is echoed, so both figures are printed
# explicitly; as two bare expressions the 40% figure would never be shown.

# Top 40% of commenters
print("authors above the 60% cumulative mark (top 40% of comments):",
      len(df_authors[df_authors['cum_perc'] > 0.6]))

# Top 30% of commenters
print("authors above the 70% cumulative mark (top 30% of comments):",
      len(df_authors[df_authors['cum_perc'] > 0.7]))

## Coverage

In [None]:
# From visual analysis of the plot, there seems to be a
# hole in early 2019, though it doesn't seem to be the same hole seen in submissions