-
Notifications
You must be signed in to change notification settings - Fork 6
/
moderator_scrape.py
99 lines (91 loc) · 3.93 KB
/
moderator_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from __future__ import print_function
from six import iteritems
from praw_object_data import retry_if_broken_connection, get_user_data
import pytz
import datetime
import writer
def scrape_moderators(opts, scraper):
    """Scrape the moderator list of every targeted subreddit.

    The targets come from ``get_target_subreddits``; when
    ``opts.use_subreddit_table_for_moderators`` is set, the subreddits
    returned by ``get_subreddit_table_subreddits`` are appended as well.

    Args:
        opts: options object. ``use_subreddit_table_for_moderators`` is read,
            and ``master_moderator_set`` is reset to an empty set before
            scraping starts.
        scraper: passed through unchanged to
            ``scrape_subreddit_for_moderators``.
    """
    # Work on a private copy so that extending it below can never alter a
    # list that is still owned by ``opts`` (e.g. ``opts.subreddits``).
    target_subreddits = list(get_target_subreddits(opts))
    if opts.use_subreddit_table_for_moderators:
        target_subreddits.extend(get_subreddit_table_subreddits(opts))
    print('getting moderators from %s total subreddits' % len(target_subreddits))

    opts.master_moderator_set = set()
    for subreddit in target_subreddits:
        scrape_subreddit_for_moderators(subreddit, opts, scraper)
def get_subreddit_table_subreddits(opts):
    """Return the distinct subreddit names stored in the ``subreddits`` table.

    Args:
        opts: options object whose ``db`` attribute provides ``schema``,
            ``execute`` and ``fetchall``.

    Returns:
        A list holding the first column of every fetched row.
    """
    database = opts.db
    query = """SELECT DISTINCT subreddit FROM %s.subreddits""" % database.schema
    database.execute(query)

    names = []
    for row in database.fetchall():
        names.append(row[0])
    return names
def get_target_subreddits(opts):
    """Return the subreddit names whose moderators should be scraped.

    Two flags on ``opts`` select the source of the names:

    * ``moderators_all`` false: the names come from ``opts.subreddits``.
    * ``moderators_all`` true: the names come from the ``threads`` and
      ``comments`` tables (the subquery orders them by row count,
      descending).

    Unless ``repeat_moderator_subreddits`` is set, subreddits that already
    appear in the ``moderators`` table are excluded by the query.

    Args:
        opts: options object providing ``db`` (with ``schema``, ``execute``
            and ``fetchall``), ``subreddits``, ``moderators_all`` and
            ``repeat_moderator_subreddits``.

    Returns:
        A new list of subreddit names; never ``opts.subreddits`` itself.
    """
    db = opts.db
    print(opts.subreddits)

    if not opts.moderators_all and opts.repeat_moderator_subreddits:
        # No filtering requested. Hand back a copy so that a caller that
        # extends the result does not modify opts.subreddits in place.
        return list(opts.subreddits)

    if not opts.moderators_all:
        # Keep only the requested subreddits not yet in the moderators table.
        db.execute("""SELECT subreddit FROM
            (SELECT unnest(%%s) AS subreddit) t1
            WHERE subreddit NOT IN (SELECT DISTINCT subreddit FROM %s.moderators)
            GROUP BY subreddit
            """ % (db.schema,), [opts.subreddits])
    elif opts.repeat_moderator_subreddits:
        # The derived table needs an alias (``t2``): PostgreSQL releases
        # before 16 reject a FROM-subquery without one.
        db.execute("""SELECT subreddit FROM
            (SELECT subreddit, count(*) FROM
                (SELECT subreddit FROM %s.threads
                 UNION ALL
                 SELECT subreddit FROM %s.comments) t1
             GROUP BY subreddit
             ORDER BY count DESC
            ) t2""" % (db.schema, db.schema))
    else:
        db.execute("""SELECT subreddit FROM
            (SELECT subreddit, count(*) FROM
                (SELECT subreddit FROM %s.threads
                 UNION ALL
                 SELECT subreddit FROM %s.comments) t1
             WHERE subreddit NOT IN (SELECT DISTINCT subreddit FROM %s.moderators)
             GROUP BY subreddit
             ORDER BY count DESC
            ) t2""" % (db.schema, db.schema, db.schema))

    return [row[0] for row in db.fetchall()]
@retry_if_broken_connection
def scrape_subreddit_for_moderators(subreddit, opts, scraper):
    """Store the moderators of one subreddit and optionally scrape each one.

    One row per moderator is inserted into the ``moderators`` table with the
    subreddit's display name, the moderator's name, a timestamp taken once
    before the loop, and the moderator's position in the returned list.
    When ``opts.scrape_moderators`` is set, each moderator not already in
    ``opts.master_moderator_set`` is also fetched with ``get_user_data`` and
    its user, comment and thread data are handed to ``writer``.

    Args:
        subreddit: subreddit name passed to ``scraper.subreddit``.
        opts: options object providing ``db``, ``verbose``,
            ``scrape_moderators`` and ``master_moderator_set``.
        scraper: object whose ``subreddit(name)`` result exposes
            ``moderator()`` and ``display_name``.

    Returns:
        ``False`` if listing the moderators raised ``Forbidden``, otherwise
        ``True``.
    """
    # The module never imports prawcore, so the except clause below used to
    # fail with NameError whenever moderator() raised. Import it locally.
    from prawcore.exceptions import Forbidden

    print(subreddit)
    sub = scraper.subreddit(subreddit)
    try:
        mods = list(sub.moderator())
    except Forbidden:
        print('Cannot read moderators of subreddit (probably private)')
        return False

    # in case user missed capitalization
    subreddit_proper_name = sub.display_name
    print('+-----------------------------+')
    print('getting moderator data for /r/%s' % subreddit_proper_name)

    # NOTE(review): naive local time; pytz is imported by the module but
    # unused here -- confirm whether a timezone-aware value was intended.
    current_time = datetime.datetime.now()
    insert_sql = """INSERT INTO %s.moderators(subreddit, username, timestamp, pos)
        VALUES (%%s, %%s, %%s, %%s);""" % opts.db.schema

    for position, mod in enumerate(mods):
        username = str(mod)
        if opts.verbose:
            print('getting data for /u/%s' % username)
        opts.db.execute(
            insert_sql,
            (subreddit_proper_name, username, current_time, position),
        )

        if not opts.scrape_moderators:
            continue
        if opts.verbose:
            print('Scraping moderators of /r/%s' % subreddit_proper_name)
        # A moderator already scraped for another subreddit is not re-fetched.
        if username in opts.master_moderator_set:
            continue

        data = get_user_data(mod, opts, 'minimal')
        writer.write_user(data['userdata'], opts)
        for comment in data['commentdata'].values():
            writer.write_comment(comment, opts)
        for thread in data['threaddata'].values():
            writer.write_thread(thread, opts)
        opts.master_moderator_set.add(username)

    # NOTE(review): the source's indentation was lost, so whether this commit
    # ran per moderator or once per subreddit is a reconstruction -- confirm.
    opts.db.commit()
    print('got moderators for /r/%s' % subreddit_proper_name)
    return True