/
redditgraph.py
116 lines (80 loc) · 3.35 KB
/
redditgraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import time
import statistics
class RedditFunction:
    """Helpers for querying the Pushshift Reddit API and charting post popularity."""

    def getPushshiftData(self, results, after, before, q, size, genType):
        """Fetch Reddit submissions matching *q* from Pushshift, save a
        popularity-over-time chart as a PNG, and return post links.

        Parameters:
            results: max number of submissions to request from the API.
            after, before: epoch-second bounds of the search window.
            q: search query string (spaces are sent as '+').
            size: max number of full links to return.
            genType: 'suggested' or 'query' — selects the PNG output path.

        Returns:
            list[str]: up to *size* 'full_link' URLs, in API order
            (descending score, per the request's sort parameters).
        """
        query = q.replace(" ", "+")
        # Only posts with score > 2000, newest-first by score, SFW only.
        args = ('&sort=desc&sort_type=score&over_18=false&score=>2000&size='
                + str(results) + '&after=' + str(after) + '&before='
                + str(before) + '&q=' + str(query))
        url = 'https://api.pushshift.io/reddit/search/submission/?' + str(args)
        r = requests.get(url)
        payload = json.loads(r.text)
        # Error responses may omit 'data'; treat that as "no results".
        submissions = payload.get('data', [])

        fulllinks = [post['full_link'] for post in submissions]

        # Group every post's score by its UTC year-month. (The previous
        # implementation keyed an intermediate dict by day, so posts from the
        # same day overwrote each other and were lost from the monthly mean;
        # it also shadowed the imported `time` module with a loop variable.)
        monthly_scores = {}
        for post in submissions:
            month = datetime.utcfromtimestamp(post['created_utc']).strftime('%Y-%m')
            monthly_scores.setdefault(month, []).append(post['score'])

        # Mean score per month, keyed by real datetimes in chronological order.
        final_data = {
            datetime.strptime(month, '%Y-%m'): statistics.mean(scores)
            for month, scores in sorted(monthly_scores.items())
        }

        plt.plot(list(final_data.keys()), list(final_data.values()),
                 color='crimson', label=q)
        plt.ylabel('popularity')
        plt.xlabel('time')
        plt.yscale('log')
        plt.xticks(rotation=60)
        plt.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1)
        filename = q.replace(' ', '_')
        if genType == 'suggested':
            plt.savefig('static/images/suggested/reddit_' + str(filename) + '.png',
                        bbox_inches='tight')
        elif genType == 'query':
            plt.savefig('static/images/query/reddit.png', bbox_inches='tight')
        print('done in reddit thread')
        return fulllinks[:size]
#tic = time.perf_counter()
#test = RedditFunction.getPushshiftData(0, 100 ,1526428800,1589587200, 'gravity', 10, 'query')
#toc = time.perf_counter()
#print(f"did the thing in {toc - tic:0.4f} seconds")
# print(test)