In [1]:
import requests
from datetime import datetime
import pandas as pd
from textblob import TextBlob
import time

In [2]:
def get_pushshift_data(df, data_type, **kwargs):
    """
    Fetch one page of results from the pushshift API and add it to ``df``.
    Read more: https://github.com/pushshift/api

    Parameters
    ----------
    df : pandas.DataFrame
        Rows collected so far. It is not modified; a new frame is returned.
    data_type : str
        Endpoint to query, "comment" or "submission". For submissions the
        ``title`` field is used as the text, otherwise the ``body`` field.
    **kwargs
        Query parameters forwarded unchanged to the API (for example ``q``,
        ``after``, ``before``, ``sort_type``, ``size``).

    Returns
    -------
    tuple (pandas.DataFrame, str)
        The frame with the new rows appended (columns ``author``, ``created``,
        ``text``, ``polarity``, ``upvotes``) and the ``created`` value of the
        last item on the page, formatted "%Y-%m-%d %H:%M:%S" in UTC. When the
        page is empty the second element is the ``before`` argument (``None``
        if it was not supplied).

    Raises
    ------
    requests.RequestException
        On a timeout, connection problem or non-2xx HTTP status.
    KeyError
        If the response has no ``data`` entry or an item lacks an expected
        field.
    """
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timezone

    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    payload = kwargs
    # A timeout keeps one stalled connection from hanging the whole crawl, and
    # raise_for_status turns HTTP errors into exceptions the caller can retry.
    response = requests.get(base_url, params=payload, timeout=120)
    response.raise_for_status()
    items = response.json()['data']

    # Cursor to report when the page is empty: the upper bound of the query.
    created = payload.get('before')
    text_field = 'title' if data_type == "submission" else 'body'

    records = []
    for x in items:
        body = x[text_field]
        # `created_utc` is an epoch timestamp; format it in UTC rather than in
        # the machine's local zone, because this string is reused as the
        # `after` cursor for the next request.
        created = datetime.fromtimestamp(x['created_utc'], tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
        records.append({'author': x['author'],
                        'created': created,
                        'text': body,
                        'polarity': TextBlob(body).polarity,
                        'upvotes': x['score']})

    if records:
        # Build the page once and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and was quadratic when called per row.
        new_rows = pd.DataFrame(records)
        column_order = list(dict.fromkeys([*df.columns, *new_rows.columns]))
        non_empty = [frame for frame in (df, new_rows) if not frame.empty]
        df = pd.concat(non_empty, ignore_index=True).reindex(columns=column_order)

    print(f"Retrieved {df.shape[0]} comments, till {created}")

    return df, created

In [3]:
def get_reddit(query, start='2018-07-01', end='2019-06-30', data_type="comment",
               max_retries=3, retry_wait=5 * 60):
    """Collect every pushshift result for a query and summarise it.

    Pages through the API 100 items at a time, using the ``created`` time of
    the last item of each page as the ``after`` cursor for the next request,
    until the cursor reaches ``end``. The collected rows are written to
    ``reddit_queries-{query}-{start}-{end}-{data_type}.csv`` in the current
    working directory.

    Parameters
    ----------
    query : str
        Search term passed to the API as ``q``.
    start, end : str
        Bounds of the search window, passed as ``after`` / ``before``. The
        cursor is compared with ``end`` as a string.
    data_type : str
        "comment" or "submission".
    max_retries : int
        Attempts allowed per page after the first page. Values below 1 are
        treated as 1.
    retry_wait : int or float
        Seconds to sleep between failed attempts.

    Returns
    -------
    dict
        ``n_comments`` (row count), ``n_authors`` (unique authors),
        ``av_polarity`` (mean polarity) and ``n_upvotes`` (sum of absolute
        scores).

    Notes
    -----
    The first request is not retried, so a failure there propagates. If a
    later page still fails after ``max_retries`` attempts, or the cursor stops
    advancing, paging stops with a printed warning and the rows gathered so
    far are saved and summarised; the result is then partial.
    """
    df = pd.DataFrame(columns=["author", "created", "text", "polarity", "upvotes"])
    df, last_time = get_pushshift_data(df,
                       data_type=data_type,     # "comment" or "submission"
                       q=query,
                       after=start,
                       before=end,
                       sort_type="created_utc",
                       size=100)

    attempts = max(1, max_retries)
    while last_time < end:
        previous_time = last_time
        for attempt in range(1, attempts + 1):
            try:
                df, last_time = get_pushshift_data(df,
                       data_type=data_type,
                       q=query,
                       after=last_time,
                       before=end,
                       sort_type="created_utc",
                       size=100)
                # stop tries after the first successful try
                break
            except Exception as exc:
                if attempt < attempts:
                    print(f"Pausing for {retry_wait / 60:g} mins: {exc}")
                    time.sleep(retry_wait)
                else:
                    print(f"Attempt {attempt} failed: {exc}")
        else:
            # Every attempt failed. Without this exit the outer loop would
            # restart the retries forever on a persistent API failure.
            print(f"Giving up after {attempts} failed attempts; "
                  f"keeping the {df.shape[0]} rows retrieved up to {last_time}")
            break

        if last_time <= previous_time:
            # The same `after` value would repeat the same request endlessly.
            print(f"Cursor did not advance past {previous_time}; stopping early")
            break

    # save to file
    df.to_csv(f"reddit_queries-{query}-{start}-{end}-{data_type}.csv")

    n_comments = df.shape[0]
    n_authors = df["author"].nunique()
    av_polarity = df['polarity'].mean()
    n_upvotes = df['upvotes'].abs().sum()

    return {"n_comments": n_comments, "n_authors": n_authors, "av_polarity": av_polarity, "n_upvotes": n_upvotes}

In [4]:
# Submissions matching "googl stock" from 2016-01-01 up to 2021-05-01.
result = get_reddit(
    query="googl stock",
    start="2016-01-01",
    end="2021-05-01",
    data_type="submission",
)
result

Retrieved 100 comments, till 2017-02-09 14:36:47
Retrieved 200 comments, till 2017-10-27 04:52:30
Retrieved 300 comments, till 2018-03-14 03:03:57
Retrieved 400 comments, till 2018-08-01 16:04:40
Retrieved 499 comments, till 2018-11-28 16:34:09
Retrieved 599 comments, till 2019-05-09 13:46:31
Retrieved 699 comments, till 2019-12-11 05:17:11
Retrieved 799 comments, till 2020-02-05 16:29:47
Retrieved 899 comments, till 2020-03-24 02:20:09
Retrieved 999 comments, till 2020-04-28 01:46:03
Retrieved 1099 comments, till 2020-06-18 02:14:11
Retrieved 1199 comments, till 2020-08-05 02:52:53
Retrieved 1299 comments, till 2020-09-24 05:31:58
Retrieved 1399 comments, till 2020-11-30 22:10:31
Retrieved 1499 comments, till 2021-02-01 03:37:50
Retrieved 1599 comments, till 2021-04-21 22:45:59
Retrieved 1647 comments, till 2021-04-30 15:46:30
Retrieved 1647 comments, till 2021-05-01


{'n_comments': 1647,
 'n_authors': 564,
 'av_polarity': 0.05501720454684176,
 'n_upvotes': 21796}

In [5]:
# Submissions matching "TSLA stock" from 2015-01-01 up to 2021-05-01.
result = get_reddit(
    query="TSLA stock",
    start="2015-01-01",
    end="2021-05-01",
    data_type="submission",
)
result

Retrieved 100 comments, till 2015-05-20 10:52:08
Retrieved 200 comments, till 2015-10-14 16:38:22
Retrieved 300 comments, till 2016-03-21 05:48:47
Retrieved 400 comments, till 2016-06-28 01:51:26
Retrieved 500 comments, till 2016-10-28 00:20:24
Retrieved 600 comments, till 2017-01-19 14:16:00
Retrieved 700 comments, till 2017-04-03 15:38:52
Retrieved 800 comments, till 2017-06-13 13:40:04
Retrieved 900 comments, till 2017-08-31 04:15:16
Retrieved 1000 comments, till 2017-11-09 20:59:14
Retrieved 1100 comments, till 2018-02-07 21:23:33
Retrieved 1200 comments, till 2018-03-30 01:00:10
Retrieved 1300 comments, till 2018-04-25 04:44:29
Retrieved 1400 comments, till 2018-06-18 23:41:32
Retrieved 1500 comments, till 2018-08-02 08:12:53
Retrieved 1590 comments, till 2018-08-11 22:26:55
Retrieved 1689 comments, till 2018-09-11 14:29:18
Retrieved 1789 comments, till 2018-10-11 18:26:37
Retrieved 1889 comments, till 2018-11-19 13:16:24
Retrieved 1989 comments, till 2019-01-18 14:39:20
Retrieved

{'n_comments': 11432,
 'n_authors': 5725,
 'av_polarity': 0.051638419956355125,
 'n_upvotes': 238528}

In [6]:
# Submissions matching "AMZN stock" from 2015-01-01 up to 2021-05-01.
result = get_reddit(
    query="AMZN stock",
    start="2015-01-01",
    end="2021-05-01",
    data_type="submission",
)
result

Retrieved 100 comments, till 2015-11-25 21:44:01
Retrieved 200 comments, till 2016-07-18 22:26:51
Retrieved 300 comments, till 2016-12-21 10:56:12
Retrieved 400 comments, till 2017-03-31 18:49:40
Retrieved 500 comments, till 2017-07-12 19:28:26
Retrieved 600 comments, till 2017-09-29 12:45:18
Retrieved 700 comments, till 2017-11-28 22:02:39
Retrieved 800 comments, till 2018-01-25 03:15:46
Retrieved 900 comments, till 2018-03-04 08:16:43
Retrieved 1000 comments, till 2018-03-31 18:53:09
Retrieved 1100 comments, till 2018-05-05 02:32:50
Retrieved 1200 comments, till 2018-07-16 23:40:57
Retrieved 1298 comments, till 2018-08-27 22:35:28
Retrieved 1398 comments, till 2018-10-03 14:57:16
Retrieved 1498 comments, till 2018-10-30 18:47:01
Retrieved 1598 comments, till 2018-12-26 10:11:30
Retrieved 1698 comments, till 2019-02-10 23:11:54
Retrieved 1798 comments, till 2019-04-04 19:28:55
Retrieved 1898 comments, till 2019-05-17 03:20:33
Retrieved 1998 comments, till 2019-08-01 02:42:53
Retrieved

{'n_comments': 4889,
 'n_authors': 2366,
 'av_polarity': 0.06318829986316626,
 'n_upvotes': 66966}

In [9]:
# Submissions matching "AAPL stock" from 2016-01-01 up to 2021-05-01.
result = get_reddit(
    query="AAPL stock",
    start="2016-01-01",
    end="2021-05-01",
    data_type="submission",
)
result

Retrieved 100 comments, till 2016-04-05 02:12:20
Retrieved 200 comments, till 2016-07-02 13:21:35
Retrieved 300 comments, till 2016-09-12 06:47:00
Retrieved 400 comments, till 2016-11-20 21:59:05
Retrieved 500 comments, till 2017-02-01 15:26:45
Retrieved 600 comments, till 2017-04-08 16:25:34
Retrieved 700 comments, till 2017-06-08 22:03:05
Retrieved 800 comments, till 2017-08-22 15:35:26
Retrieved 900 comments, till 2017-10-17 14:42:45
Retrieved 1000 comments, till 2017-12-20 14:36:37
Retrieved 1100 comments, till 2018-01-31 14:54:46
Retrieved 1200 comments, till 2018-03-06 22:35:15
Retrieved 1300 comments, till 2018-04-23 15:44:27
Retrieved 1400 comments, till 2018-06-10 16:57:02
Retrieved 1500 comments, till 2018-08-09 12:00:19
Retrieved 1593 comments, till 2018-09-14 15:06:04
Retrieved 1693 comments, till 2018-10-23 10:01:07
Retrieved 1793 comments, till 2018-11-19 20:05:52
Retrieved 1893 comments, till 2018-12-21 18:06:03
Retrieved 1993 comments, till 2019-01-27 15:39:04
Retrieved

{'n_comments': 7736,
 'n_authors': 3515,
 'av_polarity': 0.058108098740032636,
 'n_upvotes': 144657}

In [10]:
# Submissions matching "TSLA stock" over the narrower 2016-01-01 to 2021-05-01 window.
result = get_reddit(
    query="TSLA stock",
    start="2016-01-01",
    end="2021-05-01",
    data_type="submission",
)
result

Retrieved 100 comments, till 2016-04-14 16:59:17
Retrieved 200 comments, till 2016-08-20 20:35:17
Retrieved 300 comments, till 2016-12-01 16:09:04
Retrieved 400 comments, till 2017-02-21 14:25:16
Retrieved 500 comments, till 2017-05-01 11:26:21
Retrieved 600 comments, till 2017-07-14 21:12:16
Retrieved 700 comments, till 2017-10-05 06:51:46
Retrieved 800 comments, till 2017-12-31 00:55:29
Retrieved 900 comments, till 2018-03-03 00:07:34
Retrieved 1000 comments, till 2018-04-05 23:09:38
Retrieved 1100 comments, till 2018-05-14 22:46:04
Retrieved 1200 comments, till 2018-07-04 02:28:56
Retrieved 1300 comments, till 2018-08-07 21:47:03
Retrieved 1389 comments, till 2018-08-22 22:48:05
Retrieved 1489 comments, till 2018-09-29 21:25:04
Retrieved 1589 comments, till 2018-10-24 23:29:03
Retrieved 1689 comments, till 2018-12-14 14:30:46
Retrieved 1789 comments, till 2019-01-31 12:27:47
Retrieved 1889 comments, till 2019-03-15 12:54:06
Retrieved 1989 comments, till 2019-04-22 14:26:54
Retrieved

{'n_comments': 11175,
 'n_authors': 5593,
 'av_polarity': 0.05232376721870658,
 'n_upvotes': 233744}