In [21]:
import datetime as dt
import csv
import pandas as pd

from psaw import PushshiftAPI

In [22]:
# initialise API
api = PushshiftAPI()

## Load Comments via Pushshift API
Pushshift BigQuery does not provide data for after December 2019, so we query comments for Jan and Feb 2020 via the regular API

In [None]:
%%time

# initialise empty df for writing into
sample_df = pd.DataFrame(columns = ['created_utc', 'body', 'subreddit'])


for year, month in [(2020, 1), (2020, 2)]:
    print(year, month)

    start_time_epoch = int(dt.datetime(year = year, month = month, day = 1, hour = 0, minute = 0 ).timestamp())
    end_time_epoch = int(dt.datetime(year = year, month = month+1, day = 1, hour = 0, minute = 0 ).timestamp())

    gen = api.search_comments(before = end_time_epoch, after = start_time_epoch, subreddit = ['news', 'worldnews'], filter = ['body', 'subreddit'])
    print(' queried results')
    
    # write result to df
    results_df = pd.DataFrame({'created_utc': row.created_utc, 'body': row.body, 'subreddit': row.subreddit} for row in gen)
    print(' wrote to df')

    # export df to csv
    results_df.to_csv(f'../../0_data/raw/unlabelled_reddit/news_{year}_0{month}.csv', index=False)
    print(' exported to csv')
    print()                              

2020 1
 queried results


## Number of Posts per Month by Subreddit

In [3]:
%%time

# number of posts per month by subreddit

for subreddit in ['news', 'worldnews']:
    
    print(subreddit.upper())
    
    for year in [2017, 2018, 2019, 2020]:
        
        for month in range(1,13):
            
            start_time_epoch = int(dt.datetime(year = year, month = month, day = 1).timestamp())
            if month < 12:
                end_time_epoch = int(dt.datetime(year = year, month = month + 1, day = 1).timestamp())
            else:
                end_time_epoch = int(dt.datetime(year = year, month = month, day = 31, hour=23, minute=59).timestamp())

            gen = api.search_comments(before = end_time_epoch, after = start_time_epoch, subreddit = subreddit)
            thing = next(gen)
            print(f'{year}-{month}\t', api.metadata_['total_results'])
                                         
    print()

NEWS
2017-1	 945979
2017-2	 796637
2017-3	 658227
2017-4	 723085
2017-5	 738083
2017-6	 754922
2017-7	 647046
2017-8	 860177
2017-9	 760055
2017-10	 915381
2017-11	 817122
2017-12	 775292
2018-1	 786127
2018-2	 883865
2018-3	 870529
2018-4	 755070
2018-5	 794167
2018-6	 811301
2018-7	 802475
2018-8	 810093
2018-9	 618974
2018-10	 759539
2018-11	 730160
2018-12	 646816
2019-1	 745473
2019-2	 620229
2019-3	 772925
2019-4	 600211
2019-5	 606135
2019-6	 627461
2019-7	 687474
2019-8	 915870
2019-9	 690937
2019-10	 717822
2019-11	 588362
2019-12	 624543
2020-1	 730637
2020-2	 637406
2020-3	 743858
2020-4	 721977
2020-5	 1019562
2020-6	 1518271
2020-7	 1076804
2020-8	 1124469
2020-9	 895991
2020-10	 940018
2020-11	 760590
2020-12	 718765

WORLDNEWS
2017-1	 1018827
2017-2	 836394
2017-3	 818657
2017-4	 868556
2017-5	 988239
2017-6	 910001
2017-7	 742829
2017-8	 741297
2017-9	 742880
2017-10	 762009
2017-11	 790119
2017-12	 750653
2018-1	 817626
2018-2	 632033
2018-3	 857636
2018-4	 860770
2018