In [1]:
import datetime as dt
import csv
import pandas as pd

from psaw import PushshiftAPI

In [2]:
# initialise API
api = PushshiftAPI()

## Load Comments via Pushshift API
Pushshift BigQuery does not provide data for after December 2019, so we query comments for Jan and Feb 2020 via the regular API

In [None]:
%%time

# initialise empty df for writing into
sample_df = pd.DataFrame(columns = ['created_utc', 'body', 'subreddit'])


for year, month in [(2020, 2)]:
    print(year, month)

    start_time_epoch = int(dt.datetime(year = year, month = month, day = 1, hour = 0, minute = 0 ).timestamp())
    end_time_epoch = int(dt.datetime(year = year, month = month+1, day = 1, hour = 0, minute = 0 ).timestamp())

    gen = api.search_comments(before = end_time_epoch, after = start_time_epoch,
                              subreddit = ['the_donald', 'libertarian', 'conservative', 'politics', 'chapotraphouse'],
                              filter = ['body', 'subreddit'])
    print(' queried results')
    
    # write result to df
    results_df = pd.DataFrame({'created_utc': row.created_utc, 'body': row.body, 'subreddit': row.subreddit} for row in gen)
    print(' wrote to df')

    # export df to csv
    results_df.to_csv(f'../../0_data/raw/labelled_reddit/politics_{year}_0{month}.csv', index=False)
    print(' exported to csv')
    print()                              

2020 2
 queried results




## Number of Posts per Month by Subreddit

In [7]:
%%time

# number of posts per month by subreddit

post_counts = {}

for subreddit in ['politics', 'chapotraphouse', 'the_donald', 'libertarian', 'conservative']:
    
    post_counts[subreddit] = {}
    
    print(subreddit.upper())
    
    for year in [2017, 2018, 2019]:
        
        for month in range(1,13):
            
            start_time_epoch = int(dt.datetime(year = year, month = month, day = 1).timestamp())
            if month < 12:
                end_time_epoch = int(dt.datetime(year = year, month = month + 1, day = 1).timestamp())
            else:
                end_time_epoch = int(dt.datetime(year = year, month = month, day = 31, hour=23, minute=59).timestamp())

            gen = api.search_comments(before = end_time_epoch, after = start_time_epoch, subreddit = subreddit)
            thing = next(gen)
            post_counts[subreddit][f'{year}-{month}'] = api.metadata_['total_results']
            print(f'{year}-{month}\t', api.metadata_['total_results'])
                                         
    print()

POLITICS
2017-1	 2229332
2017-2	 1970443
2017-3	 1819303
2017-4	 1412045
2017-5	 2028607
2017-6	 1818615
2017-7	 1710163
2017-8	 1824377
2017-9	 1340072
2017-10	 1588002
2017-11	 1617578
2017-12	 1717618
2018-1	 1832810
2018-2	 1771915
2018-3	 1977332
2018-4	 1741231
2018-5	 1721785
2018-6	 2032413
2018-7	 2046990
2018-8	 1978164
2018-9	 2244002
2018-10	 2211803
2018-11	 2241899
2018-12	 1856188
2019-1	 2669884
2019-2	 2223564
2019-3	 2089752
2019-4	 1756430
2019-5	 1687417
2019-6	 1722016
2019-7	 2094665
2019-8	 1960885
2019-9	 1879030
2019-10	 2345308
2019-11	 2161344
2019-12	 2132256

CHAPOTRAPHOUSE
2017-1	 3768
2017-2	 7507
2017-3	 13441
2017-4	 24808
2017-5	 42151
2017-6	 56487
2017-7	 82618
2017-8	 112315
2017-9	 113693
2017-10	 131249
2017-11	 120224
2017-12	 117371
2018-1	 135049
2018-2	 136442
2018-3	 160848
2018-4	 170189
2018-5	 183683
2018-6	 187931
2018-7	 228797
2018-8	 237250
2018-9	 262281
2018-10	 309807
2018-11	 315664
2018-12	 344926
2019-1	 339598
2019-2	 347795
201

In [8]:
pd.DataFrame.from_dict(post_counts)

Unnamed: 0,politics,chapotraphouse,the_donald,libertarian,conservative
2017-1,2229332,3768,1584878,59082,55278
2017-2,1970443,7507,1527566,61326,54286
2017-3,1819303,13441,1360477,54932,49076
2017-4,1412045,24808,1184622,72786,38604
2017-5,2028607,42151,1221550,82550,59817
2017-6,1818615,56487,1159259,83073,47396
2017-7,1710163,82618,1042985,89055,60373
2017-8,1824377,112315,1057270,94053,66218
2017-9,1340072,113693,870331,99360,53848
2017-10,1588002,131249,1061931,105267,57797
