In [1]:
import bz2
import csv
import json
import os
import pickle
import concurrent.futures
import urllib.request

In [2]:
# Read the archive names from the 'Filename' column of filenames.csv.
# newline='' is what the csv module asks for, so that it can interpret line
# endings (including newlines inside quoted fields) itself.
with open('filenames.csv', 'r', newline='') as csv_file:
    filenames = [row['Filename'] for row in csv.DictReader(csv_file)]

# Restore the set of archive names recorded in visited.pickle by an earlier
# run; start with an empty set when the pickle does not exist yet.
# NOTE(review): pickle.load can execute arbitrary code - this is only safe as
# long as visited.pickle is a file this notebook wrote itself.
if os.path.isfile('visited.pickle'):
    with open('visited.pickle', 'rb') as pfile:
        visited = pickle.load(pfile)
else:
    visited = set()

In [4]:
# Restore the list of archives recorded in downloaded.pickle, falling back to
# an empty list when no such pickle exists yet.
downloaded = []
if os.path.isfile('downloaded.pickle'):
    with open('downloaded.pickle', 'rb') as pfile:
        downloaded = pickle.load(pfile)

# Queue every listed archive that is neither in `visited` nor in `downloaded`.
to_visit = [
    file for file in filenames
    if not (file in visited or file in downloaded)
]
print(to_visit)
print(downloaded)

['RS_2016-09.bz2', 'RS_2016-10.bz2', 'RS_2016-11.bz2', 'RS_2016-12.bz2', 'RS_2017-01.bz2']
['RS_2017-02.bz2', 'RS_2017-05.bz2', 'RS_2017-06.bz2', 'RS_2017-04.bz2', 'RS_2017-03.bz2', 'RS_2017-07.bz2', 'RS_2017-09.bz2', 'RS_2017-08.bz2', 'RS_2017-10.bz2', 'RS_2017-11.bz2']


In [5]:
def load_url(name):
    """Download one archive from files.pushshift.io into the working directory.

    The data is first written to ``name + ".part"`` and only renamed to
    ``name`` after ``urlretrieve`` has returned successfully, so a failed or
    interrupted transfer never leaves a truncated file under the final name.

    Args:
        name: File name of the archive (e.g. ``'RS_2016-09.bz2'``). It is
            appended to the base URL and also used as the local file name.

    Returns:
        ``name`` unchanged, so the caller can tell which download finished.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or
        ``os.replace``; the ``.part`` file is removed before it propagates.
    """
    print("Downloading " + name)
    # Install an opener with a browser-like User-agent as the default opener,
    # which is the one urlretrieve goes through.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36')]
    urllib.request.install_opener(opener)

    partial = name + ".part"
    try:
        urllib.request.urlretrieve("https://files.pushshift.io/reddit/submissions/" + name, partial)
        # Atomic rename: the final name only ever refers to a complete file.
        os.replace(partial, name)
    finally:
        # After a successful rename the .part file is gone; if it still
        # exists the download failed and the leftover must not stay behind.
        if os.path.exists(partial):
            os.remove(partial)
    return name

# The with-block shuts the worker threads down once all downloads are done.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit one download per queued archive and remember which archive name
    # each future belongs to.
    future_to_url = {}
    for pending in to_visit:
        future_to_url[executor.submit(load_url, pending)] = pending

    # Handle the downloads in the order in which they complete.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            name = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            continue
        print("Finished downloading " + name)
        downloaded.append(name)
        # Rewrite the pickle after every finished download.
        with open('downloaded.pickle', 'wb') as pfile:
            pickle.dump(downloaded, pfile)

Downloading RS_2016-09.bz2
Downloading RS_2016-10.bz2
Downloading RS_2016-11.bz2
Downloading RS_2016-12.bz2
Downloading RS_2017-01.bz2
Finished downloading RS_2016-09.bz2
Finished downloading RS_2016-10.bz2
Finished downloading RS_2016-11.bz2
Finished downloading RS_2016-12.bz2
Finished downloading RS_2017-01.bz2


In [6]:
# For every downloaded archive: pick out the askscience submissions, append
# them to raw_submissions.txt, record the archive as visited, then delete it.
for name in downloaded:
    # `downloaded` is never pruned, so on a re-run it still lists archives
    # that were already processed and deleted - skip those.
    if name in visited:
        continue
    if not os.path.isfile(name):
        print("Skipping " + name + " (file not found on disk)")
        continue

    print("Starting " + name)
    print("   Finding askscience")
    matches = []
    # Decompress and decode on the fly, one line per JSON record, instead of
    # first writing the whole decompressed dump to a scratch file.
    with bz2.open(name, 'rt', encoding='utf-8') as archive:
        for line in archive:
            try:
                entry = json.loads(line)
            except ValueError:
                print("JSON decoding failed")
                continue
            if not isinstance(entry, dict):
                continue
            subreddit = entry.get('subreddit')
            if not isinstance(subreddit, str):
                continue
            subreddit = subreddit.lower()
            # A record without a (string) title simply cannot match the
            # title test; it must not abort the whole run with a KeyError.
            title = entry.get('title')
            title = title.lower() if isinstance(title, str) else ''
            if subreddit == 'askscience' or \
                (subreddit == 'science' and 'askscience' in title):
                # Lines read from the archive already end in a newline;
                # adding another one would put blank lines in the output.
                matches.append(line if line.endswith("\n") else line + "\n")

    # Written only after the whole archive was read, so a failure while
    # reading leaves raw_submissions.txt untouched for this archive.
    with open('raw_submissions.txt', "a+", encoding='utf-8') as outfile:
        outfile.writelines(matches)

    # Persist the visited marker before deleting the archive: a crash between
    # the two steps then only leaves a stray file behind, not an archive that
    # is gone but still counted as unprocessed.
    visited.add(name)
    with open('visited.pickle', 'wb') as pfile:
        pickle.dump(visited, pfile)
    os.remove(name)
    print("Finished " + name)

Starting RS_2017-02.bz2
   Finding askscience
Finished RS_2017-02.bz2
Starting RS_2017-05.bz2
   Finding askscience
Finished RS_2017-05.bz2
Starting RS_2017-06.bz2
   Finding askscience
Finished RS_2017-06.bz2
Starting RS_2017-04.bz2
   Finding askscience
Finished RS_2017-04.bz2
Starting RS_2017-03.bz2
   Finding askscience
Finished RS_2017-03.bz2
Starting RS_2017-07.bz2
   Finding askscience
Finished RS_2017-07.bz2
Starting RS_2017-09.bz2
   Finding askscience
Finished RS_2017-09.bz2
Starting RS_2017-08.bz2
   Finding askscience
Finished RS_2017-08.bz2
Starting RS_2017-10.bz2
   Finding askscience
Finished RS_2017-10.bz2
Starting RS_2017-11.bz2
   Finding askscience
Finished RS_2017-11.bz2
Starting RS_2016-09.bz2
   Finding askscience
Finished RS_2016-09.bz2
Starting RS_2016-10.bz2
   Finding askscience
Finished RS_2016-10.bz2
Starting RS_2016-11.bz2
   Finding askscience
Finished RS_2016-11.bz2
Starting RS_2016-12.bz2
   Finding askscience
Finished RS_2016-12.bz2
Starting RS_2017-01.

In [None]:
# Single-threaded download version (kept for reference; superseded by the threaded cells above)
# For file in filenames:
# import urllib.request
# opener = urllib.request.build_opener()
# opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36')]
# urllib.request.install_opener(opener)
# for name in filenames:
#     if name in visited: continue
#     print("Downloading " + name)
#     urllib.request.urlretrieve("https://files.pushshift.io/reddit/submissions/" + name, name)
#     print("   Decompressing")
#     with open("decompressed.bin", 'wb') as new_file, bz2.BZ2File(name, 'rb') as file:
#         for data in iter(lambda : file.read(100 * 1024), b''):
#             new_file.write(data)
#     print("   Finding askscience")
#     with open("decompressed.bin", 'r', encoding='utf-8') as file, open('raw_submissions.txt', "a+", encoding='utf-8') as outfile:
#         towrite = ""
#         for line in file:
#             try:
#                 entry = json.loads(line)
#             except ValueError:
#                 print("JSON decoding failed")
#                 continue
#             if 'subreddit' not in entry: continue
#             if entry['subreddit'].lower() == 'askscience' or \
#                 (entry['subreddit'].lower() == 'science' and 'askscience' in entry['title'].lower()):
#                 towrite = towrite + line + "\n"
#         outfile.write(towrite)
#     os.remove(name)
#     visited.add(name)
#     with open('visited.pickle', 'wb') as pfile:
#         pickle.dump(visited, pfile)
#     print("Finished " + name)
# #     break
# # TODO: clean up the output file - remove duplicates, remove non-climate-change topics, remove blank lines