In [1]:
import os
import json
import re
import hashlib
from tqdm import tqdm
import unicodecsv as csv

In [2]:
# Accumulator for every content item loaded from the per-document JSON files.
items = list()

In [3]:
def md5hash(item):
    """Return the hex MD5 digest that identifies a single content item.

    The digest covers, in this order, the JSON serialisations of
    item["metadata"]["id"], item["text"] and item["metadata"]["header"].
    """
    digest = hashlib.md5()

    def feed(value):
        # json.dumps escapes non-ASCII characters by default, so the ASCII
        # encoding of its output cannot fail.
        digest.update(json.dumps(value).encode('ascii'))

    feed(item["metadata"]["id"])
    feed(item["text"])
    feed(item["metadata"]["header"])
    return digest.hexdigest()

In [4]:
# Load every content item from the 110th/111th Congress JSON files below
# ../data/ into `items`.
#
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every item.
items = []

# The congress number is read from the ".../data/<congress>/output/..." part
# of each file path.
congress_pattern = re.compile(r"data\/(\d+)\/output")

for root, _dirs, filenames in os.walk('../data/'):
    for filename in filenames:
        path = os.path.join(root, filename)
        # Only JSON files that live under a /110/ or /111/ directory are loaded.
        if not (path.endswith(".json") and ("/110/" in path or "/111/" in path)):
            continue

        # Parse the document, then release the file handle before doing the
        # per-item work.
        with open(path, "r") as infile:
            data = json.load(infile)

        # A selected path that lacks the data/<n>/output segment makes
        # search() return None, so .group raises AttributeError here.
        congress = int(congress_pattern.search(path).group(1))

        # One metadata dict per document; every item from that document
        # references this same dict object.
        metadata = {
            "header": data["header"],
            "id": data["id"],
            "title": data["title"],
            "document_title": data["doc_title"],
            "congress": congress
        }
        for item in data["content"]:
            item["metadata"] = metadata
            # md5hash reads item["metadata"], so it has to run after the
            # metadata has been attached.
            item["hash"] = md5hash(item)
            items.append(item)

In [5]:
# Persist the complete in-memory corpus as a single JSON array.
corpus_path = "../data/corpus.json"
with open(corpus_path, "w") as corpus_file:
    json.dump(items, corpus_file)

# Searches

In [6]:
def write_search_results(items, file):
    """Write search hits to a CSV file, one row per item.

    Args:
        items: iterable of item dicts. Each one must provide "hash",
            "speaker", "text" and a "metadata" dict holding "title" and a
            "header" dict whose "day", "month" and "year" values are
            strings. "speaker_bioguide" is optional; a missing id is written
            as None, the same value the search cells use for an absent id.
        file: path of the CSV file to create; an existing file is
            overwritten.
    """
    # Opened in binary mode because `csv` here is unicodecsv, which encodes
    # the rows itself.
    with open(file, "wb") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["HASH", "SPEAKER", "BIOGUIDE", "DATE", "TITLE", "TEXT"])
        for item in items:
            header = item["metadata"]["header"]
            # Dates are rendered as day-month-year.
            date = "-".join([header["day"], header["month"], header["year"]])
            writer.writerow([
                item["hash"],
                item["speaker"],
                # .get() keeps the export independent of whether a calling
                # cell has already filled in the missing key.
                item.get("speaker_bioguide"),
                date,
                item["metadata"]["title"],
                item["text"],
            ])

### Iraq

In [9]:
# Collect every 110th-Congress item whose text mentions "iraq"
# (case-insensitive), then save the hits as JSON and as CSV.
matched_items = []
for item in tqdm(items):
    # Every item visited gets a "speaker_bioguide" key (None when absent),
    # whether or not it matches the search.
    item.setdefault("speaker_bioguide", None)
    is_hit = "iraq" in item["text"].lower() and item["metadata"]["congress"] == 110
    if is_hit:
        matched_items.append(item)

with open("../data/iraq.json", "w") as outfile:
    json.dump(matched_items, outfile)
write_search_results(matched_items, "../data/iraq.csv")
print("Search returned %s results." % len(matched_items))

100%|██████████| 999390/999390 [00:01<00:00, 545182.32it/s]


Search returned 13486 results.


### Afghanistan

In [10]:
# Collect every 111th-Congress item whose text mentions "afghanistan"
# (case-insensitive), then save the hits as JSON and as CSV.
matched_items = []
for item in tqdm(items):
    # Give items without a bioguide id an explicit None. This line used to be
    # a `==` comparison, which assigns nothing and raises KeyError when the
    # key really is missing.
    if "speaker_bioguide" not in item:
        item["speaker_bioguide"] = None
    if "afghanistan" in item["text"].lower() and item["metadata"]["congress"] == 111:
        matched_items.append(item)
with open("../data/afghanistan.json", "w") as outfile:
    json.dump(matched_items, outfile)
write_search_results(matched_items, "../data/afghanistan.csv")
print("Search returned %s results." % len(matched_items))

100%|██████████| 999390/999390 [00:01<00:00, 543482.11it/s]


Search returned 4786 results.
