In [1]:
import os
import json
import re
import hashlib
import random
from datetime import datetime
from tqdm import tqdm
import unicodecsv as csv

In [2]:
# Flat list of every content record loaded from the per-document JSON files;
# the loading loop appends to it and the search cells iterate over it.
items = []

In [3]:
def md5hash(item):
    """Return the hexadecimal MD5 digest identifying a content record.

    The digest is fed, in this order, the JSON encodings of the parent
    document id (``item["metadata"]["id"]``), the record text
    (``item["text"]``) and the document header
    (``item["metadata"]["header"]``).
    """
    meta = item["metadata"]
    digest = hashlib.md5()
    for part in (meta["id"], item["text"], meta["header"]):
        # json.dumps escapes non-ASCII characters by default, so the ASCII
        # encoding below cannot fail on the serialized string.
        digest.update(json.dumps(part).encode('ascii'))
    return digest.hexdigest()

In [4]:
# Walk ../data/ and flatten the content records of every JSON document that
# belongs to the 110th or 111th Congress into the global `items` list.
congress_pattern = re.compile(r"data\/(\d+)\/output")
for root, dirs, files in os.walk('../data/'):
    for file in files:
        path = os.path.join(root, file)
        if not (path.endswith(".json") and ("/110/" in path or "/111/" in path)):
            continue
        congress_match = congress_pattern.search(path)
        if congress_match is None:
            # The path passed the /110/ or /111/ filter but is not laid out as
            # data/<congress>/output, so the congress number cannot be read
            # from it. Report and skip instead of failing on `.group`.
            print("Skipping %s: cannot determine congress from path." % path)
            continue
        congress = int(congress_match.group(1))
        # Pin the encoding so loading does not depend on the platform default.
        with open(path, "r", encoding="utf-8") as infile:
            data = json.load(infile)
        # One metadata dict per document; every record of that document
        # references this same dict object.
        metadata = {
            "header": data["header"],
            "id": data["id"],
            "title": data["title"],
            "document_title": data["doc_title"],
            "congress": congress
        }
        for item in data["content"]:
            item["metadata"] = metadata
            # md5hash reads item["metadata"], so it must run after the line above.
            item["hash"] = md5hash(item)
            items.append(item)

In [5]:
# Serialize the complete `items` list to a single JSON file.
with open("../data/corpus.json", "w") as corpus_file:
    json.dump(items, corpus_file)

# Searches

In [23]:
def write_search_results(items, file):
    """Write one CSV row per record to ``file``.

    The header row is HASH, SPEAKER, BIOGUIDE, DATE, TITLE, TEXT. DATE is the
    header's ``day``, ``month`` and ``year`` joined with ``-``. The file is
    opened in binary mode and written through the module-level ``csv`` writer.
    """
    columns = ["HASH", "SPEAKER", "BIOGUIDE", "DATE", "TITLE", "TEXT"]
    with open(file, "wb") as handle:
        out = csv.writer(handle)
        out.writerow(columns)
        for record in items:
            body = record["text"]
            meta = record["metadata"]
            header = meta["header"]
            stamp = "-".join((header["day"], header["month"], header["year"]))
            out.writerow([record["hash"], record["speaker"], record["speaker_bioguide"], stamp, meta["title"], body])
            
def write_sample(items, file):
    """Write a two-column CSV sheet for the given records to ``file``.

    The header row is ID, RELEVANT; each following row holds a record's hash
    and an empty RELEVANT cell. The file is opened in binary mode and written
    through the module-level ``csv`` writer.
    """
    with open(file, "wb") as handle:
        out = csv.writer(handle)
        out.writerow(["ID", "RELEVANT"])
        for digest in (record["hash"] for record in items):
            out.writerow([digest, ""])
            
def write_markdown(items, title, file):
    """Render ``items`` as a single Markdown document written to ``file``.

    The document opens with two ``%``-prefixed lines: the title, then the
    record count and today's date. Each record follows after a ``---`` rule
    with its title, hash, speaker and date, and finally its text.

    Args:
        items: Sequence of records (``len`` is taken). Each record provides
            ``text``, ``hash``, ``speaker`` and ``metadata``; the metadata
            holds ``title`` and a ``header`` with string ``day``, ``month``
            and ``year`` values.
        title: Title placed on the first line of the document.
        file: Output path; an existing file is overwritten.
    """
    # The speaker line contains an em dash and record text may hold any
    # Unicode character, so pin UTF-8 instead of using the platform default.
    with open(file, "w", encoding="utf-8") as outfile:
        outfile.write("%% %s\n%% %s Total Records | %s" % (title, str(len(items)), str(datetime.now().date())))
        for item in items:
            header = item["metadata"]["header"]
            outfile.write("\n\n---\n\n")
            date = " ".join([header["day"], header["month"], header["year"]])
            outfile.write("## %s\n" % item["metadata"]["title"])
            outfile.write("## `%s`\n" % item["hash"])
            outfile.write("`%s — %s`\n\n" % (item["speaker"], date))
            # Double spaces become line breaks; backticks and backslashes are
            # replaced with a single quote and a forward slash respectively.
            text = item["text"].replace("  ", "\n").replace("`", "'").replace("\\", "/")
            outfile.write(text)
            
def write_website(items, directory):
    """Write a Markdown site for ``items`` into ``directory``.

    Produces ``README.md``, an index with one link per record, and one
    ``<hash>.md`` page per record containing its title, hash, speaker, date
    and text.

    Args:
        items: Records to publish. It is iterated twice (index, then pages),
            so it must be re-iterable. Each record provides ``text``,
            ``hash``, ``speaker`` and ``metadata``; the metadata holds
            ``title`` and a ``header`` with string ``day``, ``month`` and
            ``year`` values.
        directory: Existing output directory, with or without a trailing
            path separator.
    """
    # Pages contain an em dash and arbitrary record text, so every file is
    # written as UTF-8 instead of the platform default encoding.
    with open(os.path.join(directory, "README.md"), "w", encoding="utf-8") as outfile:
        outfile.write("---\n---\n\n# Congressional Record Index\n")
        outfile.write("This website provides a temporary interface for accessing Congressional Record search results.\nTo find a particular post, use `Cmd+F` and search for either its title or its ID.\n\n")
        outfile.write("---\n\n")
        for item in items:
            outfile.write("* [%s (%s)](%s.md)\n" % (item["metadata"]["title"], item["hash"], item["hash"]))
    for item in tqdm(items):
        # os.path.join keeps the path valid when `directory` has no trailing slash.
        with open(os.path.join(directory, "%s.md" % item["hash"]), "w", encoding="utf-8") as outfile:
            header = item["metadata"]["header"]
            date = " ".join([header["day"], header["month"], header["year"]])
            outfile.write("---\nlayout: default\n---\n\n# %s\n" % item["metadata"]["title"])
            outfile.write("## `%s`\n" % item["hash"])
            outfile.write("`%s — %s`\n\n" % (item["speaker"], date))
            outfile.write("---\n\n")
            # Double spaces become line breaks; backticks and backslashes are
            # replaced with a single quote and a forward slash respectively.
            text = item["text"].replace("  ", "\n").replace("`", "'").replace("\\", "/")
            outfile.write(text)
            

In [15]:
# Combined matches of all searches; each search cell below extends this list
# with its own results.
all_items = []

### Iraq

In [16]:
def is_statement(item):
    """Tell whether a record carries a speaker bioguide value.

    Side effect: a record without a ``speaker_bioguide`` key gets that key
    set to ``None``. The check compares the string form of the value with
    ``"None"``, so both ``None`` and the literal string ``"None"`` yield
    ``False``.
    """
    bioguide = item.setdefault("speaker_bioguide", None)
    return str(bioguide) != "None"

In [17]:
# Iraq search: records with a speaker bioguide from the 110th Congress whose
# text mentions "iraq" (case-insensitive).
matched_items = []
for item in tqdm(items):
    # is_statement also defaults a missing "speaker_bioguide" key to None,
    # so no further defaulting is needed here.
    if not is_statement(item):
        continue
    if "iraq" in item["text"].lower() and item["metadata"]["congress"] == 110:
        matched_items.append(item)
with open("../data/iraq.json", "w") as outfile:
    json.dump(matched_items, outfile)
write_search_results(matched_items, "../data/iraq.csv")
# random.sample raises ValueError when asked for more elements than the
# population holds, so cap the sample at the number of matches.
sample_size = min(500, len(matched_items))
# A fixed-seed generator makes a re-run on the same corpus draw the same sample.
write_sample(random.Random(0).sample(matched_items, sample_size), "../data/iraq_sample.csv")
write_markdown(matched_items, "Iraq Dataset", "../data/iraq.md")
all_items.extend(matched_items)
print("Search returned %s results." % len(matched_items))

100%|██████████| 999390/999390 [00:01<00:00, 587025.84it/s]


Search returned 10216 results.


### Afghanistan

In [18]:
# Afghanistan search: records with a speaker bioguide from the 111th Congress
# whose text mentions "afghanistan" (case-insensitive).
matched_items = []
for item in tqdm(items):
    # is_statement also defaults a missing "speaker_bioguide" key to None,
    # so no further defaulting is needed here.
    if not is_statement(item):
        continue
    if "afghanistan" in item["text"].lower() and item["metadata"]["congress"] == 111:
        matched_items.append(item)
with open("../data/afghanistan.json", "w") as outfile:
    json.dump(matched_items, outfile)
write_search_results(matched_items, "../data/afghanistan.csv")
# random.sample raises ValueError when asked for more elements than the
# population holds, so cap the sample at the number of matches.
sample_size = min(500, len(matched_items))
# A fixed-seed generator makes a re-run on the same corpus draw the same sample.
write_sample(random.Random(0).sample(matched_items, sample_size), "../data/afghanistan_sample.csv")
write_markdown(matched_items, "Afghanistan Dataset", "../data/afghanistan.md")
all_items.extend(matched_items)
print("Search returned %s results." % len(matched_items))

100%|██████████| 999390/999390 [00:01<00:00, 536784.45it/s]


Search returned 3539 results.


In [19]:
# Write the combined results of all searches into one Markdown document.
write_markdown(all_items, "Iraq and Afghanistan Dataset", "../data/all.md")

In [24]:
# Publish the combined results under ../docs/: a README.md index plus one
# <hash>.md page per record.
write_website(all_items, "../docs/")

100%|██████████| 13755/13755 [00:01<00:00, 8137.38it/s]
