In [1]:
import json
from dateutil.parser import parse
from datetime import timedelta
import requests
import math

In [2]:
# Index the supplementary per-mention records by their 'id' field.
addl_fields = {}
with open("/Users/miles/Source/combine-cr/data/sr2.json", "r") as datain:
    addl_fields.update((record['id'], record) for record in json.load(datain))

In [3]:
# Map each classified ID to its relevance flag and categorical sentiment label.
classifications = {}
with open("/Users/miles/Source/combine-cr/data/classifications.json", "r") as datain:
    for row in json.load(datain):  # the JSON document is an array of records
        classifications[row['ID']] = dict(
            relevant=row['RELEVANT'],
            sentiment=row['CATEGORICAL SENTIMENT'],
        )

In [4]:
def get_congress(time):
    """Map a date to a Congress number.

    The date is first moved back three days (presumably so the first days of
    January still count toward the outgoing Congress -- TODO confirm), and the
    year of the shifted date is then converted with ceil(year / 2 - 894).
    For example, shifted years 2015 and 2016 both give 114.

    time -- a date/datetime supporting subtraction of a timedelta.
    Returns an int.
    """
    shifted_year = (time - timedelta(days=3)).year
    # Integer form of ceil(0.5 * year - 894): odd years round up, even years are exact.
    return (shifted_year + 1) // 2 - 894

In [5]:
# Build one record per mention, keyed by ID, joining in its classification and
# adding the parsed date and the congress number derived from it.
mentions = {}
with open("/Users/miles/Source/combine-cr/data/mentions.json", "r") as datain:
    for record in json.load(datain):  # the JSON document is an array of records
        mention_id = record['ID']
        label = classifications[mention_id]
        when = parse(record['DATE'])
        mentions[mention_id] = dict(
            relevant=label["relevant"],
            sentiment=label["sentiment"],
            date=record['DATE'],
            date_parsed=when,
            title=record['TITLE'],
            first_name=record['FIRST NAME'],
            last_name=record['LAST NAME'],
            party=record['PARTY'],
            sex=record['SEX'],
            state=record['STATE'],
            statement=record['STATEMENT'],
            id=mention_id,
            congress=get_congress(when),
        )

In [6]:
# Download the strike database and keep only the dict-shaped records.
# The previous filter, `not isinstance(strike, str)`, never excluded anything on
# Python 2 because JSON strings decode to `unicode`; selecting dicts explicitly
# drops string (and any other non-record) values on every Python version.
response = requests.get("https://tbij.dronescout.org/data", timeout=30)
response.raise_for_status()  # fail loudly instead of decoding an HTTP error page
strikes = [strike for strike in response.json().values() if isinstance(strike, dict)]

In [7]:
# Accumulator for the parsed strike dates (reset here so the fill loop can be re-run).
strikedates = list()

In [8]:
# Collect the parsed date of every strike whose location does not end with
# "Afghanistan". Malformed entries are skipped, but only for the errors a bad
# record can actually produce -- a bare `except:` would also swallow
# KeyboardInterrupt and hide genuine programming errors.
for strike in strikes:
    try:
        if strike['location'].endswith("Afghanistan"):
            continue
        strikedates.append(parse(strike['date']))
    except (KeyError, TypeError, AttributeError, ValueError, OverflowError):
        # Entry is not a dict, lacks a field, has a non-string location,
        # or carries a date that cannot be parsed.
        continue

In [9]:
# Ascending order is required: the days-since-strike lookup further down stops
# scanning at the first strike dated after the mention.
strikedates = sorted(strikedates)

In [10]:
# Sanity check: how many strike dates were collected.
len(strikedates)

704

In [11]:
# For every mention, record the number of days since the most recent strike
# dated on or before it (None when no such strike exists).
for mention in mentions.itervalues():
    mentioned_on = mention['date_parsed']
    previous_strike = None
    for candidate in strikedates:
        if candidate <= mentioned_on:
            previous_strike = candidate
        else:
            # Stop at the first strike that is later than the mention.
            break
    if previous_strike is None:
        mention['days_since_strike'] = None
    else:
        mention['days_since_strike'] = (mentioned_on - previous_strike).days

In [12]:
def decode_dw_row(row):
    """Parse one line of the DW-NOMINATE text file into a dict.

    Example row:
      114   29774  71  24  CALIFOR    100  0  1   CAPPS         -0.389   -0.227      -81.84920   1077     26    0.927

    Columns, in order: congress, icpsr, state code, district number (0 if
    senate or president), state name, party code (100=dem, 200=republican),
    occupancy, office attainment type, name, 1st dimension coord, 2nd
    dimension coord, log likelihood, # of votes, # of classification errors,
    geometric mean probability.

    Fields are recognised by splitting on double spaces, so a single space
    inside a field (e.g. within the name) does not start a new field. Only the
    first word of the name is kept as "last_name".

    Raises ValueError when the row does not yield 15 fields; the message
    includes the number of fields found and the offending row.
    """
    elements = [piece.strip() for piece in row.split("  ") if piece.strip() != ""]
    if len(elements) == 16:
        # One field too many: discard the token right after the name.
        # Presumably the name itself was split in two -- TODO confirm against the data.
        del elements[9]
    if len(elements) != 15:
        raise ValueError(
            "!= 15 elements in row ({} found): {!r}".format(len(elements), row)
        )
    return {
        "latest_congress": elements[0],
        "icpsr": elements[1],
        "state_code": elements[2],
        "district_number": elements[3],
        "state": elements[4],
        "party_code": int(elements[5]),
        "occupancy": elements[6],
        "office_attainment_type": elements[7],
        "last_name": elements[8].split(" ")[0],
        "dim_1": float(elements[9]),
        "dim_2": float(elements[10]),
        "log_likelyhood": float(elements[11]),
        "votes": int(elements[12]),
        "classification_errors": float(elements[13]),
        "geometric_mean_probability": float(elements[14])
    }
# Index the DW-NOMINATE rows for congress 106 and later under the lowercased
# key last_name + state + congress. A later row with the same key replaces an
# earlier one.
dw_nominate_scores = {}
with open("/Users/miles/Source/combine-cr/data/DW-NOMINATE.txt", "r") as dw:
    for row in dw:
        if not row.strip():
            # A blank line would make decode_dw_row raise; skip it.
            continue
        dat = decode_dw_row(row)
        if int(dat['latest_congress']) < 106:
            continue
        name = dat['last_name'] + dat['state'] + str(dat['latest_congress'])
        dw_nominate_scores[name.lower()] = dat

In [13]:
# Attach DW-NOMINATE fields to each mention whose last name, state (first seven
# characters) and congress match an indexed row.
for mention in mentions.values():
    # The congress number was stored on the mention when it was loaded.
    name = (mention['last_name'] + mention['state'][:7] + str(mention['congress'])).lower()
    dw_nominate_data = dw_nominate_scores.get(name)
    if dw_nominate_data is None:
        continue
    for key, value in dw_nominate_data.items():
        # Only add fields the mention lacks: the DW row also carries
        # 'last_name' and 'state', and copying those would overwrite the
        # mention's own values for matched rows only, leaving the exported
        # columns in two different formats.
        mention.setdefault(key, value)

In [14]:
# A strike this many days (or fewer) before a mention counts as "recent".
RECENT_STRIKE_WINDOW_DAYS = 7

# Numeric codings for the categorical fields; any other value becomes None.
# (Sex is coded rather than gender; neither is truly binary, but the
# downstream STATA analysis needs a numeric column.)
FIELD_CODES = (
    ('party', {"Democrat": 1, "Republican": 0}),
    ('sentiment', {"Pro": 1, "Anti": 0}),
    ('relevant', {"Y": 1, "N": 0}),
    ('sex', {"M": 1, "F": 0}),
)

# Fields that are not exported.
DROPPED_FIELDS = (
    'date_parsed',
    'statement',
    'classification_errors',
    'party_code',
    'district_number',
    'geometric_mean_probability',
    'log_likelyhood',
    'latest_congress',
)


def _recode(value, codes):
    """Return the code whose label equals `value`, or None if no label matches."""
    for label, code in codes.items():
        if value == label:
            return code
    return None


def _compile_mention(mention, extra_fields):
    """Return an export-ready copy of `mention`.

    The copy gains 'days_until_term_ends' (when `extra_fields` has an entry for
    the mention's id) and 'strike_recently', has its categorical fields
    replaced by numeric codes, and loses the fields in DROPPED_FIELDS.
    `mention` itself is left unmodified, so compiling is repeatable.
    """
    z = dict(mention)

    if z['id'] in extra_fields:
        z['days_until_term_ends'] = extra_fields[z['id']]['days_until_term_ends']

    # Each field is recoded in its own right; a missing field yields None for
    # that field only (previously a failure here reset 'party' instead).
    for field, codes in FIELD_CODES:
        z[field] = _recode(z.get(field), codes)

    # recent scores -- test for None explicitly: on Python 2 `None <= 7` is
    # True, which would flag mentions with no prior strike as recent.
    days_since_strike = z.get('days_since_strike')
    if days_since_strike is None:
        z['strike_recently'] = None
    elif days_since_strike <= RECENT_STRIKE_WINDOW_DAYS:
        z['strike_recently'] = 1
    else:
        z['strike_recently'] = 0

    # Remove each unwanted field independently; mentions without DW-NOMINATE
    # data simply lack some of them.
    for field in DROPPED_FIELDS:
        z.pop(field, None)

    return z


mentions_compiled = [_compile_mention(mention, addl_fields) for mention in mentions.values()]

In [15]:
# Gather every field name used by any compiled mention, in first-seen order.
keys = []
for record in mentions_compiled:
    unseen = [field for field in record if field not in keys]
    keys += unseen
print(keys)

['last_name', 'congress', 'days_until_term_ends', 'sex', 'days_since_strike', 'relevant', 'date', 'state_code', 'dim_2', 'id', 'dim_1', 'office_attainment_type', 'first_name', 'strike_recently', 'icpsr', 'sentiment', 'title', 'occupancy', 'state', 'votes', 'party']


In [16]:
import unicodecsv as csv

# Write one CSV row per compiled mention under a header of all collected keys.
# Missing fields and the literal value "Unknown" are written as None.
with open("data_formatted.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(keys)
    for record in mentions_compiled:
        writer.writerow([
            record[key] if key in record and record[key] != "Unknown" else None
            for key in keys
        ])