In [1]:
import bisect
import json
import math
from datetime import timedelta

import requests
from dateutil.parser import parse

In [2]:
# Index every record of sr2.json by its 'id' field; a later record with the
# same id replaces an earlier one.
addl_fields = {}
with open("/Users/miles/Source/combine-cr/data/sr2.json", "r") as sr2_file:
    sr2_records = json.load(sr2_file)
addl_fields.update((record['id'], record) for record in sr2_records)

In [3]:
# Map each classified ID to its relevance flag and categorical sentiment.
classifications = {}
with open("/Users/miles/Source/combine-cr/data/classifications.json", "r") as classifications_file:
    classification_rows = json.load(classifications_file)  # top-level value is an array
for classification_row in classification_rows:
    classifications[classification_row['ID']] = dict(
        relevant=classification_row['RELEVANT'],
        sentiment=classification_row['CATEGORICAL SENTIMENT'],
    )

In [4]:
def get_congress(time):
    """Return the Congress number that the date/datetime `time` is assigned to.

    `time` is moved back three days before its year is read, so 1-3 January
    count towards the previous calendar year. An odd year and the even year
    that follows it share one number (2013 and 2014 both give 113).
    """
    shifted_year = (time - timedelta(days=3)).year
    # Integer form of ceil(0.5 * year - 894).
    return (shifted_year + 1) // 2 - 894

In [5]:
# Build one record per mention, keyed by ID, joining in the hand-coded
# classification for the same ID. A mention whose ID has no classification
# raises KeyError (unchanged from the original behaviour).
mentions = {}
with open("/Users/miles/Source/combine-cr/data/mentions.json", "r") as datain:
    for element in json.load(datain):  # top-level value is an array
        mention_id = element['ID']
        classification = classifications[mention_id]
        # Parse the date once and reuse it for both derived fields.
        date_parsed = parse(element['DATE'])
        mentions[mention_id] = {
            "relevant": classification["relevant"],
            "sentiment": classification["sentiment"],
            "date": element['DATE'],
            "date_parsed": date_parsed,
            "title": element['TITLE'],
            "first_name": element['FIRST NAME'],
            "last_name": element['LAST NAME'],
            "party": element['PARTY'],
            "sex": element['SEX'],
            "state": element['STATE'],
            "statement": element['STATEMENT'],
            "id": mention_id,
            "congress": get_congress(date_parsed)
        }

In [6]:
# Download the strike data and keep every top-level value that is not a
# string. Under Python 2 a decoded JSON string is normally `unicode`, which
# `isinstance(x, str)` does not match, so both text types are checked.
TEXT_TYPES = (str, type(u""))
strikes_response = requests.get("https://tbij.dronescout.org/data", timeout=60)
# Fail loudly on an HTTP error instead of decoding an error payload as data.
strikes_response.raise_for_status()
strikes = [
    strike for strike in strikes_response.json().values()
    if not isinstance(strike, TEXT_TYPES)
]

In [7]:
# Parsed strike dates are accumulated into this list.
strikedates = list()

In [8]:
# Collect the parsed date of every strike whose location does not end with
# "Afghanistan". The list is rebuilt here so that re-running this cell cannot
# append the same dates a second time.
strikedates = []
for strike in strikes:
    try:
        if not strike['location'].endswith("Afghanistan"):
            strikedates.append(parse(strike['date']))
    except (KeyError, TypeError, AttributeError, ValueError, OverflowError):
        # Entry has no usable location/date (missing key, non-dict entry,
        # non-string location, or a date that cannot be parsed): skip it.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        continue

In [9]:
# Put the strike dates in ascending order; the days-since-strike lookup in a
# later cell walks the list in order and stops at the first later date.
strikedates.sort()

In [10]:
# Display how many strike dates were collected.
len(strikedates)

705

In [11]:
# Tally of relevant "Anti" mentions per calendar year (keys are year strings).
restrictive_mentions_by_year = dict()

In [12]:
# For every mention: record the number of days since the most recent strike
# on or before the mention date (None when there is no such strike), and
# tally relevant "Anti" mentions per calendar year.
#
# The tally is reset here so that re-running this cell does not double it.
restrictive_mentions_by_year = {}
for mention in mentions.values():
    mention_date = mention['date_parsed']

    # strikedates is sorted ascending, so the strikes dated on or before the
    # mention are exactly those left of the bisect_right insertion point.
    strikes_so_far = bisect.bisect_right(strikedates, mention_date)
    if strikes_so_far:
        days_since_strike = (mention_date - strikedates[strikes_so_far - 1]).days
    else:
        days_since_strike = None
    mention['days_since_strike'] = days_since_strike

    syear = str(mention_date.year)
    # Every year that has a mention gets an entry, even if its count stays 0.
    restrictive_mentions_by_year.setdefault(syear, 0)
    if mention['relevant'] == "Y" and mention["sentiment"] == "Anti":
        restrictive_mentions_by_year[syear] += 1

In [13]:
import json

# Per-year tallies as JSON (keys sorted), followed by their grand total.
print(json.dumps(restrictive_mentions_by_year, sort_keys=True))
print(sum(restrictive_mentions_by_year.values()))

# Cross-check: count the relevant "Anti" mentions directly.
restrictive_total = 0
for mention in mentions.values():
    if mention['relevant'] == "Y" and mention['sentiment'] == "Anti":
        restrictive_total += 1
print(restrictive_total)

{"2000": 0, "2001": 0, "2002": 0, "2003": 0, "2004": 0, "2005": 1, "2006": 1, "2007": 0, "2008": 0, "2009": 16, "2010": 12, "2011": 33, "2012": 48, "2013": 290, "2014": 67, "2015": 9, "2016": 6, "2017": 1}
484
484


In [30]:
def decode_dw_row(row):
    """Decode one line of the DW-NOMINATE text file into a dict.

    Example row:
      114   29774  71  24  CALIFOR    100  0  1   CAPPS         -0.389   -0.227      -81.84920   1077     26    0.927

    Columns, in order: congress, icpsr, state code, district number (0 if
    senate or president), state name, party code (100=dem, 200=republican),
    occupancy, office attainment type, name, 1st dimension coord,
    2nd dimension coord, log likelihood, # of votes, # of classification
    errors, geometric mean probability.

    Fields are separated by two or more spaces, so a single space inside a
    field (e.g. "NORTH C") does not split it. `last_name` keeps only the first
    space-separated word of the name column.

    Returns a dict; party_code and votes are ints, the coordinates,
    log likelihood, classification errors and geometric mean probability are
    floats, every other value is left as a string.

    Raises ValueError (with the offending row in the message) when the row
    does not yield 15 fields.
    """
    stripped = (element.strip() for element in row.split("  "))
    elements = [element for element in stripped if element != ""]
    if len(elements) == 16:
        # One extra fragment right after the name column -- presumably a name
        # containing a double space; drop the fragment (TODO confirm).
        del elements[9]
    if len(elements) != 15:
        raise ValueError(
            "!= 15 elements in row ({} found): {!r}".format(len(elements), row)
        )
    return {
        "latest_congress": elements[0],
        "icpsr": elements[1],
        "state_code": elements[2],
        "district_number": elements[3],
        "state": elements[4],
        "party_code": int(elements[5]),
        "occupancy": elements[6],
        "office_attainment_type": elements[7],
        "last_name": elements[8].split(" ")[0],
        "dim_1": float(elements[9]),
        "dim_2": float(elements[10]),
        "log_likelyhood": float(elements[11]),
        "votes": int(elements[12]),
        "classification_errors": float(elements[13]),
        "geometric_mean_probability": float(elements[14])
    }
# Index the decoded DW-NOMINATE rows for the 106th Congress onward by
# lower-cased last name + state field + congress number. Rows that produce the
# same key overwrite each other (the last one in the file wins).
dw_nominate_scores = {}
with open("/Users/miles/Source/combine-cr/data/DW-NOMINATE.txt", "r") as dw:
    for row in dw:  # stream the file line by line
        if not row.strip():
            continue  # a blank line (e.g. at end of file) is not a data row
        dat = decode_dw_row(row)
        if int(dat['latest_congress']) < 106:
            continue
        name = dat['last_name'] + dat['state'] + str(dat['latest_congress'])
        dw_nominate_scores[name.lower()] = dat

In [31]:
# Attach DW-NOMINATE fields to each mention whose lower-cased
# last name + first 7 characters of state + congress number matches a score
# key. Every field of the matched row is copied onto the mention, replacing
# any existing value stored under the same key (including 'last_name' and
# 'state').
for mention in mentions.values():
    congress_number = get_congress(mention['date_parsed'])
    lookup_key = (mention['last_name'] + mention['state'][:7] + str(congress_number)).lower()
    matched_scores = dw_nominate_scores.get(lookup_key)
    if matched_scores is not None:
        mention.update(matched_scores)

In [32]:
from datetime import date
def get_current_executive_party(statement_date):
    """Return the party code of the sitting U.S. president on `statement_date`.

    Codes follow the party recoding used elsewhere in this notebook:
    1 = Democrat, 0 = Republican.

    statement_date: a `datetime.date`.
    Returns 1 or 0, or None for dates before 20 January 1993, which the table
    below does not cover.

    An administration starts on its inauguration day (20 January), which is
    the same boundary as the original `> date(year, 1, 19)` checks. The table
    also covers 1993-2001, so mentions from 2000 no longer get None, and
    the administrations from 2021 on, which were previously all reported as 0.
    """
    # (inauguration day, party code), newest first.
    administrations = (
        (date(2025, 1, 20), 0),
        (date(2021, 1, 20), 1),
        (date(2017, 1, 20), 0),
        (date(2009, 1, 20), 1),
        (date(2001, 1, 20), 0),
        (date(1993, 1, 20), 1),
    )
    for inauguration_day, party_code in administrations:
        if statement_date >= inauguration_day:
            return party_code
    return None

In [33]:
# Data according to https://publicintelligence.net/dod-us-drone-activities-map/
# Per the original note: states_1 lists the states without Predators and
# Reapers, states_2 the states with them.
states_1 = [
    "WASHINGTON", "OREGON", "MONTANA", "MINNESOTA", "UTAH", "COLORADO",
    "KANSAS", "LOUISIANA", "MISSISSIPPI", "KENTUCKY", "ALABAMA", "INDIANA",
    "WEST VIRGINIA", "FLORIDA", "VIRGINIA", "PENNSYLVANIA", "NORTH CAROLINA",
    "MARYLAND", "NEW JERSEY", "NEW HAMPSHIRE", "ARKANSAS", "HAWAII",
]

# "NEVADA" is listed twice; the repeat has no effect on the lookup below.
states_2 = [
    "NORTH DAKOTA", "CALIFORNIA", "NEVADA", "ARIZONA", "NEW MEXICO", "TEXAS",
    "GEORGIA", "NEW YORK", "SOUTH DAKOTA", "NEVADA",
]


def drone_state_status(statename):
    """Classify a state name against the two drone-activity lists.

    Names are compared case-insensitively on their first seven characters
    after surrounding whitespace is stripped, so a name truncated to seven
    characters (e.g. "CALIFOR") matches as well.

    Returns None for "unknown" (any case), 1 if the state is in states_1,
    2 if it is in states_2, and 0 otherwise. states_1 is checked first.
    """
    cleaned = statename.strip()
    if cleaned.lower() == "unknown":
        return None
    prefix = cleaned.upper()[:7]
    for status, group in ((1, states_1), (2, states_2)):
        if any(prefix == candidate[:7] for candidate in group):
            return status
    return 0

In [34]:
# Recode every mention into the numeric variables written to the CSV.
#
# NOTE: each record in `mentions` is recoded IN PLACE (as in the original
# cell), so this cell must not be re-run without first re-running the cells
# that build `mentions`.

# Raw label -> numeric code; any other value (or a missing field) becomes None.
PARTY_CODES = {"Democrat": 1, "Republican": 0}
SENTIMENT_CODES = {"Pro": 1, "Anti": 0}
RELEVANT_CODES = {"Y": 1, "N": 0}
# Sex is coded rather than gender; it only has to be numeric for STATA.
SEX_CODES = {"M": 1, "F": 0}

RECENT_STRIKE_DAYS = 7          # a strike this many days ago or fewer is "recent"
EXTREME_DIM_1_THRESHOLD = 0.6   # original author's note: is it not a bit high?
FILIBUSTER_ICPSR = 41104        # rand paul ...
FILIBUSTER_DATE = date(2013, 3, 6)  # ... on the day of his filibuster

# Working fields that are not exported.
DROPPED_KEYS = (
    'date_parsed',
    'statement',
    'classification_errors',
    'party_code',
    'district_number',
    'geometric_mean_probability',
    'log_likelyhood',
    'latest_congress',
)

mentions_compiled = []
for mention in mentions.values():
    z = mention

    if z['id'] in addl_fields:
        z['days_until_term_ends'] = addl_fields[z['id']]['days_until_term_ends']

    # party / sentiment scores
    z['party'] = PARTY_CODES.get(z.get('party'))
    z['sentiment'] = SENTIMENT_CODES.get(z.get('sentiment'))

    # restraining: the inverse of sentiment. The test must be `is None`, not
    # truthiness -- sentiment 0 ("Anti") is falsy and used to end up as None,
    # so restraining could never be 1.
    if z['sentiment'] is None:
        z['restraining'] = None
    else:
        z['restraining'] = 1 - z['sentiment']

    # relevant / sex scores
    z['relevant'] = RELEVANT_CODES.get(z.get('relevant'))
    z['sex'] = SEX_CODES.get(z.get('sex'))

    # recent scores. None (no earlier strike) is handled explicitly: under
    # Python 2 `None <= 7` is True and would be coded as a recent strike.
    days_since_strike = z.get('days_since_strike')
    if days_since_strike is None:
        z['strike_recently'] = None
    elif days_since_strike <= RECENT_STRIKE_DAYS:
        z['strike_recently'] = 1
    else:
        z['strike_recently'] = 0

    # loyalty: 1 when the speaker's party code equals the president's.
    # Unknown when the president's party is unknown (previously None == None
    # produced 1). A speaker whose party code is None still gets 0, as before.
    executive_party = None
    if 'date_parsed' in z:
        executive_party = get_current_executive_party(z['date_parsed'].date())
    if executive_party is None:
        z['loyalty'] = None
    elif z['party'] == executive_party:
        z['loyalty'] = 1
    else:
        z['loyalty'] = 0

    # filibuster
    z['filibuster'] = 0
    if 'date_parsed' in z and z.get('icpsr') is not None:
        if int(z['icpsr']) == FILIBUSTER_ICPSR and z['date_parsed'].date() == FILIBUSTER_DATE:
            z['filibuster'] = 1

    # extreme: an exact 0.0 score is a valid (non-extreme) value, so test for
    # None rather than truthiness.
    dim_1 = z.get('dim_1')
    if dim_1 is None:
        z['extreme'] = None
    elif abs(dim_1) >= EXTREME_DIM_1_THRESHOLD:
        z['extreme'] = 1
    else:
        z['extreme'] = 0

    # dronebase
    if z.get('state'):
        z['dronebase'] = drone_state_status(z['state'])
    else:
        z['dronebase'] = None

    # house -- somewhat unreliable for legislators who have served in both
    if 'dim_1' in z:
        district_number = z.get('district_number')
        if district_number is not None:
            z['house'] = 1 if int(district_number) != 0 else 0  # 1 = house, 0 = senate
        else:
            # keep an existing value, but make sure the key is always present
            z.setdefault('house', None)
    else:
        z['house'] = None

    # Remove the working fields. pop() with a default removes each key
    # independently; the old try/del chain stopped at the first missing key.
    for dropped_key in DROPPED_KEYS:
        z.pop(dropped_key, None)

    mentions_compiled.append(z)

In [35]:
# Column names: every key that occurs in any compiled mention, in order of
# first appearance.
keys = []
for z in mentions_compiled:
    keys.extend(key for key in z if key not in keys)
print(keys)

['last_name', 'congress', 'house', 'days_until_term_ends', 'sex', 'days_since_strike', 'relevant', 'date', 'loyalty', 'state_code', 'dim_2', 'id', 'dim_1', 'office_attainment_type', 'first_name', 'strike_recently', 'icpsr', 'sentiment', 'dronebase', 'title', 'occupancy', 'restraining', 'state', 'votes', 'party', 'filibuster', 'extreme']


In [36]:
import unicodecsv as csv
# Write a header row followed by one row per compiled mention. A cell is left
# empty (None) when the mention lacks that key or the value is "Unknown".
with open("data_formatted.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(keys)
    for z in mentions_compiled:
        writer.writerow([
            z[key] if key in z and z[key] != "Unknown" else None
            for key in keys
        ])