In [1]:
import json
from dateutil.parser import parse
from datetime import timedelta
import requests
import math
import unicodecsv as csv
import numpy

In [2]:
"""
* relevant: whether Heajin classified the particular statement as relevant or irrelevant. 1 indicates that the statement is relevant, while 0 indicates that it is not.
* k_relevant: whether Dr. Kreps classified the particular statement as relevant or irrelevant. 1 indicates that the statement is relevant, while 0 indicates that it is not.
* house: whether or not the legislator is a member of the House of Representatives at the time of the statement. 1 indicates they are a member of the House, 0 indicates they are a member of the Senate. Blank values indicate that the data is either not known or that they are a member of neither (speaker, etc).
* sex: the sex of the legislator, according to the United States legislator biography project.
* days_since_last_strike: the number of days that have elapsed since the most recent drone strike. Data is retrieved from https://tbij.dronescout.org, which is itself an interface for the Bureau of Investigative Journalism’s data.
* last_name: the last name of the legislator who made the statement.
* dim_1: the 1st dimension DW-NOMINATE constant space score for the legislator.
* dim_2: the 2nd dimension DW-NOMINATE constant space score for the legislator.
* id: the ID of the statement (calculated by finding the MD5 hash of the date, text, document title, and speaker last name appended to one another, see `annotate.py` for implementation).
* first_name: the first name of the legislator.
* k_sentiment: the sentiment, as classified by Dr. Kreps. 1 indicates pro-drone, 0 indicates neutral, and -1 anti-drone/restricting statements.
* icpsr: the ICPSR legislator code.
* general_sentiment: the statement’s general outlook toward drones. 1 indicates a statement that is pro-drones, 0 indicates a statement that is neutral, and -1 indicates a statement that is anti-drones.
* state: the first seven letters of the legislator’s state.
* statement: the raw text of the statement, as recorded in the Congressional Record.
* party: the legislator’s party. 0 indicates Democrats, 1 indicates Republicans.
* filibuster: whether the statement was made during Rand Paul’s drone filibuster.
* congress: the Congress that the statement was made in.
* days_until_term_ends: the number of days until the legislator is up for reelection.
* loyalty: whether the legislator’s party is the same as the President. 1 if true, 0 otherwise.
* date: the date the statement was made.
* state_code: the ICPSR state code.
* dronebase: whether or not the legislator has a drone base in their home state. 0 if no, 1 if yes but without Predator/Reaper, 2 if with Predator/Reaper
* categories: the Python-object encoded categories, as classified by Heajin
* strike_recently: whether a drone strike took place within the week previous to the statement being made
* votes: the number of votes considered in the DW-NOMINATE score of the legislator
* k_restraining: whether the statement was classified by Dr. Kreps as restraining. 1 indicates statements that are restraining, while 0 indicates those that aren't.
* periphery: the standard deviations from the party norm, where positive values indicate more partisan tendencies and negative values indicate more moderate tendencies
* unsigned_periphery: the unsigned (absolute value) periphery score
* restraining: 1 if the general sentiment is -1, 0 if not, and None if general sentiment is not known
* seniority: the number of congresses the legislator has served in, including the current congress, at the time of the statement
"""

"\n* relevant: whether Heajin classified the particular statement as relevant or irrelevant. 1 indicates that the statement is relevant, while 0 indicates that it is not.\n* k_relevant: whether Dr. Kreps classified the particular statement as relevant or irrelevant. 1 indicates that the statement is relevant, while 0 indicates that it is not.\n* house: whether or not the legislator is a member of the House of Representatives at the time of the statement. 1 indicates they are a member of the House, 0 indicates they are a member of the Senate. Blank values indicate that the data is either not known or that they are a member of neither (speaker, etc).\n* sex: the sex of the legislator, according to the United States legislator biography project.\n* days_since_last_strike: the number of days that have elapsed since the most recent drone strike. Data is retrieved from https://tbij.dronescout.org, which is itself an interface for the Bureau of Investigative Journalism\xe2\x80\x99s data.\n* l

In [3]:
addl_fields = {}
with open("/Users/miles/Source/combine-cr/data/sr2.json", "r") as datain:
    for mention in json.load(datain):
        addl_fields[mention['id']] = mention

In [4]:
# LOAD AND COMBINE CLASSIFICATIONS
classifications = {}
with open("/Users/miles/Source/combine-cr/data/kreps_classifications.json", "r") as datain:
    for element in json.load(datain): # is array
        classifications[element['ID']] = {
            "k_relevant": element['RELEVANT'],
            "k_sentiment": element['CATEGORICAL SENTIMENT']
        }
with open("/Users/miles/Source/combine-cr/data/detailed_classifications.csv", "r") as datain:
    r = csv.reader(datain)
    first = True
    for row in r:
        if first:
            first = False
            continue
        relevant_str = row[0].strip()
        categories_str = row[1].strip()
        categorical_sentiments_str = row[2].strip()
        general_sentiment_str = row[3].strip()
        id_str = row[5].strip()
        
        # make categories dictionary
        categories = {}
        sentiments = categorical_sentiments_str.split(",")
        i = 0
        for category in categories_str.split(","):
            if category == "TRANSPARENCY":
                category = "DISCLOSURE" # the two were used interchangably
            try:
                categories[category.strip()] = {
                    "ANTI": -1,
                    "NEUTRAL": 0,
                    "PRO": 1,
                }[sentiments[i].strip().upper()]
            except:
                #print categorical_sentiments_str
                #print categories_str
                #print id_str
                pass
            i += 1
            
        # parse classifications
        relevant = relevant_str == "RELEVANT" # later, this is normalized into 1 and 0
        general_sentiment = None
        if general_sentiment_str in ["PRO", "ANTI", "NEUTRAL"]:
            general_sentiment = {
                "PRO": 1,
                "NEUTRAL": 0,
                "ANTI": -1
            }[general_sentiment_str]
        
        # push into main classification tracking object
        classifications[id_str]["general_sentiment"] = general_sentiment
        classifications[id_str]["categories"] = categories
        classifications[id_str]["relevant"] = relevant # later, this is normalized into 1 and 0

In [5]:
def get_congress(time):
    time = time - timedelta(days=3)
    year = time.year
    return int(math.ceil(0.5*year - 894))

In [6]:
mentions = {}
with open("/Users/miles/Source/combine-cr/data/mentions.json", "r") as datain:
    for element in json.load(datain): # is array
        mentions[element['ID']] = {
            "id": element['ID'],
            "k_relevant": classifications[element['ID']]["k_relevant"],
            "k_sentiment": classifications[element['ID']]["k_sentiment"],
            "categories": classifications[element['ID']]["categories"],
            "general_sentiment": classifications[element['ID']]["general_sentiment"],
            "relevant": classifications[element['ID']]["relevant"],
            "date": element['DATE'],
            "date_parsed": parse(element['DATE']),
            "title": element['TITLE'],
            "first_name": element['FIRST NAME'],
            "last_name": element['LAST NAME'],
            "party": element['PARTY'],
            "sex": element['SEX'],
            "state": element['STATE'],
            "statement": element['STATEMENT'],
            "congress": get_congress(parse(element['DATE']))
        }

In [7]:
strikes = [strike for strike in requests.get("https://tbij.dronescout.org/data").json().itervalues() if not isinstance(strike, str)]

In [8]:
strikedates = []

In [9]:
for strike in strikes:
    try:
        if not strike['location'].endswith("Afghanistan"):
            strikedates.append(parse(strike['date']))
    except:
        continue

In [10]:
strikedates = sorted(strikedates)

In [11]:
len(strikedates)

714

In [12]:
restrictive_mentions_by_year = {}

In [13]:
for mention in mentions.itervalues():
    latest_strike = None
    for strikedate in strikedates:
        if strikedate > mention['date_parsed']:
            break
        latest_strike = strikedate
    days_since_strike = None
    if latest_strike is not None:
        days_since_strike = (mention['date_parsed'] - latest_strike).days
    mention['days_since_strike'] = days_since_strike
    syear = str(mention['date_parsed'].year)
    if syear not in restrictive_mentions_by_year:
        restrictive_mentions_by_year[syear] = 0
    if mention['relevant'] == "Y" and mention["sentiment"] == "Anti":
        restrictive_mentions_by_year[syear] += 1
#     print mention['date']
#     print days_since_strike

In [14]:
import json
print json.dumps(restrictive_mentions_by_year, sort_keys=True)
print sum(restrictive_mentions_by_year.itervalues())
print len([
    mention for mention in mentions.itervalues() 
    if mention['relevant'] == "Y" 
    and mention['sentiment'] == "Anti"])

{"2000": 0, "2001": 0, "2002": 0, "2003": 0, "2004": 0, "2005": 0, "2006": 0, "2007": 0, "2008": 0, "2009": 0, "2010": 0, "2011": 0, "2012": 0, "2013": 0, "2014": 0, "2015": 0, "2016": 0, "2017": 0}
0
0


In [15]:
# ADD DW-NOMINATE TO THE DATASET

def decode_dw_row(row):
    # decodes row such as:
    #  114   29774  71  24  CALIFOR    100  0  1   CAPPS         -0.389   -0.227      -81.84920   1077     26    0.927
    # format: congress, icpsr, state code, district number (0 if senate or president), state name, party code (100=dem, 200=republican), occupancy, office attainment type, name, 1st dimension coord, 2nd dimension coord, log likelyhood, # of votes, # of classification errors, geometric mean probability
    elements = [element.strip() for element in row.split("  ") if element.strip() != ""]
    if len(elements) == 16:
        del elements[9]
    if len(elements) != 15:
        print "error: != 15 elements in a row! {}".format(len(elements))
        print row
        raise Exception("!= 15 elements in row")
    return {
        "latest_congress": elements[0],
        "icpsr": elements[1],
        "state_code": elements[2],
        "district_number": elements[3],
        "state": elements[4],
        "party_code": int(elements[5]),
        "occupancy": elements[6],
        "office_attainment_type": elements[7],
        "last_name": elements[8].split(" ")[0],
        "dim_1": float(elements[9]),
        "dim_2": float(elements[10]),
        "log_likelyhood": float(elements[11]),
        "votes": int(elements[12]),
        "classification_errors": float(elements[13]),
        "geometric_mean_probability": float(elements[14])
    }
dw_nominate_scores = {}
raw_legislators = []
_already_included_legislators = []
minimum_congress = 106
terms = {}
with open("/Users/miles/Source/combine-cr/data/DW-NOMINATE.txt", "r") as dw:
    for row in dw.readlines():
        dat = decode_dw_row(row)
        if int(dat['latest_congress']) < 106:
            continue
        party = ""
        if dat['party_code'] == 100:
            party = "Democrat"
        if dat['party_code'] == 200:
            party = "Republican"
        dat["party"] = party
        name = dat['last_name'] + dat['state'] + str(dat['latest_congress'])
        dw_nominate_scores[name.lower()] = dat
        if dat['icpsr'] not in _already_included_legislators and dat['latest_congress'] >= minimum_congress:
            raw_legislators.append(dat)
            _already_included_legislators.append(dat['icpsr'])
        
        if dat['icpsr'] not in terms:
            terms[dat['icpsr']] = []
        if int(dat['latest_congress']) not in terms[dat['icpsr']]:
            terms[dat['icpsr']].append(int(dat['latest_congress']))

print len(raw_legislators)

1145


In [16]:
def get_seniority(time, legislator):
    congress = get_congress(time)
    terms_served = terms[legislator['icpsr']]
    count = 0
    for term in terms_served:
        if term <= congress:
            count += 1
    return count

In [17]:
republican_std_dev = numpy.std([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 200])
democrat_std_dev = numpy.std([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 100])

republican_mean = numpy.mean([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 200])
democrat_mean = numpy.mean([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 100])

def periphery(legislator):
    if "party_code" not in legislator:
        return None # legislator not known
    if legislator["party_code"] not in [100, 200]:
        print legislator["party_code"]
        return None
    party_std_dev = None # not pythonic, not a problem
    party_mean = None
    opposite_mean = None
    if legislator["party_code"] is 200:
        party_std_dev = republican_std_dev
        party_mean = republican_mean
        opposite_mean = democrat_mean
    elif legislator["party_code"] is 100:
        party_std_dev = democrat_std_dev
        party_mean = democrat_mean
        opposite_mean = republican_mean
    
    score = abs(party_mean - legislator["dim_1"]) / party_std_dev
    
    neg = (legislator["dim_1"] < party_mean) == (opposite_mean < party_mean)
    if neg:
        score *= -1
        
    return score

In [18]:
print "Republican standard deviation: " + str(republican_std_dev)
print "Democrat standard deviation: " + str(democrat_std_dev)
print "Republican mean 1st dimension score: " + str(republican_mean)
print "Democrat mean 1st dimension score: " + str(democrat_mean)

Republican standard deviation: 0.160409314961
Democrat standard deviation: 0.139125844637
Republican mean 1st dimension score: 0.439763665595
Democrat mean 1st dimension score: -0.336179190751


In [19]:
#print [legislator["last_name"] + str(periphery(legislator)) for legislator in sorted(raw_legislators, key=lambda k: periphery(k))]

In [20]:
# MERGE DW-NOMINATE DATA WITH LEGISLATOR DATA
for mention in mentions.itervalues():
    name = (mention['last_name'] + mention['state'][:7] + str(get_congress(mention['date_parsed']))).lower()
    if name in dw_nominate_scores:
        dw_nominate_data = dw_nominate_scores[name]
        for key in dw_nominate_data.iterkeys():
            mention[key] = dw_nominate_data[key]

In [21]:
from datetime import date
def get_current_executive_party(statement_date):
    if statement_date > date(2017,1,19):
        return 0
    if statement_date > date(2009,1,19):
        return 1
    if statement_date > date(2001, 1, 19):
        return 0
    return None

In [22]:
states_1 =   ["WASHINGTON",
            "OREGON",
            "MONTANA",
            "MINNESOTA",
            "UTAH",
            "COLORADO",
            "KANSAS",
            "LOUISIANA",
            "MISSISSIPPI",
            "KENTUCKY",
            "ALABAMA",
            "INDIANA",
            "WEST VIRGINIA",
            "FLORIDA",
            "VIRGINIA",
            "PENNSYLVANIA",
            "NORTH CAROLINA",
            "MARYLAND",
            "NEW JERSEY",
            "NEW HAMPSHIRE",
            "ARKANSAS",
            "HAWAII"]

states_2 =   [
            "NORTH DAKOTA",
            "CALIFORNIA",
            "NEVADA",
            "ARIZONA",
            "NEW MEXICO",
            "TEXAS",
            "GEORGIA",
            "NEW YORK",
            "SOUTH DAKOTA",
            "NEVADA"]

# data according to https://publicintelligence.net/dod-us-drone-activities-map/
# states_1 is all states without predators and reapers, states_2 are those with

def drone_state_status(statename):
    statename = statename.strip()
    if statename.lower() == "unknown":
        return None
    if statename.upper()[:7] in [state[:7] for state in states_1]:
        return 1
    if statename.upper()[:7] in [state[:7] for state in states_2]:
        return 2
    return 0

In [23]:
# encode CATEGORIES

categories = []

# find all the categories
for z in mentions.itervalues():
    for category in z['categories']:
        if category not in categories:
            categories.append(category)
            
print categories

# create columns
for z in mentions.itervalues():
    for category in categories:
        present = 0
        sentiment = None
        if category in z['categories']:
            present = 1
            sentiment = z['categories'][category]
        z['CATEGORY_' + category] = present
        z['CATEGORICAL_SENTIMENT_' + category] = sentiment

[u'DISCLOSURE', u'HUMANITARIAN', u'FOREIGNPOLICY', u'REFERENCE', u'DOMESTIC-LEGALITY', u'FINANCE', u'TECHNOLOGY', u'FOREIGNUSE', u'INTERNATIONAL-LEGALITY', u'PTSD', u'JOBS']


In [24]:
# normalize data that is not already normalized

mentions_compiled = []
for z in mentions.itervalues():
    
    if z['id'] in addl_fields:
        z['days_until_term_ends'] = addl_fields[z['id']]['days_until_term_ends']

    # party scores
    try:
        if z['party'] == "Democrat":
            z['party'] = 1
        elif z['party'] == "Republican":
            z['party'] = 0
        else:
            z['party'] = None
    except:
        z['party'] = None
        
    # Kreps sentiment
    if z['k_sentiment']:
        if z['k_sentiment'].strip().lower() == "anti":
            z['k_sentiment'] = -1
        elif z['k_sentiment'].strip().lower() == "pro":
            z['k_sentiment'] = 1
        elif z['k_sentiment'].strip().lower() == "neutral":
            z['k_sentiment'] = 0
            
    # restraining
    if z['k_sentiment']:
        if z['k_sentiment'] == 1 or z['k_sentiment'] == 0:
            z['k_restraining'] = 0
        if z['k_sentiment'] == -1:
            z['k_restraining'] = 1            
    else:
        z['k_restraining'] = None
        
    # Kreps relevant scores
    try:
        if z['k_relevant'] == "Y":
            z['k_relevant'] = 1
        elif z['k_relevant'] == "N":
            z['k_relevant'] = 0
        else:
            z['k_relevant'] = None
    except e:
        print e
        z['k_relevant'] = None
        
    # relevant scores
    try:
        if z['relevant'] == True:
            z['relevant'] = 1
        elif z['relevant'] == False:
            z['relevant'] = 0
        else:
            z['relevant'] = None
    except:
        z['relevant'] = None
        
    # code sex into dataset (can't code gender, not binary, though neither is sex... oh well, STATA doesn't care...)
    try:
        if z['sex'] == "M":
            z['sex'] = 1
        elif z['sex'] == "F":
            z['sex'] = 0
        else:
            z['sex'] = None
    except:
        z['sex'] = None
        
    # absolute value perphery
    z['unsigned_periphery'] = None
    if 'periphery' in z and z['periphery'] is not None:
        z['unsigned_periphery'] = abs(z['periphery'])
        
    # recent scores
    try:
        if z['days_since_strike'] <= 7:
            z['strike_recently'] = 1
        else:
            z['strike_recently'] = 0
    except:
        z['strike_recently'] = None
        
    # loyalty
    if 'date_parsed' in z and 'party' in z:
        if z['party'] == get_current_executive_party(z['date_parsed'].date()):
            z['loyalty'] = 1
        else:
            z['loyalty'] = 0
    else:
        z['loyalty'] = None
        
    # general restraining
    z['restraining'] = None
    if 'general_sentiment' in z and z['general_sentiment'] == -1:
        z['restraining'] = 1
    else:
        z['restraining'] = 0
    
    # filibuster
    z['filibuster'] = 0
    if 'icpsr' in z and 'date_parsed' in z and z['icpsr'] is not None and int(z['icpsr']) == 41104 and z['date_parsed'].date() == date(2013, 3, 6): # rand paul on the day of his filibuster
        z['filibuster'] = 1
            
    # party periphery
    z['periphery'] = periphery(z)
    
    # legislative vulnerability
    # todo
        
    # dronebase
    if 'state' in z and z['state']:
        z['dronebase'] = drone_state_status(z['state'])
    else:
        z['dronebase'] = None
        
    # house — somewhat unreliable for legislators who have served in both
    if 'dim_1' in z:
        if 'district_number' in z and z['district_number'] is not None:
            if int(z['district_number']) != 0:
                z['house'] = 1 # house
            else:
                z['house'] = 0 # senate
    else:
        z['house'] = None
    
    if 'icpsr' in z and z['icpsr'] is not None:
        z['seniority'] = get_seniority(z['date_parsed'], z)
    
    # delete unneded variables
    try:
        del z['date_parsed']
        del z['classification_errors']
        del z['party_code']
        del z['district_number']
        del z['occupancy']
        del z['geometric_mean_probability']
        del z['log_likelyhood']
        del z['latest_congress']
        del z['office_attainment_type']
    except:
        pass
        
    mentions_compiled.append(z)

In [25]:
keys = []
for z in mentions_compiled:
    for key in z.iterkeys():
        if key not in keys:
            keys.append(key)
print keys

['relevant', 'restraining', u'CATEGORICAL_SENTIMENT_DISCLOSURE', 'k_relevant', u'CATEGORICAL_SENTIMENT_JOBS', 'house', 'sex', u'CATEGORY_FOREIGNPOLICY', 'days_since_strike', 'last_name', u'CATEGORICAL_SENTIMENT_REFERENCE', u'CATEGORICAL_SENTIMENT_FOREIGNUSE', 'seniority', u'CATEGORY_PTSD', 'dim_2', 'id', 'dim_1', 'first_name', u'CATEGORY_DISCLOSURE', 'k_sentiment', 'icpsr', 'title', u'CATEGORICAL_SENTIMENT_FOREIGNPOLICY', 'general_sentiment', u'CATEGORICAL_SENTIMENT_PTSD', 'state', u'CATEGORICAL_SENTIMENT_INTERNATIONAL-LEGALITY', 'statement', u'CATEGORICAL_SENTIMENT_FINANCE', 'party', u'CATEGORY_REFERENCE', 'filibuster', u'CATEGORY_INTERNATIONAL-LEGALITY', u'CATEGORICAL_SENTIMENT_DOMESTIC-LEGALITY', u'CATEGORY_HUMANITARIAN', 'congress', 'days_until_term_ends', 'loyalty', u'CATEGORY_JOBS', u'CATEGORY_DOMESTIC-LEGALITY', 'date', 'state_code', 'dronebase', 'categories', u'CATEGORY_FOREIGNUSE', u'CATEGORY_TECHNOLOGY', u'CATEGORY_FINANCE', 'periphery', u'CATEGORICAL_SENTIMENT_TECHNOLOGY', u

In [26]:
import unicodecsv as csv
with open("data_combined.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(keys)
    for z in mentions_compiled:
        if z['relevant'] == z['k_relevant']:
            continue
        row = []
        for key in keys:
            if key in z and z[key] != "Unknown":
                row.append(z[key])
            else:
                row.append(None)
        writer.writerow(row)

In [27]:
agreeing_relevance = 0
for mention in mentions_compiled:
    if mention["relevant"] == mention["k_relevant"]:
        agreeing_relevance += 1
        
total = float(len(mentions_compiled))
print "relevance match: " + str(agreeing_relevance / total)
print "total: " + str(total)
print "Kreps relevant: " + str(len([mention for mention in mentions_compiled if mention["k_relevant"] == 1]))
print "relevant: " + str(len([mention for mention in mentions_compiled if mention["relevant"] == 1]))

relevance match: 0.701454234388
total: 1169.0
Kreps relevant: 795
relevant: 787
