In [1]:
import json
from dateutil.parser import parse
from datetime import timedelta
import requests
import math
import unicodecsv as csv
import numpy

In [2]:
"""
* house: whether or not the legislator is a member of the House of Representatives at the time of the statement. 1 indicates they are a member of the House, 0 indicates they are a member of the Senate. Blank values indicate that the data is either not known or that they are a member of neither (speaker, etc).
* sex: the sex of the legislator, according to the United States legislator biography project.
* last_name: the last name of the legislator who made the statement.
* dim_1: the 1st dimension DW-NOMINATE constant space score for the legislator.
* dim_2: the 2nd dimension DW-NOMINATE constant space score for the legislator.
* id: the ID of the statement (calculated by finding the MD5 hash of the date, text, document title, and speaker last name appended to one another, see `annotate.py` for implementation).
* first_name: the first name of the legislator.
* icpsr: the ICPSR legislator code.
* state: the first seven letters of the legislator’s state.
* statement: the raw text of the statement, as recorded in the Congressional Record.
* party: the legislator’s party. 1 indicates Democrats, 0 indicates Republicans.
* congress: the Congress that the statement was made in.
* days_until_term_ends: the number of days until the legislator is up for reelection.
* loyalty: whether the legislator’s party is the same as the President. 1 if true, 0 otherwise.
* date: the date the statement was made.
* state_code: the ICPSR state code.
* votes: the number of votes considered in the DW-NOMINATE score of the legislator
* periphery: the standard deviations from the party norm, where positive values indicate more partisan tendencies and negative values indicate more moderate tendencies
* unsigned_periphery: the unsigned (absolute value) periphery score
* seniority: the number of congresses the legislator has served in, including the current congress, at the time of the statement
"""

'\n* house: whether or not the legislator is a member of the House of Representatives at the time of the statement. 1 indicates they are a member of the House, 0 indicates they are a member of the Senate. Blank values indicate that the data is either not known or that they are a member of neither (speaker, etc).\n* sex: the sex of the legislator, according to the United States legislator biography project.\n* last_name: the last name of the legislator who made the statement.\n* dim_1: the 1st dimension DW-NOMINATE constant space score for the legislator.\n* dim_2: the 2nd dimension DW-NOMINATE constant space score for the legislator.\n* id: the ID of the statement (calculated by finding the MD5 hash of the date, text, document title, and speaker last name appended to one another, see `annotate.py` for implementation).\n* first_name: the first name of the legislator.\n* icpsr: the ICPSR legislator code.\n* state: the first seven letters of the legislator\xe2\x80\x99s state.\n* statement

In [3]:
def get_congress(time):
    time = time - timedelta(days=3)
    year = time.year
    return int(math.ceil(0.5*year - 894))

In [4]:
# ADD DW-NOMINATE TO THE DATASET

def decode_dw_row(row):
    # decodes row such as:
    #  114   29774  71  24  CALIFOR    100  0  1   CAPPS         -0.389   -0.227      -81.84920   1077     26    0.927
    # format: congress, icpsr, state code, district number (0 if senate or president), state name, party code (100=dem, 200=republican), occupancy, office attainment type, name, 1st dimension coord, 2nd dimension coord, log likelyhood, # of votes, # of classification errors, geometric mean probability
    elements = [element.strip() for element in row.split("  ") if element.strip() != ""]
    if len(elements) == 16:
        del elements[9]
    if len(elements) != 15:
        print "error: != 15 elements in a row! {}".format(len(elements))
        print row
        raise Exception("!= 15 elements in row")
    return {
        "latest_congress": elements[0],
        "icpsr": elements[1],
        "state_code": elements[2],
        "district_number": elements[3],
        "state": elements[4],
        "party_code": int(elements[5]),
        "occupancy": elements[6],
        "office_attainment_type": elements[7],
        "last_name": elements[8].split(" ")[0],
        "dim_1": float(elements[9]),
        "dim_2": float(elements[10]),
        "log_likelyhood": float(elements[11]),
        "votes": int(elements[12]),
        "classification_errors": float(elements[13]),
        "geometric_mean_probability": float(elements[14])
    }
dw_nominate_scores = {}
raw_legislators = []
_already_included_legislators = []
minimum_congress = 106
terms = {}
with open("/Users/miles/Source/combine-cr/data/DW-NOMINATE.txt", "r") as dw:
    for row in dw.readlines():
        dat = decode_dw_row(row)
        if int(dat['latest_congress']) < 106:
            continue
        party = ""
        if dat['party_code'] == 100:
            party = "Democrat"
        if dat['party_code'] == 200:
            party = "Republican"
        dat["party"] = party
        name = dat['last_name'] + dat['state'] + str(dat['latest_congress'])
        dw_nominate_scores[name.lower()] = dat
        if dat['icpsr'] not in _already_included_legislators and dat['latest_congress'] >= minimum_congress:
            raw_legislators.append(dat)
            _already_included_legislators.append(dat['icpsr'])
        
        if dat['icpsr'] not in terms:
            terms[dat['icpsr']] = []
        if int(dat['latest_congress']) not in terms[dat['icpsr']]:
            terms[dat['icpsr']].append(int(dat['latest_congress']))

print len(raw_legislators)

1145


In [5]:
mentions = {}
with open("/Users/miles/Permanent/ghraib.json", "r") as datain:
    for element in json.load(datain): # is array
        party_code = None
        if element['party'] == "Republican":
            party_code = 200
        elif element['party'] == "Democrat":
            party_code = 100
            
        dim_1 = None
        if element['dim_1'] is not None and element['dim_1'] != "Unknown":
            dim_1 = float(element['dim_1'])
    
        dim_2 = None
        if element['dim_2'] is not None and element['dim_2'] != "Unknown":
            dim_2 = float(element['dim_2'])
        
        mentions[element['id']] = {
            "id": element['id'],
            "date": element['date'],
            "date_parsed": parse(element['date']),
            "title": element['title'],
            "first_name": element['first_name'],
            "last_name": element['last_name'],
            "party": element['party'],
            "sex": element['sex'],
            "state": element['state'],
            "statement": element['statement'],
            "congress": get_congress(parse(element['date'])),
            "dim_1": dim_1,
            "dim_2": dim_2,
            "party_code": party_code,
        }

In [6]:
def get_seniority(time, legislator):
    congress = get_congress(time)
    terms_served = terms[legislator['icpsr']]
    count = 0
    for term in terms_served:
        if term <= congress:
            count += 1
    return count

In [7]:
republican_std_dev = numpy.std([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 200])
democrat_std_dev = numpy.std([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 100])

republican_mean = numpy.mean([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 200])
democrat_mean = numpy.mean([legislator["dim_1"] for legislator in raw_legislators if legislator["party_code"] == 100])

def periphery(legislator):
    if legislator['dim_1'] is None or legislator['dim_2'] is None:
        return None
    
    if "party_code" not in legislator:
        return None # legislator not known
    if legislator["party_code"] not in [100, 200]:
        print legislator["party_code"]
        return None
    party_std_dev = None # not pythonic, not a problem
    party_mean = None
    opposite_mean = None
    if legislator["party_code"] is 200:
        party_std_dev = republican_std_dev
        party_mean = republican_mean
        opposite_mean = democrat_mean
    elif legislator["party_code"] is 100:
        party_std_dev = democrat_std_dev
        party_mean = democrat_mean
        opposite_mean = republican_mean
    
    score = abs(party_mean - legislator["dim_1"]) / party_std_dev
    
    neg = (legislator["dim_1"] < party_mean) == (opposite_mean < party_mean)
    if neg:
        score *= -1
        
    return score

In [8]:
print "Republican standard deviation: " + str(republican_std_dev)
print "Democrat standard deviation: " + str(democrat_std_dev)
print "Republican mean 1st dimension score: " + str(republican_mean)
print "Democrat mean 1st dimension score: " + str(democrat_mean)

Republican standard deviation: 0.160409314961
Democrat standard deviation: 0.139125844637
Republican mean 1st dimension score: 0.439763665595
Democrat mean 1st dimension score: -0.336179190751


In [9]:
from datetime import date
def get_current_executive_party(statement_date):
    if statement_date > date(2017,1,19):
        return 0
    if statement_date > date(2009,1,19):
        return 1
    if statement_date > date(2001, 1, 19):
        return 0
    return None

In [10]:
# normalize data that is not already normalized

mentions_compiled = []
for z in mentions.itervalues():
    
#     if z['id'] in addl_fields:
#         z['days_until_term_ends'] = addl_fields[z['id']]['days_until_term_ends']

    # party scores
    try:
        if z['party'] == "Democrat":
            z['party'] = 1
        elif z['party'] == "Republican":
            z['party'] = 0
        else:
            z['party'] = None
    except:
        z['party'] = None
        
    # code sex into dataset (can't code gender, not binary, though neither is sex... oh well, STATA doesn't care...)
    try:
        if z['sex'] == "M":
            z['sex'] = 1
        elif z['sex'] == "F":
            z['sex'] = 0
        else:
            z['sex'] = None
    except:
        z['sex'] = None
        
    # absolute value perphery
    z['unsigned_periphery'] = None
    if 'periphery' in z and z['periphery'] is not None:
        z['unsigned_periphery'] = abs(z['periphery'])
        
    # loyalty
    if 'date_parsed' in z and 'party' in z:
        if z['party'] == get_current_executive_party(z['date_parsed'].date()):
            z['loyalty'] = 1
        else:
            z['loyalty'] = 0
    else:
        z['loyalty'] = None
            
    # party periphery
    z['periphery'] = periphery(z)
        
    # house — somewhat unreliable for legislators who have served in both
    if 'dim_1' in z:
        if 'district_number' in z and z['district_number'] is not None:
            if int(z['district_number']) != 0:
                z['house'] = 1 # house
            else:
                z['house'] = 0 # senate
    else:
        z['house'] = None
    
    if 'icpsr' in z and z['icpsr'] is not None:
        z['seniority'] = get_seniority(z['date_parsed'], z)
    
    # delete unneded variables
    try:
        del z['date_parsed']
        del z['classification_errors']
        del z['party_code']
        del z['district_number']
        del z['occupancy']
        del z['geometric_mean_probability']
        del z['log_likelyhood']
        del z['latest_congress']
        del z['office_attainment_type']
    except:
        pass
        
    mentions_compiled.append(z)

None
None
None
None
None
None
None
None
None


In [11]:
keys = []
for z in mentions_compiled:
    for key in z.iterkeys():
        if key not in keys:
            keys.append(key)
print keys

['last_name', 'congress', 'loyalty', 'sex', 'party_code', 'date', 'dim_2', 'id', 'dim_1', 'first_name', 'title', 'state', 'periphery', 'statement', 'party', 'unsigned_periphery']


In [12]:
import unicodecsv as csv
with open("data_combined.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(keys)
    for z in mentions_compiled:
        row = []
        for key in keys:
            if key in z and z[key] != "Unknown":
                row.append(z[key])
            else:
                row.append(None)
        writer.writerow(row)