# Imports and Setup

In [2]:
import requests
import urllib
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
from wordcloud import WordCloud
import re
from unidecode import unidecode
import scipy
import scipy.stats

%matplotlib inline

In [None]:
# If this is True, searches will start with the member's abbreviated title,
#      e.g. "Sen. Barbara Mikulski" (the `title` field is used, not `title_long`)
# If False, they will start with, e.g. "Barbara Mikulski"
INCLUDE_TITLE = False

# Autosuggestion Collection
This function handles the core process of collecting autosuggestion data from Google or Bing.

In [None]:
def collect_autosuggestions(source, tld, lang, query, timeout=10):
    """Return the autocomplete suggestions a search engine offers for a query.

    Parameters:
    "source" is either "google" or "bing"
    "tld" stands for "top level domain" and can be any of the 2-letter country
         codes listed here where google operates:
         https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
    "lang" is the language of the suggestions returned, should be a two letter
         code from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
         (only sent to Google; for Bing the language is controlled by the tld,
         so this parameter has no effect on its own)
    "query" is the query that you would like to see autocompleted
    "timeout" is the number of seconds to wait for the server before giving up

    Return value:
    The second element of the JSON response, which holds the suggestions.

    Raises:
    ValueError if "source" is not one of the supported engines.
    requests.exceptions.RequestException on a network failure, timeout or
         non-success HTTP status.
    """
    # Validate first: previously an unknown source left `url` unassigned and the
    # function died later with an unhelpful UnboundLocalError.
    if source not in ("google", "bing"):
        raise ValueError('Unknown source %r: expected "google" or "bing"' % (source,))

    encoded_query = query.encode('utf-8')

    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.' + tld + '/complete/search?&client=firefox&%s' % (
            urllib.urlencode({'q': encoded_query, 'hl': lang}))
    else:
        url = 'http://api.bing.com/osjson.aspx?%s' % (
            urllib.urlencode({'query': encoded_query, 'cc': tld}))

    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors (e.g. rate limiting) here instead of as a JSON decoding failure
    r.raise_for_status()
    suggestions = r.json()[1]
    return suggestions

# Download Congress Member Data from Govtrack.us

In [3]:
# Fetch the current congressional roles from the govtrack API (at most 800 records).
# NOTE: despite its name, `members_text` is the requests response object; the
# body text is read from its `.text` attribute below.
members_text = requests.get("https://www.govtrack.us/api/v2/role?current=true&limit=800", timeout=30)
# Stop here with a clear HTTP error rather than failing later while parsing the body
members_text.raise_for_status()
members_data = json.loads(members_text.text)
# One dict per role; each carries a nested 'person' dict plus role-level fields
members = members_data['objects']

In [4]:
# Sanity check: how many role records came back (the request asked for at most 800)
len(members)

542

# Find autosuggestions for the Congress members
While doing so, put the attributes we care about into a Pandas dataframe

In [41]:
# Parallel lists, one entry per member, holding the attributes we care about
name, auto_suggestion, gender, party, rank = [], [], [], [], []
caucus, firstname, lastname, title, title_long = [], [], [], [], []

# Show one raw record so the available fields are visible
print(members[0])

for member in members:
    person = member['person']

    # Attributes nested under the person record
    name.append(person['name'])
    gender.append(person['gender'])
    firstname.append(person['firstname'])
    lastname.append(person['lastname'])

    # Attributes stored on the role record itself
    party.append(member['party'])
    rank.append(member['senator_rank'])
    caucus.append(member['caucus'])
    title.append(member['title'])
    title_long.append(member['title_long'])

    # The title prefix is only added when INCLUDE_TITLE is set
    search_string = (member['title'] + ' ') if INCLUDE_TITLE else ''
    search_string += person['firstname'] + ' ' + person['lastname']

    # One request per member against google.com, in English
    suggestions = collect_autosuggestions("google", "com", "en", search_string)
    auto_suggestion.append(suggestions)

{u'senator_rank': u'junior', u'extra': {u'rss_url': u'http://www.blunt.senate.gov/public/?a=rss.feed', u'fax': u'202-224-8149', u'contact_form': u'http://www.blunt.senate.gov/public/index.cfm/contact-form?p=contact-roy', u'office': u'260 Russell Senate Office Building', u'address': u'260 Russell Senate Office Building Washington DC 20510'}, u'congress_numbers': [112, 113, 114], u'id': 268, u'startdate': u'2011-01-05', u'senator_class_label': u'Class 3', u'district': None, u'title': u'Sen.', u'title_long': u'Senator', u'current': True, u'state': u'MO', u'party': u'Republican', u'leadership_title': None, u'website': u'http://www.blunt.senate.gov', u'description': u'Junior Senator from Missouri', u'phone': u'202-224-5721', u'role_type': u'senator', u'role_type_label': u'Senator', u'enddate': u'2017-01-03', u'senator_rank_label': u'Junior', u'person': {u'name': u'Sen. Roy Blunt [R-MO]', u'firstname': u'Roy', u'twitterid': u'RoyBlunt', u'middlename': u'', u'gender': u'male', u'bioguideid': 

In [6]:
# Assemble the per-member lists into one dataframe, one column per attribute.
# NOTE(review): the `caucus` list is not included as a column here - confirm
# whether that omission is intentional.
dict_total = dict(
    name=name,
    gender=gender,
    suggestion=auto_suggestion,
    party=party,
    rank=rank,
    firstname=firstname,
    lastname=lastname,
    title=title,
    title_long=title_long
)
df = pd.DataFrame(dict_total)

# Functions for analysis

In [7]:
def find_people_for_term(df_in, term):
    """Find every autosuggestion that contains a term, and who it belongs to.

    Parameters:
    "df_in" is a Pandas dataframe containing (at least) a 'suggestion' column
         holding each member's list of autosuggestions
    "term" is the term to be found in the autosuggestions; matching is by
         substring and ignores case

    Return values:
    "found" is a list of the complete suggestions containing term
    "people" is a list of the dataframe rows of the members who had a
         suggestion including term. It is parallel to "found", so a member
         appears once for each of their matching suggestions.
    """
    # Lower-case the term once, and lower-case each suggestion as well: the
    # original only normalised the term, so a mixed-case suggestion never matched.
    needle = term.lower()

    found = []
    people = []
    for _, row in df_in.iterrows():
        for suggestion in row['suggestion']:
            if needle in suggestion.lower():
                found.append(suggestion)
                people.append(row)
    return found, people

In [39]:
def compare_lists(list1, list2, len1, len2, min_count=8):
    """Score how unevenly each suggested word is split between two groups.

    Parameters:
    "list1" and "list2" are the lists of words suggested for each category. Each
         autosuggestion of a word is represented by a duplicate of the word in
         the list.
    "len1" and "len2" are the number of people in the categories each list came
         from
    "min_count" is the minimum combined number of occurrences (n1 + n2) a word
         needs before it is tested; rarer words are left out of "combined_dict"

    Return values (as a 3-tuple):
    "combined_dict" is a dictionary where the keys are words from the lists of
         autosuggestions and the values are the reciprocal of the p-value given
         by a chi-squared test of the form below, where n1 is the number of
         times the word appears in list1 and n2 is the number of times the word
         appears in list2. The value is negative when the word is suggested at
         a higher rate in group 2 and positive otherwise.

           +---------------+---------+---------+
           |               | Group 1 | Group 2 |
           +---------------+---------+---------+
           | Suggested     | n1      |      n2 |
           +---------------+---------+---------+
           | Not Suggested | len1-n1 | len2-n2 |
           +---------------+---------+---------+

         Words are skipped when they are empty, too rare (see "min_count"), or
         when the table is degenerate (an empty group, or a word suggested for
         every person in both groups) because the test is undefined there.
         A p-value that underflows to zero is reported as infinity.
    "dict1" and "dict2" are dictionaries where the key is a word that was
         suggested and the value is the number of times it appeared in list1 or
         list2, respectively. Both contain every word from either list, with a
         count of 0 where the word did not appear.
    """
    dict1 = {}
    for word in list1:
        dict1[word] = dict1.get(word, 0) + 1

    dict2 = {}
    for word in list2:
        dict2[word] = dict2.get(word, 0) + 1

    # Give both dictionaries the same key set, zero-filling the missing words
    for word in dict2:
        dict1.setdefault(word, 0)
    for word in dict1:
        dict2.setdefault(word, 0)

    combined_dict = {}

    # Floats so the rate comparison below is a true division on Python 2 as well
    len1 = float(len1)
    len2 = float(len2)

    # An empty group gives an all-zero table column (and a division by zero in
    # the rate comparison), so no word can be tested.
    if len1 == 0 or len2 == 0:
        return (combined_dict, dict1, dict2)

    for word in dict1:
        n1 = dict1[word]
        n2 = dict2[word]
        if n1 + n2 < min_count or len(word) == 0:
            continue

        not_suggested1 = len1 - n1
        not_suggested2 = len2 - n2
        # A word suggested for everybody leaves the "Not Suggested" row empty;
        # chi2_contingency rejects tables with a zero expected frequency.
        if not_suggested1 + not_suggested2 == 0:
            continue

        table = [[n1, n2], [not_suggested1, not_suggested2]]
        _, p_value, _, _ = scipy.stats.chi2_contingency(table)

        # For extreme tables the p-value underflows to exactly 0.0
        score = 1 / p_value if p_value > 0 else float('inf')

        if n2 / len2 > n1 / len1:
            score = -score
        combined_dict[word] = score

    return (combined_dict, dict1, dict2)

In [9]:
def get_terms_list(df_in):
    """Reduce each member's autosuggestions to the terms that follow their name.

    Parameters:
    "df_in" is a Pandas dataframe containing (at least) the columns 'name',
         'firstname', 'lastname', 'title', 'title_long' and 'suggestion' (a list
         of autosuggestions) for each member

    Return values:
    "fin_list" is a list of the terms suggested as searches for the members in
         df_in, based on removing their names and titles. If a term is
         suggested for multiple members it appears the corresponding number of
         times, but a maximum of once per member. A suggestion made up only of
         skipped words yields the empty string as its term.
    """
    fin_list = []

    for _, member in df_in.iterrows():
        # The skip words depend only on the member, so build them once per
        # member rather than once per suggestion.
        skip_words = ['congressman', 'congresswoman',
                      member['firstname'], member['lastname'],
                      re.sub(r'\W+', '', member['title']), member['title_long']]
        skip_words += member['name'].split(' ')
        skip_words = set(word.lower() for word in skip_words)

        member_terms = []
        seen = set()
        for suggestion in member['suggestion']:
            # Tokens of one or two characters are dropped; skip words are
            # matched ignoring case, since they are stored lower-cased.
            kept = [token for token in suggestion.split(' ')
                    if len(token) > 2 and token.lower() not in skip_words]
            term = " ".join(kept)
            # Count each term at most once per member
            if term not in seen:
                seen.add(term)
                member_terms.append(term)

        fin_list.extend(member_terms)

    return fin_list

In [40]:
def compare_suggestions(df_a, df_b, output = True):
    """Compare the terms suggested for two groups of members.

    Parameters:
    "df_a" and "df_b" are Pandas dataframes containing (at least) the names of
         members and lists of autosuggestions, for each of the two categories
         being compared
    "output" is whether or not to print the results of the comparison

    Return values:
    "sorted_keys" is a list of the scored terms in ascending order of score, so
         terms suggested at a higher rate for group b come first and those
         suggested at a higher rate for group a come last
    "compared_dict" is a dictionary where the keys are the terms suggested as
         searches and the values are signed reciprocals of p-values (see
         compare_lists)
    """
    compared_dict, dict_a, dict_b = compare_lists(
        get_terms_list(df_a), get_terms_list(df_b), len(df_a), len(df_b))

    sorted_keys = sorted(compared_dict, key=lambda term: compared_dict[term])

    if not output:
        return sorted_keys, compared_dict

    # One line for the score, then an indented line with the raw counts
    for key in sorted_keys:
        print(key + ": " + str(compared_dict[key]))
        print("        a: %d b: %d" % (dict_a.get(key, 0), dict_b.get(key, 0)))

    return sorted_keys, compared_dict

# Comparison by gender

In [42]:
# Split the members by gender; rows with any other gender value fall in neither frame
df_male = df.loc[df["gender"] == "male"]
df_female = df.loc[df["gender"] == "female"]

print("Female members of Congress: " + str(len(df_female)))
print("Male members of Congress: " + str(len(df_male)))

Female members of Congress: 108
Male members of Congress: 434


In [30]:
# Group a = female members, group b = male members: negative scores mark terms
# suggested at a higher rate for men, positive scores terms suggested at a
# higher rate for women.
suggestions, values = compare_suggestions(df_female, df_male)

wife: -185.542404295
        a: 0 b: 34
obituary: -39.6956507271
        a: 0 b: 24
congress: -35.8998365183
        a: 9 b: 76
linkedin: -13.3595718836
        a: 0 b: 17
actor: -5.07693411748
        a: 0 b: 11
primary: -4.52516441257
        a: 6 b: 43
supreme court: -3.62007203653
        a: 3 b: 26
ted cruz: -3.61752153834
        a: 0 b: 9
donald trump: -3.0390795124
        a: 1 b: 14
committee assignments: -2.85224978844
        a: 4 b: 29
email address: -1.44123903022
        a: 1 b: 9
net worth: -1.37606519554
        a: 7 b: 35
endorsement: -1.33585570201
        a: 2 b: 13
trump: -1.21625654515
        a: 5 b: 25
contact: -1.1621775395
        a: 4 b: 17
immigration: -1.0716614808
        a: 1 b: 7
for senate: -1.05318960385
        a: 4 b: 18
voting record: -1.0413606222
        a: 11 b: 46
retiring: 1.05859915939
        a: 4 b: 13
website: 1.0716614808
        a: 2 b: 6
youtube: 1.0716614808
        a: 2 b: 6
address: 1.1621775395
        a: 5 b: 16
wiki: 1.20532482591
 

To check for name collisions, examine the results of some of the more dubious queries.
This code prints the specific queries for each keyword.

In [44]:
# Look at both ends of the sorted list: the 13 lowest-scoring terms and the
# 10 highest-scoring ones, with the full suggestions (across all members)
# that contain each term.
extreme_terms = suggestions[0:13] + suggestions[-10:]
for term in extreme_terms:
    print(term)
    searches, people = find_people_for_term(df, term)
    print(searches)

wife
[u'john mccain wife', u'ron wyden wife', u'john thune wife', u'marco rubio wife', u'rand paul wife', u'ted cruz wife', u'bernie sanders wife', u'thad cochran wife', u'michael einziger wife', u'elijah cummings wife', u'chaka fattah wife', u'chaka fattah jr wife', u'sam johnson wife', u'frank lucas wife', u'gregory meeks wife', u'david price wife', u'dana rohrabacher wife', u'paul ryan wife', u'adrian smith wife', u'jason chaffetz wife', u'duncan hunter wife', u'david schweikert wife', u'rob woodall wife', u'trey gowdy wife', u'sean duffy wife', u'tom cotton wife', u'patrick murphy wife', u'hakeem jeffries wife', u"beto o'rourke wife", u'joaquin castro wife', u'denny hecker wife', u'mark desaulnier wife', u'garret graves wife', u'bruce poliquin wife', u'david rouzer wife']
linkedin
[u'michael doyle linkedin', u'james mcgovern linkedin', u'michael simpson linkedin', u'daniel lipinski linkedin', u'brian higgins linkedin', u'robert wittman linkedin', u'thomas rooney linkedin', u'glenn 

# Write the results to CSV for qualitative/macro analysis

In [45]:
# NOTE(review): these three lists are empty at this point, so as written both
# CSV files end up with no data - presumably the code that filled them was
# removed; confirm what was meant to be exported.
test1 = []
test2 = []
key_list = []

import csv
from itertools import izip

# Write the three lists as three rows (binary mode, as the Python 2 csv module expects).
# The `with` block closes the file, so no explicit close() is needed.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test1)
    wr.writerow(test2)

# Transpose rows into columns. The * unpacking reads every row up front, so the
# input file can safely be closed before the transposed rows are written out.
with open("input.csv", "rb") as infile:
    a = izip(*csv.reader(infile))

# Previously neither handle was closed, so the output was not reliably flushed
with open("output1.csv", "wb") as outfile:
    csv.writer(outfile).writerows(a)

# Comparison between Democrat and Republican

In [12]:
# Reset the export lists (they are not used by the comparison below)
test1, test2, key_list = [], [], []

# Members of any other party fall in neither frame
df_democrat = df.loc[df["party"] == "Democrat"]
df_republic = df.loc[df["party"] == "Republican"]

# Group a = Democrats, group b = Republicans: negative scores mark terms
# suggested at a higher rate for Republicans.
suggestions, values = compare_suggestions(df_democrat, df_republic)

trump: -113440.790874
        a: 1 b: 29
donald trump: -805.481209154
        a: 0 b: 15
voting record: -69.0705773545
        a: 16 b: 41
supreme court: -45.6340764743
        a: 6 b: 22
primary: -31.9490138952
        a: 14 b: 35
address: -11.0131440753
        a: 5 b: 16
ted cruz: -10.6407460853
        a: 1 b: 8
congress: -6.61889089169
        a: 31 b: 54
wife: -4.76085306408
        a: 11 b: 23
florida: -3.14108928601
        a: 2 b: 7
actor: -2.46862305621
        a: 3 b: 8
district: -2.42048014937
        a: 37 b: 56
for congress: -2.37120511402
        a: 111 b: 152
committee assignments: -2.20468627129
        a: 12 b: 21
website: -2.17901127556
        a: 2 b: 6
for senate: -1.68208037898
        a: 8 b: 14
retiring: -1.61968196665
        a: 6 b: 11
staff: -1.55451515723
        a: 172 b: 224
chief staff: -1.50182600817
        a: 67 b: 91
twitter: -1.2147132024
        a: 171 b: 220
facebook: -1.07515543879
        a: 86 b: 111
senate: -1.04890050356
        a: 20 b: 26
qu

# Compare within parties and within genders

In [33]:

# Four-way split by gender and party, built from reusable boolean masks
is_female = df["gender"] == "female"
is_male = df["gender"] == "male"
is_dem = df["party"] == "Democrat"
is_rep = df["party"] == "Republican"

df_female_dem = df[is_female & is_dem]
df_female_rep = df[is_female & is_rep]
df_male_dem = df[is_male & is_dem]
df_male_rep = df[is_male & is_rep]

print("Female Democrats: " + str(len(df_female_dem)))
print("Female Republicans: " + str(len(df_female_rep)))
print("Male Democrats: " + str(len(df_male_dem)))
print("Male Republicans: " + str(len(df_male_rep)))

Female Democrats: 79
Female Republicans: 29
Male Democrats: 159
Male Republicans: 272


In [34]:
# Among women: group a = Democrats, group b = Republicans (negative scores mark
# terms suggested at a higher rate for the Republican women)
suggestions, values = compare_suggestions(df_female_dem, df_female_rep)

district: -120.711101867
        a: 12 b: 12
wiki: -3.25463600429
        a: 13 b: 8
staff: -1.96631585977
        a: 68 b: 27
twitter: -1.90278855816
        a: 65 b: 26
voting record: -1.43898122396
        a: 7 b: 4
congress: -1.05506988502
        a: 6 b: 3
chief staff: 1.01011964332
        a: 29 b: 10
facebook: 1.04388545945
        a: 35 b: 12
senate: 1.15352027068
        a: 11 b: 3
for congress: 1.15597497871
        a: 44 b: 16
bio: 1.15627297614
        a: 25 b: 8
committees: 1.21033616375
        a: 50 b: 17
husband: 1.29783287125
        a: 6 b: 2
office: 2.06693326713
        a: 59 b: 19
hillary clinton: 7.56710170471
        a: 9 b: 0
internship: 8.83850655362
        a: 28 b: 5
district map: 9.83572827974
        a: 10 b: 0


In [35]:
# Among Democrats: group a = women, group b = men (negative scores mark terms
# suggested at a higher rate for the men)
suggestions, values = compare_suggestions(df_female_dem, df_male_dem)

obituary: -65.5187073386
        a: 0 b: 14
wife: -25.7537097375
        a: 0 b: 11
congress: -8.25394397331
        a: 6 b: 25
committee assignments: -2.8506865158
        a: 2 b: 10
tpp: -1.31379894823
        a: 3 b: 9
wiki: -1.2786220854
        a: 13 b: 30
for senate: -1.1043776222
        a: 2 b: 6
net worth: -1.08482801635
        a: 6 b: 13
primary: -1.07360467109
        a: 4 b: 10
district: -1.0708344978
        a: 12 b: 25
family: 1.1043776222
        a: 3 b: 5
email: 1.1043776222
        a: 3 b: 5
contact: 1.10944954115
        a: 4 b: 6
biography: 1.27203114124
        a: 5 b: 10
cuba: 1.31379894823
        a: 4 b: 8
instagram: 1.37921657175
        a: 3 b: 6
voting record: 1.94796710794
        a: 7 b: 9
bernie sanders: 3.59237470156
        a: 7 b: 7
scheduler: 3.63589959674
        a: 5 b: 4
hillary clinton: 3.76903226261
        a: 9 b: 10
facebook: 11.3622319055
        a: 35 b: 51
district map: 15.0440485679
        a: 10 b: 8
for congress: 15.0834230037
        a: 4

In [36]:
# Among Republicans: group a = women, group b = men (negative scores mark terms
# suggested at a higher rate for the men)
suggestions, values = compare_suggestions(df_female_rep, df_male_rep)

wife: -4.83026950552
        a: 0 b: 23
congress: -2.59054312728
        a: 3 b: 51
district map: -2.52424627093
        a: 0 b: 15
contact: -1.78558144476
        a: 0 b: 11
primary: -1.68035063541
        a: 2 b: 33
net worth: -1.67064131526
        a: 1 b: 22
linkedin: -1.63013096932
        a: 0 b: 10
obituary: -1.63013096932
        a: 0 b: 10
retiring: -1.54617025842
        a: 1 b: 10
committee assignments: -1.39920670793
        a: 2 b: 19
actor: -1.34720036692
        a: 0 b: 8
ted cruz: -1.34720036692
        a: 0 b: 8
supreme court: -1.28987604912
        a: 2 b: 20
biography: -1.04085156511
        a: 1 b: 14
donald trump: -1.04085156511
        a: 1 b: 14
address: -1.02970190689
        a: 1 b: 15
senate: 1.00277255178
        a: 3 b: 23
for senate: 1.12550871464
        a: 2 b: 12
voting record: 1.25365407194
        a: 4 b: 37
chief staff: 1.32388062321
        a: 10 b: 81
facebook: 1.34358195184
        a: 12 b: 99
endorsement: 1.34720036692
        a: 1 b: 7
for congre