# Autosuggestion Collection
This function handles the core process of collecting autosuggestion data from Google or Bing.

In [32]:
import requests
import urllib
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import re
from unidecode import unidecode

%matplotlib inline

INCLUDE_TITLE = True

# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
#
# parameters:
# "source" is either "google" or "bing"
# "tld" stands for "top level domain" and can be any of the 2-letter country codes listed here where google operates: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# "lang" is the language of the suggestions returned, should be two letter codes from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    """Fetch the autocomplete suggestions for `query` from Google or Bing.

    Parameters:
        source: either "google" or "bing".
        tld:    country code; appended to "www.google." for Google, sent as
                the `cc` parameter for Bing.
        lang:   language code, sent to Google as `hl`. It is not sent to Bing.
        query:  the text to autocomplete; it is UTF-8 encoded before sending.

    Returns:
        The second element of the JSON response, i.e. the suggestions.

    Raises:
        ValueError: if `source` is neither "google" nor "bing".
        requests.HTTPError: if the service answers with an error status.
    """
    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.'+tld+'/complete/search?&client=firefox&%s' % (urllib.urlencode({'q': query.encode('utf-8'), 'hl': lang}))
    elif source == "bing":
        # Note: for Bing the language is controlled by the tld, so the lang parameter will have no effect on its own
        url = 'http://api.bing.com/osjson.aspx?%s' % (urllib.urlencode({'query': query.encode('utf-8'), 'cc': tld}))
    else:
        # Without this branch `url` would be unbound below and the caller
        # would see an UnboundLocalError that hides the real mistake.
        raise ValueError('source must be "google" or "bing", got %r' % (source,))

    r = requests.get(url)
    # Fail on HTTP errors here rather than while decoding an error page as JSON.
    r.raise_for_status()
    suggestions = r.json()[1]
    return suggestions

In [2]:
# Request the current congressional roles from the govtrack.us API; the query
# string asks for at most 800 records.
members_text = requests.get("https://www.govtrack.us/api/v2/role?current=true&limit=800")
members_data = json.loads(members_text.text) # parsed JSON body of the response
# The role records themselves are stored under the 'objects' key.
members = members_data['objects']
# print members

In [3]:
# Number of role records that were returned.
len(members)

542

In [4]:
# Build parallel per-member columns from the govtrack.us role records, plus
# the Google autosuggestions collected for each member's name.

name, gender, party, rank, caucus = [], [], [], [], []
firstname, lastname, title, title_long = [], [], [], []
auto_suggestion = []

# Show one raw record so the available fields are visible.
print(members[0])

for member in members:
    person = member['person']

    name.append(person['name'])
    gender.append(person['gender'])
    party.append(member['party'])
    rank.append(member['senator_rank'])
    caucus.append(member['caucus'])
    firstname.append(person['firstname'])
    lastname.append(person['lastname'])
    title.append(member['title'])
    title_long.append(member['title_long'])

    # The query is "<title> <first> <last>", or "<first> <last>" when
    # INCLUDE_TITLE is switched off.
    if INCLUDE_TITLE:
        search_string = member['title'] + ' '
    else:
        search_string = ''
    search_string += person['firstname'] + ' ' + person['lastname']

    suggestions = collect_autosuggestions("google", "com", "en", search_string)
    auto_suggestion.append(suggestions)

# Distinct senator_rank values, in order of first appearance.
un = []
for r in rank:
    if r in un:
        continue
    un.append(r)
print(un)

{u'senator_rank': u'junior', u'extra': {u'rss_url': u'http://www.blunt.senate.gov/public/?a=rss.feed', u'fax': u'202-224-8149', u'contact_form': u'http://www.blunt.senate.gov/public/index.cfm/contact-form?p=contact-roy', u'office': u'260 Russell Senate Office Building', u'address': u'260 Russell Senate Office Building Washington DC 20510'}, u'congress_numbers': [112, 113, 114], u'id': 268, u'startdate': u'2011-01-05', u'senator_class_label': u'Class 3', u'district': None, u'title': u'Sen.', u'title_long': u'Senator', u'current': True, u'state': u'MO', u'party': u'Republican', u'leadership_title': None, u'website': u'http://www.blunt.senate.gov', u'description': u'Junior Senator from Missouri', u'phone': u'202-224-5721', u'role_type': u'senator', u'role_type_label': u'Senator', u'enddate': u'2017-01-03', u'senator_rank_label': u'Junior', u'person': {u'name': u'Sen. Roy Blunt [R-MO]', u'firstname': u'Roy', u'twitterid': u'RoyBlunt', u'middlename': u'', u'gender': u'male', u'bioguideid': 

In [5]:
# Gather the per-member columns into one DataFrame, one row per member.
# (The `caucus` list is collected above but is not included here.)

dict_total = {
    "name": name,
    "gender": gender,
    "suggestion": auto_suggestion,
    'party': party,
    'rank': rank,
    'firstname': firstname,
    'lastname': lastname,
    'title': title,
    'title_long': title_long,
}
df = pd.DataFrame(dict_total)

In [1]:
def find_people_for_term(df_in, term):
    """Return every suggestion in df_in['suggestion'] that contains `term`.

    The match is a case-insensitive substring test: both the term and each
    suggestion are lowercased before comparing. The suggestions themselves
    are returned unchanged, in the order they are encountered.

    Parameters:
        df_in: object whose 'suggestion' entry iterates over lists of
               suggestion strings (one list per row).
        term:  the text to look for.
    """
    needle = term.lower()
    return [
        suggestion
        for row in df_in['suggestion']
        for suggestion in row
        # Lowercase the suggestion too; lowering only the term would miss
        # suggestions that contain capital letters.
        if needle in suggestion.lower()
    ]

In [6]:
def compare_lists(list1, list2, len1, len2):
    """Score each word by how much more often it occurs in list1 than list2.

    Occurrence counts are divided by `len1` / `len2` respectively (the
    caller chooses these normalisers; they need not be the list lengths),
    and the score of a word is its normalised count in list1 minus its
    normalised count in list2. A word missing from one list counts as zero
    there.

    Returns a tuple (scores, counts1, counts2) of plain dicts keyed by word.
    """
    def tally(words):
        # Plain-dict frequency count of the given words.
        counts = {}
        for w in words:
            counts[w] = counts.get(w, 0) + 1
        return counts

    dict1 = tally(list1)
    dict2 = tally(list2)

    scale1 = float(len1)
    scale2 = float(len2)

    combined_dict = {}

    # Words seen in list1 (possibly also in list2).
    for word, count in dict1.items():
        score = count / scale1
        if word in dict2:
            score -= dict2[word] / scale2
        combined_dict[word] = score

    # Words seen only in list2 get a purely negative score.
    for word, count in dict2.items():
        if word in dict1:
            continue
        combined_dict[word] = -count / scale2

    return (combined_dict, dict1, dict2)

In [35]:
def get_terms_list(df_in):
    """Collect the interesting words from every row's suggestions.

    Each suggestion is split on single spaces. A token is kept when it is
    longer than three characters and is not one of the row's skip words:
    "congressman", "congresswoman", the member's first name, last name,
    title (with non-word characters removed), long title, and every
    space-separated piece of the full display name. Skip words are compared
    case-insensitively.

    Parameters:
        df_in: DataFrame with 'suggestion', 'firstname', 'lastname',
               'title', 'title_long' and 'name' columns.

    Returns:
        A flat list of the kept tokens (duplicates preserved), in row order.
    """
    fin_list = []

    for _, row in df_in.iterrows():
        suggestions = row['suggestion']
        if not suggestions:
            # Nothing to tokenize, so the row's other fields are not needed.
            continue

        # The skip words depend only on the row, so build them once per row
        # rather than once per suggestion.
        skip_words = ['congressman', 'congresswoman']
        skip_words += [row['firstname'], row['lastname'], re.sub(r'\W+', '', row['title']), row['title_long']]
        skip_words += row['name'].split(' ')
        skip_words = set(s.lower() for s in skip_words)

        for suggestion in suggestions:
            for token in suggestion.split(' '):
                # Skip words are lowercased, so lowercase the token for the
                # membership test; the token itself is kept as written.
                if len(token) > 3 and token.lower() not in skip_words:
                    fin_list.append(token)

    return fin_list

In [8]:
def compare_suggestions(df_a, df_b, output=True):
    """Compare the suggestion terms of two groups of members.

    The terms of each DataFrame are extracted with get_terms_list and scored
    with compare_lists, using the number of rows in each DataFrame as the
    normaliser.

    When `output` is true, every term is printed with its score, followed by
    its raw count in group a and group b, in ascending score order.

    Returns (sorted_keys, compared_dict): the terms sorted by ascending
    score, and the term -> score mapping.
    """
    compared_dict, dict_a, dict_b = compare_lists(
        get_terms_list(df_a), get_terms_list(df_b), len(df_a), len(df_b))

    sorted_keys = sorted(compared_dict, key=lambda term: compared_dict[term])

    if not output:
        return sorted_keys, compared_dict

    for key in sorted_keys:
        print(key + ": " + str(compared_dict[key]))
        # A term absent from one group is reported with a count of 0.
        print("        a: %d b: %d" % (dict_a.get(key, 0), dict_b.get(key, 0)))

    return sorted_keys, compared_dict

In [9]:
def get_caucus_members(dataframe, member_list_url, prefix, title_class, replacements=None):
    """Split `dataframe` into caucus members and non-members.

    The member names are scraped from `member_list_url`: the text of every
    element with CSS class `title_class` is taken, the first len(prefix)
    characters are dropped, and everything from the first " (" onwards is
    discarded. Tokens containing "." and tokens that are empty or consist
    only of "I" characters are removed (presumably initials and suffixes
    such as "Jr." or "III" -- confirm against the page).

    A row of `dataframe` counts as a member when both the first and the last
    word of some scraped name occur as substrings of its 'name' value, after
    transliterating both sides to ASCII with unidecode.

    Parameters:
        dataframe:       DataFrame with a 'name' column.
        member_list_url: page listing the caucus members.
        prefix:          leading text of each title to cut off, e.g. "Rep. ".
                         Only its length is used, so each title is assumed to
                         start with it.
        title_class:     CSS class of the elements holding the member titles.
        replacements:    optional mapping from a cleaned scraped name to the
                         name that should be used for matching instead.

    Returns:
        (members_df, non_members_df), both positional selections of
        `dataframe`. Scraped names that matched no row are printed.
    """
    if replacements is None:
        replacements = {}

    members_page = requests.get(member_list_url)
    # Name the parser explicitly: otherwise BeautifulSoup warns and picks
    # whichever parser is installed, which can differ between machines.
    bs = BeautifulSoup(members_page.text, "html.parser")

    stripped_member_titles = []
    for element in bs.find_all(class_=title_class):
        member = element.text
        # find() returns -1 when there is no " (", and slicing to -1 would
        # silently cut off the last character of the name.
        end = member.find(" (")
        if end == -1:
            end = len(member)
        s = member[len(prefix):end]
        s = " ".join([p.strip(",") for p in s.split(" ") if "." not in p and len(p.strip("I")) > 0])
        s = replacements.get(s, s)
        if not s:
            # An empty name would be a substring of every row's name.
            continue
        stripped_member_titles.append(s)

    # First and last word of every scraped name, computed once up front.
    member_terms = []
    for m in stripped_member_titles:
        terms = unidecode(m).split(" ")
        member_terms.append((m, terms[0], terms[-1]))

    member_rows = []
    other_rows = []
    not_found = stripped_member_titles[:]
    for (i, n) in enumerate(dataframe["name"]):
        stripped_n = unidecode(n)
        matched = None
        for (m, first, last) in member_terms:
            if first in stripped_n and last in stripped_n:
                matched = m
                break
        if matched is None:
            other_rows.append(i)
            continue
        member_rows.append(i)
        # The same scraped name can match more than one row; remove it only
        # once, since list.remove raises ValueError on a missing value.
        if matched in not_found:
            not_found.remove(matched)

    print(not_found)

    return (dataframe.iloc[member_rows], dataframe.iloc[other_rows])

# Comparison for possible gender bias

* word cloud source: https://github.com/amueller/word_cloud

In [10]:
# Split the members by the gender recorded for each person.
is_male = df["gender"] == "male"
is_female = df["gender"] == "female"
df_male = df[is_male]
df_female = df[is_female]

# (rows, columns) of each group: female first, then male.
print(df_female.shape)
print(df_male.shape)

(108, 9)
(434, 9)


In [36]:
# Compare suggestion terms with female members as group a and male members as group b.
compare_suggestions(df_female, df_male)

office: -0.255205666496
        a: 13 b: 163
staff: -0.0952807646356
        a: 35 b: 182
texas: -0.0897764123571
        a: 2 b: 47
mike: -0.0645161290323
        a: 0 b: 28
facebook: -0.0548728451954
        a: 10 b: 64
raul: -0.0437788018433
        a: 0 b: 19
voting: -0.0413893155829
        a: 2 b: 26
record: -0.0413893155829
        a: 2 b: 26
chris: -0.0391705069124
        a: 0 b: 17
email: -0.0317033623485
        a: 67 b: 283
committees: -0.0299539170507
        a: 0 b: 13
georgia: -0.0299539170507
        a: 0 b: 13
biography: -0.0297832394607
        a: 4 b: 29
district: -0.0276497695853
        a: 0 b: 12
oklahoma: -0.0253456221198
        a: 0 b: 11
andre: -0.0230414746544
        a: 0 b: 10
ruben: -0.0230414746544
        a: 0 b: 10
pennsylvania: -0.0230414746544
        a: 0 b: 10
jersey: -0.0230414746544
        a: 0 b: 10
johnny: -0.0230414746544
        a: 0 b: 10
michigan: -0.0229988052569
        a: 1 b: 14
jose: -0.0207373271889
        a: 0 b: 9
florida: -0.02052

([u'office',
  u'staff',
  u'texas',
  u'mike',
  u'facebook',
  u'raul',
  u'voting',
  u'record',
  u'chris',
  u'email',
  u'committees',
  u'georgia',
  u'biography',
  u'district',
  u'oklahoma',
  u'andre',
  u'ruben',
  u'pennsylvania',
  u'jersey',
  u'johnny',
  u'michigan',
  u'jose',
  u'florida',
  u'senator',
  u'menendez',
  u'assignments',
  u'committee',
  u'steve',
  u'military',
  u'gutierrez',
  u'service',
  u'virginia',
  u'indiana',
  u'colorado',
  u'louisiana',
  u'gerry',
  u'dave',
  u'kentucky',
  u'william',
  u'wife',
  u'phone',
  u'number',
  u'illinois',
  u'bernie',
  u'diaz',
  u'idaho',
  u'houston',
  u'balart',
  u'montana',
  u'patrick',
  u'delaware',
  u'mississippi',
  u'education',
  u'robert',
  u'chief',
  u'wisconsin',
  u'global',
  u'oregon',
  u'obama',
  u'r-ga',
  u'letter',
  u'nevada',
  u'minnesota',
  u'west',
  u'mexico',
  u'warming',
  u'amendment',
  u'mich',
  u'maryland',
  u'south',
  u'immigration',
  u'massachusetts',
  u'y

In [39]:
# For every suggestion, drop the first two space-separated words and keep the
# remaining words (presumably this removes the title and first name of the
# query -- confirm), then flatten everything into one word list per gender.

# temp_list / temp0_list: per member, a list of word lists (one per suggestion).
temp_list = [[e.split(' ')[2:] for e in i] for i in df_male['suggestion']]
male_list = [word for i in temp_list for k in i for word in k]

temp0_list = [[e.split(' ')[2:] for e in i] for i in df_female['suggestion']]
female_list = [word for i in temp0_list for k in i for word in k]

female_list

[u'boxer',
 u'boxer',
 u'contact',
 u'info',
 u'boxer',
 u'net',
 u'worth',
 u'boxer',
 u'email',
 u'boxer',
 u'contact',
 u'boxer',
 u'twitter',
 u'boxer',
 u'biography',
 u'boxer',
 u'd',
 u'ca',
 u'boxer',
 u'bio',
 u'boxer',
 u'wiki',
 u'mikulski',
 u'email',
 u'mikulski',
 u'monsanto',
 u'mikulski',
 u'staff',
 u'mikulski',
 u'mikulski',
 u'email',
 u'mikulski',
 u'office',
 u'mikulski',
 u'bio',
 u'mikulski',
 u'address',
 u'mikulski',
 u'email',
 u'address',
 u'mikulski',
 u'mailing',
 u'address',
 u'murkowski',
 u'contact',
 u'murkowski',
 u'email',
 u'murkowski',
 u'twitter',
 u'murkowski',
 u'murkowski',
 u'contact',
 u'murkowski',
 u'email',
 u'murkowski',
 u'contact',
 u'information',
 u'murkowski',
 u'email',
 u'address',
 u'murkowski',
 u'anchorage',
 u'office',
 u'murkowski',
 u'staff',
 u'murray',
 u'murray',
 u'email',
 u'murray',
 u'wiki',
 u'murray',
 u'email',
 u'address',
 u'murray',
 u'seattle',
 u'office',
 u'murray',
 u'net',
 u'worth',
 u'murray',
 u'd',
 u'wa'

In [37]:
test1 = []
test2 = []
key_list = []

import csv
from itertools import izip

# Write the three lists as three rows of a scratch file.
# NOTE(review): all three lists are empty at this point, so the files
# produced by this cell contain no data.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test1)
    wr.writerow(test2)

# Transpose the scratch file (rows become columns) into the output file.
# izip is lazy, so the rows must be written while both files are still open;
# the `with` block also guarantees both handles are closed afterwards.
with open("input.csv", "rb") as infile, open("output1.csv", "wb") as outfile:
    a = izip(*csv.reader(infile))
    csv.writer(outfile).writerows(a)

# Comparison between Democrats and Republicans

In [38]:
# Reset the scratch lists used by the CSV export cells.
test1, test2, key_list = [], [], []

# Split the members by party; rows with any other party value are in neither group.
is_democrat = df["party"] == "Democrat"
is_republican = df["party"] == "Republican"
df_democrat = df[is_democrat]
df_republic = df[is_republican]

# Democrats are group a, Republicans are group b.
compare_suggestions(df_democrat, df_republic)

texas: -0.0725034199726
        a: 12 b: 37
email: -0.0576509673637
        a: 146 b: 202
republican: -0.0564784053156
        a: 0 b: 17
mike: -0.0478796169631
        a: 6 b: 22
facebook: -0.0427008012507
        a: 27 b: 47
staff: -0.0404533906586
        a: 90 b: 126
wiki: -0.0371311315224
        a: 22 b: 39
oklahoma: -0.0365448504983
        a: 0 b: 11
johnny: -0.0332225913621
        a: 0 b: 10
carolina: -0.030584326754
        a: 3 b: 13
utah: -0.0299003322259
        a: 0 b: 9
ohio: -0.0295094782099
        a: 8 b: 19
phone: -0.0290209106899
        a: 1 b: 10
number: -0.0290209106899
        a: 1 b: 10
michigan: -0.0272620676177
        a: 3 b: 12
alabama: -0.0256986515536
        a: 1 b: 9
indiana: -0.0248192300176
        a: 2 b: 10
steve: -0.0232558139535
        a: 0 b: 7
iowa: -0.0232558139535
        a: 0 b: 7
kansas: -0.0232558139535
        a: 0 b: 7
florida: -0.0226695329295
        a: 12 b: 22
voting: -0.0219855384014
        a: 9 b: 18
record: -0.0219855384014
    

([u'texas',
  u'email',
  u'republican',
  u'mike',
  u'facebook',
  u'staff',
  u'wiki',
  u'oklahoma',
  u'johnny',
  u'carolina',
  u'utah',
  u'ohio',
  u'phone',
  u'number',
  u'michigan',
  u'alabama',
  u'indiana',
  u'steve',
  u'iowa',
  u'kansas',
  u'florida',
  u'voting',
  u'record',
  u'north',
  u'assignments',
  u'committee',
  u'u.s.',
  u'south',
  u'senate',
  u'diaz',
  u'idaho',
  u'balart',
  u'military',
  u'service',
  u'tennessee',
  u'wyoming',
  u'lehtinen',
  u'robert',
  u'georgia',
  u'louisiana',
  u'wife',
  u'youtube',
  u'pennsylvania',
  u'global',
  u'obama',
  u'r-ga',
  u'letter',
  u'nevada',
  u'dakota',
  u'warming',
  u'education',
  u'biography',
  u'virginia',
  u'missouri',
  u'information',
  u'arkansas',
  u'baby',
  u'modesto',
  u'snowball',
  u'david',
  u'fort',
  u'plan',
  u'impeachment',
  u'curt',
  u'nebraska',
  u'tenn',
  u'committees',
  u'kentucky',
  u'senator',
  u'address',
  u'thomas',
  u'dalton',
  u'roanoke',
  u'birth

In [39]:
# Reset the scratch lists used by the CSV export cells.
test1, test2, key_list = [], [], []


# Split the members into Congressional Black Caucus members and everyone
# else, using the member list published on the caucus website.
black_caucus_df, not_black_caucus_df = get_caucus_members(
    df,
    member_list_url='https://cbc-butterfield.house.gov/members',
    prefix="The Honorable ",
    title_class="member-title")

# Caucus members are group a, all other members are group b.
compare_suggestions(black_caucus_df, not_black_caucus_df)
print("")
print(len(black_caucus_df))
print(len(not_black_caucus_df))

[]
email: -0.634379382889
        a: 3 b: 347
address: -0.269898316971
        a: 4 b: 177
contact: -0.204856241234
        a: 12 b: 231
staff: -0.176192145863
        a: 11 b: 206
voting: -0.0564516129032
        a: 0 b: 28
mike: -0.0564516129032
        a: 0 b: 28
record: -0.0564516129032
        a: 0 b: 28
wiki: -0.0537342215989
        a: 3 b: 59
raul: -0.0383064516129
        a: 0 b: 19
republican: -0.0342741935484
        a: 0 b: 17
chris: -0.0342741935484
        a: 0 b: 17
facebook: -0.0304172510519
        a: 5 b: 69
michigan: -0.0302419354839
        a: 0 b: 15
mailing: -0.0282258064516
        a: 0 b: 14
texas: -0.0275245441795
        a: 3 b: 46
assignments: -0.0262096774194
        a: 0 b: 13
committee: -0.0262096774194
        a: 0 b: 13
committees: -0.0262096774194
        a: 0 b: 13
district: -0.0241935483871
        a: 0 b: 12
colorado: -0.0241935483871
        a: 0 b: 12
phone: -0.0221774193548
        a: 0 b: 11
number: -0.0221774193548
        a: 0 b: 11
oklahoma: -



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))


In [40]:
import csv
from itertools import izip

# Write key_list, test1 and test2 as three rows of a scratch file.
# NOTE(review): nothing visible in this notebook fills these lists after
# they are reset to [], so the output is presumably empty -- confirm.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test1)
    wr.writerow(test2)

# Transpose the scratch file (rows become columns) into the output file.
# izip is lazy, so the rows must be written while both files are still open;
# the `with` block also guarantees both handles are closed afterwards.
with open("input.csv", "rb") as infile, open("output3_race.csv", "wb") as outfile:
    a = izip(*csv.reader(infile))
    csv.writer(outfile).writerows(a)

In [47]:
import csv
from itertools import izip

# Write key_list, test2 and test1 (in that order) as three rows of a scratch file.
# NOTE(review): nothing visible in this notebook fills these lists after
# they are reset to [], so the output is presumably empty -- confirm.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test2)
    wr.writerow(test1)

# Transpose the scratch file (rows become columns) into the output file.
# izip is lazy, so the rows must be written while both files are still open;
# the `with` block also guarantees both handles are closed afterwards.
with open("input.csv", "rb") as infile, open("output2.csv", "wb") as outfile:
    a = izip(*csv.reader(infile))
    csv.writer(outfile).writerows(a)

In [41]:
# Split the members into Congressional Hispanic Caucus members and everyone
# else, using the member list published on the caucus website.
hisp_caucus_df, not_hisp_caucus_df = get_caucus_members(
    df,
    member_list_url='http://congressionalhispaniccaucus-sanchez.house.gov/members',
    prefix="Rep. ",
    title_class="views-field-title")

# Caucus members are group a, all other members are group b.
compare_suggestions(hisp_caucus_df, not_hisp_caucus_df)
print("")
print(len(hisp_caucus_df))
print(len(not_hisp_caucus_df))

[]
email: -0.476296958855
        a: 5 b: 345
address: -0.310375670841
        a: 1 b: 180
contact: -0.188133571855
        a: 7 b: 236
twitter: -0.115086463924
        a: 9 b: 238
office: -0.0986881335719
        a: 6 b: 170
wiki: -0.079755515802
        a: 1 b: 61
florida: -0.0658914728682
        a: 0 b: 34
mike: -0.0542635658915
        a: 0 b: 28
ohio: -0.0523255813953
        a: 0 b: 27
worth: -0.0406976744186
        a: 0 b: 21
information: -0.0348837209302
        a: 0 b: 18
republican: -0.0329457364341
        a: 0 b: 17
chris: -0.0329457364341
        a: 0 b: 17
carolina: -0.031007751938
        a: 0 b: 16
michigan: -0.0290697674419
        a: 0 b: 15
mailing: -0.0271317829457
        a: 0 b: 14
assignments: -0.0251937984496
        a: 0 b: 13
committee: -0.0251937984496
        a: 0 b: 13
committees: -0.0251937984496
        a: 0 b: 13
georgia: -0.0251937984496
        a: 0 b: 13
biography: -0.0235539654144
        a: 1 b: 32
indiana: -0.0232558139535
        a: 0 b: 12
colo

In [None]:
# The Congressional Hispanic Caucus and Congressional Hispanic Conference are similar
# The Caucus is Democrat-controlled, while the Conference is Republican
# The website of the Conference is a little more challenging to parse
# https://hispanicconference-mariodiazbalart.house.gov/membership