# Autosuggestion Collection
This function handles the core process of collecting autosuggestion data from Google or Bing.

In [36]:
import requests
import urllib
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import re
from unidecode import unidecode
import scipy
import scipy.stats

%matplotlib inline

# When True, each member's search query is prefixed with their short title
# (the 'title' field, e.g. "Sen."); when False only "firstname lastname" is
# searched.
INCLUDE_TITLE = False

# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
#
# parameters:
# "source" is either "google" or "bing"
# "tld" stands for "top level domain" and can be any of the 2-letter country codes listed here where google operates: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# "lang" is the language of the suggestions returned, should be two letter codes from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    """Fetch autocomplete suggestions for ``query`` from Google or Bing.

    Parameters:
        source: "google" or "bing".
        tld: for Google, the suffix appended to ``www.google.``; for Bing,
            the value sent as the ``cc`` request parameter.
        lang: sent to Google as the ``hl`` parameter; not sent to Bing.
        query: text to autocomplete. It is UTF-8 encoded and then
            URL-encoded before being sent.

    Returns:
        Element 1 of the JSON document returned by the service, which this
        notebook treats as the list of suggestions.

    Raises:
        ValueError: if ``source`` is neither "google" nor "bing".
    """
    # Reject unknown sources up front; otherwise no URL would be built and the
    # request below would fail with an unhelpful unbound-variable error.
    if source not in ("google", "bing"):
        raise ValueError('source must be "google" or "bing", got %r' % (source,))

    encoded_query = query.encode('utf-8')
    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        params = urllib.urlencode({'q': encoded_query, 'hl': lang})
        url = 'http://www.google.' + tld + '/complete/search?&client=firefox&%s' % params
    else:
        # Note: for Bing the language is controlled by the tld, so the lang
        # parameter will have no effect on its own
        params = urllib.urlencode({'query': encoded_query, 'cc': tld})
        url = 'http://api.bing.com/osjson.aspx?%s' % params

    r = requests.get(url)
    suggestions = r.json()[1]
    return suggestions

In [2]:
# Download the current congressional roles from the GovTrack API. The request
# asks for up to 800 records in a single response.
# Note: despite its name, members_text holds the HTTP response object.
members_text = requests.get("https://www.govtrack.us/api/v2/role?current=true&limit=800")
# Stop here on an HTTP error rather than failing later while decoding the
# body or looking up the 'objects' key.
members_text.raise_for_status()
members_data = json.loads(members_text.text)
members = members_data['objects']

In [3]:
# Sanity check: number of role records held in `members`.
len(members)

542

In [4]:
# Build parallel lists with one entry per GovTrack role record: the member's
# attributes plus the Google autocomplete suggestions for their name.
# This issues one HTTP request per member.

name, auto_suggestion, gender, party, rank = [], [], [], [], []
caucus, firstname, lastname, title, title_long = [], [], [], [], []

# Show one raw record so the available fields are visible.
print(members[0])

for member in members:
    name.append(member['person']['name'])
    gender.append(member['person']['gender'])
    party.append(member['party'])
    rank.append(member['senator_rank'])
    caucus.append(member['caucus'])
    firstname.append(member['person']['firstname'])
    lastname.append(member['person']['lastname'])
    title.append(member['title'])
    title_long.append(member['title_long'])

    # Query is "<title> <first> <last>" when INCLUDE_TITLE is set,
    # otherwise just "<first> <last>".
    if INCLUDE_TITLE:
        search_string = member['title'] + ' '
    else:
        search_string = ''
    search_string += member['person']['firstname'] + ' ' + member['person']['lastname']

    suggestions = collect_autosuggestions("google", "com", "en", search_string)
    auto_suggestion.append(suggestions)

# Distinct senator_rank values, in order of first appearance.
un = []
for r in rank:
    if r in un:
        continue
    un.append(r)
print(un)

{u'senator_rank': u'junior', u'extra': {u'rss_url': u'http://www.blunt.senate.gov/public/?a=rss.feed', u'fax': u'202-224-8149', u'contact_form': u'http://www.blunt.senate.gov/public/index.cfm/contact-form?p=contact-roy', u'office': u'260 Russell Senate Office Building', u'address': u'260 Russell Senate Office Building Washington DC 20510'}, u'congress_numbers': [112, 113, 114], u'id': 268, u'startdate': u'2011-01-05', u'senator_class_label': u'Class 3', u'district': None, u'title': u'Sen.', u'title_long': u'Senator', u'current': True, u'state': u'MO', u'party': u'Republican', u'leadership_title': None, u'website': u'http://www.blunt.senate.gov', u'description': u'Junior Senator from Missouri', u'phone': u'202-224-5721', u'role_type': u'senator', u'role_type_label': u'Senator', u'enddate': u'2017-01-03', u'senator_rank_label': u'Junior', u'person': {u'name': u'Sen. Roy Blunt [R-MO]', u'firstname': u'Roy', u'twitterid': u'RoyBlunt', u'middlename': u'', u'gender': u'male', u'bioguideid': 

In [5]:
# Assemble the per-member lists into a single DataFrame: one row per member,
# one column per attribute, plus the list of suggestions collected for them.
# (The `caucus` list is not included.)

dict_total = {
    "name": name,
    "gender": gender,
    "suggestion": auto_suggestion,
    'party': party,
    'rank': rank,
    'firstname': firstname,
    'lastname': lastname,
    'title': title,
    'title_long': title_long,
}
df = pd.DataFrame(data=dict_total)

In [6]:
def find_people_for_term(df_in, term):
    """Find every suggestion that contains ``term``, ignoring case.

    Parameters:
        df_in: DataFrame whose 'suggestion' column holds, per row, an
            iterable of suggestion strings.
        term: substring to look for.

    Returns:
        (found, people): two parallel lists. ``found`` holds each matching
        suggestion exactly as stored; ``people`` holds the row the
        suggestion came from (a row appears once per matching suggestion).
    """
    # Lower-case both sides so the match really is case-insensitive;
    # lower-casing only the term would miss suggestions with capitals.
    needle = term.lower()
    found = []
    people = []
    for _, row in df_in.iterrows():
        for suggestion in row['suggestion']:
            if needle in suggestion.lower():
                found.append(suggestion)
                people.append(row)
    return found, people

In [53]:
def compare_lists(list1, list2, len1, len2):
    """Score how unevenly each word is split between two word lists.

    Parameters:
        list1, list2: iterables of words (repeats are counted).
        len1, len2: sizes of the two groups the lists were drawn from; they
            are converted to float and used to scale the expected counts.

    Returns:
        (combined_dict, dict1, dict2) where dict1/dict2 map every word seen
        in either list to its count in list1/list2 (0 when absent), and
        combined_dict maps each word to 1/p, p being the p-value of
        scipy.stats.chisquare on the observed counts against expected counts
        proportional to len1 and len2. The score is negated when the word's
        per-group rate is higher in list2 than in list1.
    """
    def tally(words):
        counts = {}
        for w in words:
            counts[w] = counts.get(w, 0) + 1
        return counts

    dict1 = tally(list1)
    dict2 = tally(list2)

    # Give dict1 a zero entry for every word that only occurs in list2.
    for w in dict2:
        dict1.setdefault(w, 0)

    size1 = float(len1)
    size2 = float(len2)
    combined_dict = {}

    for w in dict1:
        count1 = dict1[w]
        # Zero-fill dict2 on the fly for words that only occur in list1.
        count2 = dict2.setdefault(w, 0)

        # Pooled rate across both groups gives the expected count per group.
        pooled_rate = (count1 + count2) / (size1 + size2)
        p_value = scipy.stats.chisquare(
            [count1, count2],
            [pooled_rate * size1, pooled_rate * size2],
        )[1]
        score = 1 / p_value

        if count2 / size2 > count1 / size1:
            score = score * -1
        combined_dict[w] = score

    return (combined_dict, dict1, dict2)

In [8]:
def get_terms_list(df_in):
    """Reduce each member's suggestions to the terms that follow their name.

    For every row, each suggestion is split on single spaces and a token is
    dropped when it has two characters or fewer, or when it equals one of the
    row's skip words: "congressman", "congresswoman", the first name, the
    last name, the title with non-word characters removed, the long title and
    every space-separated piece of the display name (all lower-cased). The
    surviving tokens are re-joined with spaces to form the term.

    Parameters:
        df_in: DataFrame with 'suggestion', 'firstname', 'lastname', 'title',
            'title_long' and 'name' columns.

    Returns:
        A flat list of terms. Within one row each distinct term appears once,
        in order of first appearance; the same term may appear again for
        other rows. A suggestion whose tokens are all dropped contributes the
        empty string.
    """
    fin_list = []

    for _, row in df_in.iterrows():
        # The skip words depend only on the row, so build them once per row
        # (as a set, for constant-time lookups) instead of per suggestion.
        skip_words = ['congressman', 'congresswoman',
                      row['firstname'], row['lastname'],
                      re.sub(r'\W+', '', row['title']), row['title_long']]
        skip_words += row['name'].split(' ')
        skip = set(word.lower() for word in skip_words)

        # Track terms already emitted for this row so the output order is the
        # order of first appearance rather than dict ordering.
        seen = set()
        for suggestion in row['suggestion']:
            kept = [t for t in suggestion.split(' ') if len(t) > 2 and t not in skip]
            term = " ".join(kept)
            if term not in seen:
                seen.add(term)
                fin_list.append(term)

    return fin_list

In [9]:
def compare_suggestions(df_a, df_b, output = True):
    """Compare the suggestion terms of two groups of members.

    Parameters:
        df_a, df_b: DataFrames accepted by get_terms_list; their row counts
            are passed to compare_lists as the group sizes.
        output: when true, print each term with its score followed by its
            count in group a and group b, in ascending score order.

    Returns:
        (sorted_keys, compared_dict): the terms sorted by ascending score,
        and the term -> score mapping produced by compare_lists.
    """
    compared_dict, dict_a, dict_b = compare_lists(
        get_terms_list(df_a), get_terms_list(df_b), len(df_a), len(df_b))
    sorted_keys = sorted(compared_dict, key=lambda term: compared_dict[term])

    if not output:
        return sorted_keys, compared_dict

    for key in sorted_keys:
        print(key + ": " + str(compared_dict[key]))
        print("        a: %d b: %d" % (dict_a.get(key, 0), dict_b.get(key, 0)))

    return sorted_keys, compared_dict

In [10]:
def _strip_member_title(raw_title, prefix):
    """Reduce one scraped member title to a bare name."""
    stripped = raw_title
    # Only drop the prefix when it is really there.
    if stripped.startswith(prefix):
        stripped = stripped[len(prefix):]
    # Cut a trailing " (...)" part, if any. Without the -1 check a title with
    # no parenthesis would lose its last character.
    paren_at = stripped.find(" (")
    if paren_at != -1:
        stripped = stripped[:paren_at]
    # Drop tokens containing a period, tokens made only of "I" characters and
    # empty tokens; strip commas from what is left.
    kept = [p.strip(",") for p in stripped.split(" ") if "." not in p and len(p.strip("I")) > 0]
    return " ".join(kept)


def get_caucus_members(dataframe, member_list_url, prefix, title_class, replacements=None):
    """Split ``dataframe`` into caucus members and everyone else.

    The member list is scraped from ``member_list_url``: the text of every
    element with CSS class ``title_class`` is reduced to a bare name, and a
    row of ``dataframe`` counts as a member when both the first and the last
    token of some scraped name occur in the row's 'name' value (both sides
    passed through unidecode first).

    Parameters:
        dataframe: DataFrame with a 'name' column.
        member_list_url: page that lists the caucus members.
        prefix: leading text to remove from each scraped title.
        title_class: CSS class of the elements holding the member titles.
        replacements: optional mapping from a scraped (already reduced) name
            to the name that should be used for matching instead.

    Returns:
        (members, non_members): two DataFrames selected positionally from
        ``dataframe``.

    Side effects:
        Performs an HTTP GET and prints the list of scraped names that
        matched no row.
    """
    if replacements is None:
        replacements = {}

    members_page = requests.get(member_list_url)
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # itself; consider naming one so results do not depend on the environment.
    bs = BeautifulSoup(members_page.text)

    caucus_names = []
    for element in bs.find_all(class_=title_class):
        stripped = _strip_member_title(element.text, prefix)
        caucus_names.append(replacements.get(stripped, stripped))

    # Pre-compute (name, first token, last token) once per scraped name.
    # Empty names are left out of matching because an empty token is a
    # substring of every row's name; they still show up in not_found.
    candidates = []
    for caucus_name in caucus_names:
        if not caucus_name:
            continue
        terms = unidecode(caucus_name).split(" ")
        candidates.append((caucus_name, terms[0], terms[-1]))

    member_rows = []
    other_rows = []
    not_found = caucus_names[:]
    for (i, n) in enumerate(dataframe["name"]):
        ascii_name = unidecode(n)
        for caucus_name, first, last in candidates:
            if first in ascii_name and last in ascii_name:
                # One scraped name may match several rows; only remove it
                # from not_found the first time.
                if caucus_name in not_found:
                    not_found.remove(caucus_name)
                member_rows.append(i)
                break
        else:
            other_rows.append(i)

    print(not_found)

    return (dataframe.iloc[member_rows], dataframe.iloc[other_rows])

# Comparison for possible gender bias

* word cloud source: https://github.com/amueller/word_cloud

In [11]:
# Split the members by the recorded gender and show the (rows, columns)
# shape of each subset.
df_male = df.loc[df["gender"] == "male"]
df_female = df.loc[df["gender"] == "female"]

print(df_female.shape)
print(df_male.shape)

(108, 9)
(434, 9)


In [54]:
# Compare suggestion terms for women (group a) against men (group b).
# Scores are negated for terms whose per-member rate is higher in group b,
# and terms are returned in ascending score order.
suggestions, values = compare_suggestions(df_female, df_male)

wife: -182.529940977
        a: 0 b: 31
linkedin: -21.7387737049
        a: 0 b: 16
committee assignments: -18.9498772628
        a: 2 b: 30
obituary: -14.1024689701
        a: 1 b: 21
actor: -13.873610678
        a: 0 b: 13
congress: -9.44905912512
        a: 10 b: 69
ted cruz: -6.31875632099
        a: 0 b: 8
ballotpedia: -5.35063300284
        a: 0 b: 7
artist: -5.35063300284
        a: 0 b: 7
family: -5.03596135324
        a: 1 b: 14
iii: -4.50982640154
        a: 0 b: 6
texas: -4.50982640154
        a: 0 b: 6
nfl: -4.50982640154
        a: 0 b: 6
rubio: -3.77849644943
        a: 0 b: 5
gun control: -3.77849644943
        a: 0 b: 5
house: -3.77849644943
        a: 0 b: 5
georgetown: -3.77849644943
        a: 0 b: 5
photography: -3.77849644943
        a: 0 b: 5
virginia: -3.77849644943
        a: 0 b: 5
dds: -3.77849644943
        a: 0 b: 5
donald trump: -3.71095942484
        a: 1 b: 12
supreme court: -3.51744330275
        a: 3 b: 23
lawyer: -3.14042581524
        a: 0 b: 4
baseba

In [44]:
# For the first 13 and the last 10 terms of the sorted list, print every
# suggestion (across all members) that contains the term.
for term in suggestions[:13] + suggestions[-10:]:
    print(term)
    searches, people = find_people_for_term(df, term)
    print(searches)

wife
[u'john mccain wife', u'ron wyden wife', u'john thune wife', u'marco rubio wife', u'rand paul wife', u'ted cruz wife', u'bernie sanders wife', u'thad cochran wife', u'michael einziger wife', u'elijah cummings wife', u'chaka fattah wife', u'chaka fattah jr wife', u'sam johnson wife', u'frank lucas wife', u'gregory meeks wife', u'david price wife', u'dana rohrabacher wife', u'paul ryan wife', u'adrian smith wife', u'jason chaffetz wife', u'duncan hunter wife', u'david schweikert wife', u'rob woodall wife', u'trey gowdy wife', u'sean duffy wife', u'tom cotton wife', u'patrick murphy wife', u'hakeem jeffries wife', u"beto o'rourke wife", u'joaquin castro wife', u'denny hecker wife', u'mark desaulnier wife', u'garret graves wife', u'bruce poliquin wife', u'david rouzer wife']
linkedin
[u'michael doyle linkedin', u'james mcgovern linkedin', u'michael simpson linkedin', u'daniel lipinski linkedin', u'brian higgins linkedin', u'robert wittman linkedin', u'thomas rooney linkedin', u'glenn 

In [45]:
# Write key_list, test1 and test2 as three CSV rows to input.csv, then
# transpose that file into output1.csv (one row per original column).
# NOTE(review): the three lists are reset to empty here and nothing in this
# cell fills them, so both files come out without data — confirm intent.
test1 = []
test2 = []
key_list = []

import csv
from itertools import izip

# The with-block closes the file on exit; no explicit close() is needed.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test1)
    wr.writerow(test2)

# Open both files through context managers so they are closed (and the
# output flushed) deterministically. The * unpacking reads every row before
# izip is called, so `a` stays usable after the input file is closed.
with open("input.csv", "rb") as infile:
    a = izip(*csv.reader(infile))
with open("output1.csv", "wb") as outfile:
    csv.writer(outfile).writerows(a)

# Comparison between Democrats and Republicans

In [55]:
# Reset the CSV export lists, then compare suggestion terms for Democrats
# (group a) against Republicans (group b).
test1, test2, key_list = [], [], []

df_democrat = df.loc[df["party"] == "Democrat"]
df_republic = df.loc[df["party"] == "Republican"]

suggestions, values = compare_suggestions(df_democrat, df_republic)

trump: -12539.9444358
        a: 1 b: 23
donald trump: -743.230483113
        a: 0 b: 13
primary: -249.505300929
        a: 13 b: 40
supreme court: -66.6439962153
        a: 5 b: 20
phone number: -53.6451789208
        a: 0 b: 7
voting record: -49.7282747752
        a: 15 b: 38
rubio: -21.3792568876
        a: 0 b: 5
photography: -21.3792568876
        a: 0 b: 5
ted cruz: -14.0104996708
        a: 1 b: 7
republican: -13.2741658766
        a: 0 b: 4
freedom caucus: -13.2741658766
        a: 0 b: 4
artist: -8.96861327712
        a: 1 b: 6
election results: -8.09576697918
        a: 0 b: 3
faa: -8.09576697918
        a: 0 b: 3
marco rubio: -8.09576697918
        a: 0 b: 3
merrick garland: -8.09576697918
        a: 0 b: 3
puerto rico: -8.09576697918
        a: 0 b: 3
basketball: -8.09576697918
        a: 0 b: 3
singer: -8.09576697918
        a: 0 b: 3
ohio: -8.09576697918
        a: 0 b: 3
scalia: -8.09576697918
        a: 0 b: 3
global warming: -8.09576697918
        a: 0 b: 3
alabama: -8

In [51]:
# List the suggestions for Democratic members that contain "trump".
searches, people = find_people_for_term(df_democrat, "trump")
print(searches)

[u'harry reid trump']


In [52]:
# Reset the CSV export lists, split the members into Congressional Black
# Caucus members and everyone else (using the caucus's member page), then
# compare the suggestion terms of the two groups and print the group sizes.
test1, test2, key_list = [], [], []

black_caucus_df, not_black_caucus_df = get_caucus_members(
    df,
    'https://cbc-butterfield.house.gov/members',
    "The Honorable ",
    "member-title",
)

suggestions, values = compare_suggestions(black_caucus_df, not_black_caucus_df)
print("")
print(len(black_caucus_df))
print(len(not_black_caucus_df))

[]
address: -10893.8566937
        a: 0 b: 28
supreme court: -6102.09518528
        a: 0 b: 26
contact: -4562.95565143
        a: 0 b: 25
senate: -1100.3043413
        a: 1 b: 44
tpp: -433.611346122
        a: 0 b: 17
email: -321.57001321
        a: 0 b: 16
linkedin: -321.57001321
        a: 0 b: 16
donald trump: -129.933139322
        a: 0 b: 13
actor: -129.933139322
        a: 0 b: 13
voting record: -93.9335871044
        a: 2 b: 51
iran: -70.3018467592
        a: 0 b: 11
wiki: -51.4200105649
        a: 6 b: 106
committees: -41.3462003307
        a: 18 b: 259
2016: -37.6103249925
        a: 0 b: 9
ted cruz: -27.3551219409
        a: 0 b: 8
campaign: -27.3551219409
        a: 0 b: 8
immigration: -27.3551219409
        a: 0 b: 8
age: -27.3551219409
        a: 0 b: 8
staff: -26.4601605868
        a: 27 b: 363
artist: -19.7998242317
        a: 0 b: 7
phone number: -19.7998242317
        a: 0 b: 7
email address: -14.2440436
        a: 0 b: 6
california: -14.2440436
        a: 0 b: 6
heigh

In [17]:
# Write key_list, test1 and test2 (which must already exist from an earlier
# cell) as three CSV rows to input.csv, then transpose that file into
# output3_race.csv (one row per original column).
import csv
from itertools import izip

# The with-block closes the file on exit; no explicit close() is needed.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test1)
    wr.writerow(test2)

# Open both files through context managers so they are closed (and the
# output flushed) deterministically. The * unpacking reads every row before
# izip is called, so `a` stays usable after the input file is closed.
with open("input.csv", "rb") as infile:
    a = izip(*csv.reader(infile))
with open("output3_race.csv", "wb") as outfile:
    csv.writer(outfile).writerows(a)

In [18]:
# Write key_list, test2 and test1 — in that order; the lists must already
# exist from an earlier cell — as three CSV rows to input.csv, then transpose
# that file into output2.csv (one row per original column).
import csv
from itertools import izip

# The with-block closes the file on exit; no explicit close() is needed.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test2)
    wr.writerow(test1)

# Open both files through context managers so they are closed (and the
# output flushed) deterministically. The * unpacking reads every row before
# izip is called, so `a` stays usable after the input file is closed.
with open("input.csv", "rb") as infile:
    a = izip(*csv.reader(infile))
with open("output2.csv", "wb") as outfile:
    csv.writer(outfile).writerows(a)

In [19]:
# Split the members into Congressional Hispanic Caucus members and everyone
# else (using the caucus's member page), print the term comparison between
# the two groups, then print the group sizes.
hisp_caucus_df, not_hisp_caucus_df = get_caucus_members(
    df,
    'http://congressionalhispaniccaucus-sanchez.house.gov/members',
    "Rep. ",
    "views-field-title",
)

compare_suggestions(hisp_caucus_df, not_hisp_caucus_df)
print("")
print(len(hisp_caucus_df))
print(len(not_hisp_caucus_df))

[]
staff: -0.190220632081
        a: 14 b: 376
office: -0.153846153846
        a: 9 b: 258
for congress: -0.148032200358
        a: 9 b: 255
twitter: -0.140131186643
        a: 15 b: 370
primary: -0.102713178295
        a: 0 b: 53
voting record: -0.102713178295
        a: 0 b: 53
net worth: -0.0910852713178
        a: 0 b: 47
senate: -0.0872093023256
        a: 0 b: 45
wiki: -0.0554561717352
        a: 4 b: 108
address: -0.0542635658915
        a: 0 b: 28
committees: -0.0520274299344
        a: 12 b: 265
supreme court: -0.0503875968992
        a: 0 b: 26
contact: -0.0484496124031
        a: 0 b: 25
trump: -0.046511627907
        a: 0 b: 24
district: -0.0435301132976
        a: 3 b: 82
obituary: -0.0426356589147
        a: 0 b: 22
internship: -0.0399522957662
        a: 4 b: 100
congress: -0.0319022063208
        a: 3 b: 76
linkedin: -0.031007751938
        a: 0 b: 16
email: -0.031007751938
        a: 0 b: 16
: -0.030709600477
        a: 25 b: 512
facebook: -0.0295169946333
        a: 8

In [20]:
# The Congressional Hispanic Caucus and Congressional Hispanic Conference are similar
# The Caucus is Democrat-controlled, while the Conference is Republican
# The website of the Conference is a little more challenging to parse
# https://hispanicconference-mariodiazbalart.house.gov/membership