# Autosuggestion Collection
The `collect_autosuggestions` function defined below handles the core process of collecting autosuggestion data from Google or Bing.

In [2]:
import requests
import urllib
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
from wordcloud import WordCloud
from bs4 import BeautifulSoup
from unidecode import unidecode

%matplotlib inline

# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
#
# parameters:
# "source" is either "google" or "bing"
# "tld" stands for "top level domain" and can be any of the 2-letter country codes listed here where google operates: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# "lang" is the language of the suggestions returned, should be two letter codes from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    """Fetch autocomplete suggestions for ``query`` from Google or Bing.

    Parameters:
        source: "google" or "bing".
        tld: for Google, the suffix of the ``www.google.<tld>`` host that is
            queried; for Bing, the value sent as the ``cc`` parameter.
        lang: language code sent to Google as ``hl``. It is not sent to Bing,
            where the language is controlled by ``tld``.
        query: the text to autocomplete.

    Returns:
        Element 1 of the JSON array returned by the endpoint (presumably the
        list of suggestion strings -- confirm against the API).

    Raises:
        ValueError: if ``source`` is neither "google" nor "bing".
        requests.HTTPError: if the endpoint answers with an error status.
    """
    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.' + tld + '/complete/search'
        params = {'client': 'firefox', 'q': query, 'hl': lang}
    elif source == "bing":
        # Note: for Bing the language is controlled by the tld, so the lang parameter will have no effect on its own
        url = 'http://api.bing.com/osjson.aspx'
        params = {'query': query, 'cc': tld}
    else:
        # Previously fell through and failed later with an unbound `url`.
        raise ValueError('source must be "google" or "bing", got %r' % (source,))

    # Let requests build and percent-encode the query string (text is sent as UTF-8).
    r = requests.get(url, params=params)
    # Surface HTTP errors here instead of as a confusing JSON/index error below.
    r.raise_for_status()
    return r.json()[1]

In [3]:
# Fetch the current congressional roles from GovTrack (the query asks for up to 800 records).
members_text = requests.get("https://www.govtrack.us/api/v2/role?current=true&limit=800")
# Fail here on an HTTP error rather than with a KeyError on 'objects' below.
members_text.raise_for_status()
members_data = members_text.json()
members = members_data['objects']

In [10]:
# Sanity check: number of role records loaded into `members`.
len(members)

542

In [4]:
# Build parallel lists with one entry per record in `members`: display name,
# gender, party, senator rank, caucus, and the Google autosuggestions returned
# for "<firstname> <lastname>".
name, auto_suggestion, gender = [], [], []
party, rank, caucus = [], [], []

# Show one raw record so the available fields are visible.
print(members[0])

for member in members:
    person = member['person']
    name.append(person['name'])
    gender.append(person['gender'])
    party.append(member['party'])
    rank.append(member['senator_rank'])
    caucus.append(member['caucus'])

    # One autosuggest request per member.
    full_name = ' '.join((person['firstname'], person['lastname']))
    suggestions = collect_autosuggestions("google", "com", "en", full_name)
    auto_suggestion.append(suggestions)

# Distinct senator_rank values, in order of first appearance.
un = []
for r in rank:
    if r in un:
        continue
    un.append(r)
print(un)

{u'senator_rank': u'junior', u'extra': {u'rss_url': u'http://www.blunt.senate.gov/public/?a=rss.feed', u'fax': u'202-224-8149', u'contact_form': u'http://www.blunt.senate.gov/public/index.cfm/contact-form?p=contact-roy', u'office': u'260 Russell Senate Office Building', u'address': u'260 Russell Senate Office Building Washington DC 20510'}, u'congress_numbers': [112, 113, 114], u'id': 268, u'startdate': u'2011-01-05', u'senator_class_label': u'Class 3', u'district': None, u'title': u'Sen.', u'title_long': u'Senator', u'current': True, u'state': u'MO', u'party': u'Republican', u'leadership_title': None, u'website': u'http://www.blunt.senate.gov', u'description': u'Junior Senator from Missouri', u'phone': u'202-224-5721', u'role_type': u'senator', u'role_type_label': u'Senator', u'enddate': u'2017-01-03', u'senator_rank_label': u'Junior', u'person': {u'name': u'Sen. Roy Blunt [R-MO]', u'firstname': u'Roy', u'twitterid': u'RoyBlunt', u'middlename': u'', u'gender': u'male', u'bioguideid': 

In [5]:
# Combine the per-member lists into a single DataFrame, one row per member.
# (The `caucus` list is not included.)
dict_total = {
    "name": name,
    "gender": gender,
    "suggestion": auto_suggestion,
    'party': party,
    'rank': rank,
}
df = pd.DataFrame(data=dict_total)

In [6]:
def compare_lists(list1, list2, len1, len2):
    """Compare how often each word occurs in two lists, normalised by group size.

    Parameters:
        list1, list2: iterables of hashable words.
        len1, len2: normalisers for the counts of ``list1`` and ``list2``
            (converted to float before dividing).

    Returns:
        A tuple ``(combined, counts1, counts2)`` where ``counts1`` and
        ``counts2`` map each word to its number of occurrences, and
        ``combined[word]`` is ``counts1[word] / len1 - counts2[word] / len2``,
        with a word absent from one list contributing nothing for that side.
    """
    def tally(words):
        # Plain-dict occurrence count.
        counts = {}
        for item in words:
            counts[item] = counts.get(item, 0) + 1
        return counts

    dict1 = tally(list1)
    dict2 = tally(list2)

    size1 = float(len1)
    size2 = float(len2)

    combined_dict = {}
    # Words seen in list1 (possibly in list2 as well).
    for word, count in dict1.items():
        share = count / size1
        if word in dict2:
            share = share - dict2[word] / size2
        combined_dict[word] = share
    # Words seen only in list2 get a negative score.
    for word, count in dict2.items():
        if word in dict1:
            continue
        combined_dict[word] = -count / size2

    return (combined_dict, dict1, dict2)

In [7]:
def get_terms_list(df_in):
    """Return the trailing term of every suggestion in ``df_in['suggestion']``.

    Each entry of ``df_in['suggestion']`` is an iterable of suggestion strings.
    A suggestion is split on single spaces at most twice; whatever follows the
    first two words is kept as one term, and suggestions with fewer than three
    words contribute nothing. All terms are returned in one flat list.
    """
    return [
        remainder
        for suggestions in df_in['suggestion']
        for phrase in suggestions
        for remainder in phrase.split(' ', 2)[2:]
    ]

In [33]:
# Module-level accumulators that compare_suggestions appends to when it prints:
# test1 / test2 receive the per-term counts for groups a / b, key_list the terms.
test1, test2, key_list = [], [], []

def compare_suggestions(df_a, df_b, output = True):
    """Rank suggestion terms by how differently they occur for two groups.

    Parameters:
        df_a, df_b: frames with a 'suggestion' column, passed to
            get_terms_list; their lengths are used as the normalisers.
        output: when true, print every term with its score and raw counts,
            and append the counts / terms to the module-level lists
            ``test1`` (group a), ``test2`` (group b) and ``key_list``.

    Returns:
        ``(sorted_keys, compared_dict)``: the terms sorted by ascending score
        and the term -> score mapping produced by compare_lists.
    """
    compared_dict, dict_a, dict_b = compare_lists(
        get_terms_list(df_a), get_terms_list(df_b), len(df_a), len(df_b))
    sorted_keys = sorted(compared_dict, key=lambda term: compared_dict[term])

    if not output:
        return sorted_keys, compared_dict

    for key in sorted_keys:
        count_a = dict_a.get(key, 0)
        count_b = dict_b.get(key, 0)
        print(key + ": " + str(compared_dict[key]))
        print("        a: %d b: %d" % (count_a, count_b))
        # Side effect: the accumulators grow on every call made with output=True.
        test1.append(count_a)
        test2.append(count_b)
        key_list.append(key)

    return sorted_keys, compared_dict

In [9]:
def _clean_member_title(title, prefix):
    """Reduce a scraped title such as 'Rep. Jane Q. Doe II (XX-1)' to 'Jane Doe'."""
    # Cut at the first " (" if there is one. str.find returns -1 when it is
    # missing, and slicing with -1 would silently drop the last character.
    paren = title.find(" (")
    end = paren if paren != -1 else len(title)
    s = title[len(prefix):end]
    # Drop tokens containing "." (initials, "Jr.") and tokens that are empty or
    # made only of "I" characters (e.g. "II", "III"); strip commas from the rest.
    return " ".join([p.strip(",") for p in s.split(" ") if "." not in p and len(p.strip("I")) > 0])


def get_caucus_members(dataframe, member_list_url, prefix, title_class, replacements=None):
    """Split ``dataframe`` into rows that appear on a scraped member list and rows that do not.

    Parameters:
        dataframe: object with a "name" column and ``.iloc`` positional indexing.
        member_list_url: page to download; the text of every element with
            class ``title_class`` is treated as one member title.
        prefix: leading text of each title; ``len(prefix)`` characters are
            removed from the start of every title.
        title_class: CSS class of the elements holding the member titles.
        replacements: optional mapping from a cleaned title to the text that
            should be matched instead.

    Returns:
        ``(members, non_members)``: two ``dataframe.iloc[...]`` selections.

    A row matches a title when both the first and the last space-separated
    term of the title occur as substrings of the row's transliterated name.
    Titles that matched no row are printed before returning.
    """
    if replacements is None:
        replacements = {}

    members_page = requests.get(member_list_url)
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one for reproducible results.
    bs = BeautifulSoup(members_page.text)

    stripped_member_titles = []
    for tag in bs.find_all(class_=title_class):
        cleaned = _clean_member_title(tag.text, prefix)
        cleaned = replacements.get(cleaned, cleaned)
        # An empty title would match every row (the empty string is a substring
        # of any name), so it is skipped.
        if cleaned:
            stripped_member_titles.append(cleaned)

    # Transliterate each title once rather than once per dataframe row.
    candidates = []
    for title in stripped_member_titles:
        terms = unidecode(title).split(" ")
        candidates.append((title, terms[0], terms[-1]))

    member_rows = []
    other_rows = []
    not_found = stripped_member_titles[:]
    for i, n in enumerate(dataframe["name"]):
        stripped_n = unidecode(n)
        for title, first, last in candidates:
            if first in stripped_n and last in stripped_n:
                # Several rows may match the same title; only remove it once
                # (an unguarded remove raised ValueError on the second match).
                if title in not_found:
                    not_found.remove(title)
                member_rows.append(i)
                break
        else:
            other_rows.append(i)

    print(not_found)

    return (dataframe.iloc[member_rows], dataframe.iloc[other_rows])

# Comparison for possible gender bias

* word cloud source: https://github.com/amueller/word_cloud

In [34]:
# Split the members by recorded gender, then compare their suggestion terms
# (group a = female, group b = male).
df_male = df.loc[df["gender"] == "male"]
df_female = df.loc[df["gender"] == "female"]

compare_suggestions(df_a=df_female, df_b=df_male)

wife: -0.0852534562212
        a: 0 b: 37
congress: -0.0640467656597
        a: 11 b: 72
wiki: -0.0637480798771
        a: 18 b: 100
congressman: -0.0506912442396
        a: 0 b: 22
linkedin: -0.0483870967742
        a: 0 b: 21
obituary: -0.0437361324458
        a: 1 b: 23
primary: -0.0435654548558
        a: 5 b: 39
md: -0.036866359447
        a: 0 b: 16
actor: -0.0322580645161
        a: 0 b: 14
jr: -0.0299112476532
        a: 1 b: 17
net worth: -0.0296125618706
        a: 8 b: 45
committee assignments: -0.0274790919952
        a: 4 b: 28
senator: -0.0252602833248
        a: 2 b: 19
email address: -0.0230414746544
        a: 0 b: 10
iran: -0.0230414746544
        a: 0 b: 10
ballotpedia: -0.0230414746544
        a: 0 b: 10
contact: -0.0229134664619
        a: 3 b: 22
donald trump: -0.0207373271889
        a: 0 b: 9
ted cruz: -0.0184331797235
        a: 0 b: 8
sr: -0.0184331797235
        a: 0 b: 8
texas: -0.0184331797235
        a: 0 b: 8
trump: -0.016043693463
        a: 2 b: 15
on t

([u'wife',
  u'congress',
  u'wiki',
  u'congressman',
  u'linkedin',
  u'obituary',
  u'primary',
  u'md',
  u'actor',
  u'jr',
  u'net worth',
  u'committee assignments',
  u'senator',
  u'email address',
  u'iran',
  u'ballotpedia',
  u'contact',
  u'donald trump',
  u'ted cruz',
  u'sr',
  u'texas',
  u'trump',
  u'on the issues',
  u'iii',
  u'contact information',
  u'freedom caucus',
  u'stats',
  u'virginia',
  u'artist',
  u'ga',
  u'frankenstein',
  u'faa',
  u'lawyer',
  u'baseball',
  u'hudl',
  u'photography',
  u'sensenbrenner',
  u'nfl',
  u'chicken',
  u'art',
  u'tennessee',
  u'puerto rico',
  u'planned parenthood',
  u'mother',
  u'height',
  u'attorney',
  u'tx',
  u'georgia',
  u'renacci',
  u'india',
  u'usaid',
  u'salary',
  u'brookings',
  u'books',
  u'definition',
  u'ways and means',
  u'georgetown',
  u'new york',
  u'college',
  u'climate',
  u'polls',
  u'kildee',
  u'leadership pac',
  u'new mexico',
  u'nj',
  u'iran deal',
  u'dds',
  u'missouri',
  u'

In [35]:
# Terms appended by compare_suggestions so far, in the order they were printed.
key_list

[u'wife',
 u'congress',
 u'wiki',
 u'congressman',
 u'linkedin',
 u'obituary',
 u'primary',
 u'md',
 u'actor',
 u'jr',
 u'net worth',
 u'committee assignments',
 u'senator',
 u'email address',
 u'iran',
 u'ballotpedia',
 u'contact',
 u'donald trump',
 u'ted cruz',
 u'sr',
 u'texas',
 u'trump',
 u'on the issues',
 u'iii',
 u'contact information',
 u'freedom caucus',
 u'stats',
 u'virginia',
 u'artist',
 u'ga',
 u'frankenstein',
 u'faa',
 u'lawyer',
 u'baseball',
 u'hudl',
 u'photography',
 u'sensenbrenner',
 u'nfl',
 u'chicken',
 u'art',
 u'tennessee',
 u'puerto rico',
 u'planned parenthood',
 u'mother',
 u'height',
 u'attorney',
 u'tx',
 u'georgia',
 u'renacci',
 u'india',
 u'usaid',
 u'salary',
 u'brookings',
 u'books',
 u'definition',
 u'ways and means',
 u'georgetown',
 u'new york',
 u'college',
 u'climate',
 u'polls',
 u'kildee',
 u'leadership pac',
 u'new mexico',
 u'nj',
 u'iran deal',
 u'dds',
 u'missouri',
 u'songs',
 u'representative',
 u'immigration',
 u'age',
 u'cincinnati',

In [13]:
# Word-level view of the suggestions: split every suggestion on spaces, drop
# its first two words, and flatten the remaining words into one list per gender.
temp_list = [[e.split(' ')[2:] for e in i] for i in df_male['suggestion']]
male_list = [word for i in temp_list for k in i for word in k]

temp0_list = [[e.split(' ')[2:] for e in i] for i in df_female['suggestion']]
female_list = [word for i in temp0_list for k in i for word in k]

female_list

[u'staff',
 u'twitter',
 u'office',
 u'internship',
 u'retire',
 u'2016',
 u'net',
 u'worth',
 u'hillary',
 u'clinton',
 u'press',
 u'secretary',
 u'email',
 u'retiring',
 u'staff',
 u'height',
 u'scholarship',
 u'internship',
 u'twitter',
 u'committees',
 u'quotes',
 u'staff',
 u'facebook',
 u'twitter',
 u'committees',
 u'for',
 u'senate',
 u'energy',
 u'bill',
 u'biography',
 u'primary',
 u'2016',
 u'dc',
 u'office',
 u'staff',
 u'for',
 u'senate',
 u'twitter',
 u'committee',
 u'2016',
 u'appropriations',
 u'internship',
 u'help',
 u'committee',
 u'for',
 u'senate',
 u'twitter',
 u'committees',
 u'endorsement',
 u'staff',
 u'trump',
 u'vice',
 u'president',
 u'family',
 u'district',
 u'staff',
 u'chief',
 u'of',
 u'staff',
 u'dc',
 u'office',
 u'committees',
 u'twitter',
 u'office',
 u'for',
 u'senate',
 u'supreme',
 u'court',
 u'scalia',
 u'staff',
 u'senate',
 u'twitter',
 u'office',
 u'family',
 u'governor',
 u'gun',
 u'control',
 u'approval',
 u'rating',
 u'website',
 u'hillary',

In [36]:
import csv
from itertools import izip

# Write three wide rows to input.csv: the terms, then the counts collected in
# test2 (group b) and test1 (group a). The `with` block closes the file.
with open('input.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(key_list)
    wr.writerow(test2)
    wr.writerow(test1)

# Transpose input.csv into output1.csv (one row per term). The * unpacking
# reads every row up front, so the input file can be closed before writing.
# Both files are now closed deterministically instead of being left open.
with open("input.csv", "rb") as infile:
    a = izip(*csv.reader(infile))
with open("output1.csv", "wb") as outfile:
    csv.writer(outfile).writerows(a)

# Comparison between Democrats and Republicans

In [41]:
# Split the members by party, then compare their suggestion terms
# (group a = Democrat, group b = Republican).
df_democrat = df.loc[df["party"] == "Democrat"]
df_republic = df.loc[df["party"] == "Republican"]

compare_suggestions(df_a=df_democrat, df_b=df_republic)

primary: -0.093511823334
        m: 7 f: 37
facebook: -0.0895055696697
        m: 72 f: 118
voting record: -0.0723079929646
        m: 16 f: 42
trump: -0.0531561461794
        m: 0 f: 16
for congress: -0.0480750439711
        m: 104 f: 146
wife: -0.0443619308188
        m: 10 f: 26
donald trump: -0.0299003322259
        m: 0 f: 9
rubio: -0.0232558139535
        m: 0 f: 7
md: -0.0230603869455
        m: 4 f: 12
scalia: -0.0199335548173
        m: 0 f: 6
for senate: -0.0195427008013
        m: 8 f: 16
ted cruz: -0.0190541332812
        m: 1 f: 7
phone number: -0.0190541332812
        m: 1 f: 7
supreme court: -0.0188587062732
        m: 5 f: 12
marco rubio: -0.0166112956811
        m: 0 f: 5
freedom caucus: -0.0166112956811
        m: 0 f: 5
endorsement: -0.0164158686731
        m: 4 f: 10
actor: -0.0164158686731
        m: 4 f: 10
family: -0.0164158686731
        m: 4 f: 10
committee assignments: -0.016025014657
        m: 12 f: 20
address: -0.015341020129
        m: 9 f: 16
net worth: -

([u'primary',
  u'facebook',
  u'voting record',
  u'trump',
  u'for congress',
  u'wife',
  u'donald trump',
  u'rubio',
  u'md',
  u'scalia',
  u'for senate',
  u'ted cruz',
  u'phone number',
  u'supreme court',
  u'marco rubio',
  u'freedom caucus',
  u'endorsement',
  u'actor',
  u'family',
  u'committee assignments',
  u'address',
  u'net worth',
  u'district',
  u'rand paul',
  u'faa',
  u'hudl',
  u'sensenbrenner',
  u'nfl',
  u'art',
  u'photography',
  u'staff',
  u'height',
  u'campaign',
  u'artist',
  u'tx',
  u'books',
  u'son',
  u'college',
  u'climate',
  u'renacci',
  u'twitter',
  u'poll',
  u'ad',
  u'football',
  u'llc',
  u'tn',
  u'mobile al',
  u'budget',
  u'az',
  u'primary challenger',
  u'alabama',
  u'company',
  u'sister',
  u'2015',
  u'crapo',
  u'kkk',
  u'canada',
  u'views',
  u'mma',
  u'beard',
  u'conaway',
  u'imdb',
  u'letter',
  u'nevada',
  u'realtor',
  u'michigan',
  u'watson',
  u'north korea',
  u'ohio',
  u'singer',
  u'pakistan',
  u'wor

In [56]:
# Scrape the Congressional Black Caucus member list and split `df` into
# members (group a) and everyone else (group b).
black_caucus_df, not_black_caucus_df = get_caucus_members(
    df,
    member_list_url='https://cbc-butterfield.house.gov/members',
    prefix="The Honorable ",
    title_class="member-title",
)

compare_suggestions(black_caucus_df, not_black_caucus_df)
# Blank line, then the size of each group.
print("")
print(len(black_caucus_df))
print(len(not_black_caucus_df))

committees: -0.193636044881
        a: 14 b: 247
staff: -0.154540673212
        a: 26 b: 357
twitter: -0.146914446003
        a: 27 b: 364
facebook: -0.121756661992
        a: 11 b: 179
wiki: -0.0933555399719
        a: 6 b: 111
internship: -0.0788043478261
        a: 5 b: 93
email: -0.0584677419355
        a: 0 b: 29
for congress: -0.0526823281907
        a: 19 b: 231
address: -0.0504032258065
        a: 0 b: 25
contact: -0.0504032258065
        a: 0 b: 25
senate: -0.047247545582
        a: 2 b: 45
dc office: -0.0462833099579
        a: 9 b: 120
senator: -0.0423387096774
        a: 0 b: 21
linkedin: -0.0423387096774
        a: 0 b: 21
tpp: -0.0383064516129
        a: 0 b: 19
trump: -0.0342741935484
        a: 0 b: 17
supreme court: -0.0342741935484
        a: 0 b: 17
md: -0.0322580645161
        a: 0 b: 16
office: -0.0294530154278
        a: 12 b: 144
actor: -0.0282258064516
        a: 0 b: 14
2016: -0.0262096774194
        a: 0 b: 13
for senate: -0.0246318373072
        a: 1 b: 23
ob

In [100]:
# Scrape the Congressional Hispanic Caucus member list and split `df` into
# members (group a) and everyone else (group b).
hisp_caucus_df, not_hisp_caucus_df = get_caucus_members(
    df,
    member_list_url='http://congressionalhispaniccaucus-sanchez.house.gov/members',
    prefix="Rep. ",
    title_class="views-field-title",
)

compare_suggestions(hisp_caucus_df, not_hisp_caucus_df)
# Blank line, then the size of each group.
print("")
print(len(hisp_caucus_df))
print(len(not_hisp_caucus_df))

[]
dc office: -0.209600477042
        a: 1 b: 128
staff: -0.176654740608
        a: 14 b: 369
district: -0.116577221228
        a: 1 b: 80
voting record: -0.112403100775
        a: 0 b: 58
net worth: -0.102713178295
        a: 0 b: 53
committees: -0.101818723912
        a: 10 b: 251
senate: -0.0910852713178
        a: 0 b: 47
primary: -0.0852713178295
        a: 0 b: 44
for congress: -0.0805008944544
        a: 10 b: 240
wife: -0.0697674418605
        a: 0 b: 36
address: -0.0484496124031
        a: 0 b: 25
trump: -0.0329457364341
        a: 0 b: 17
supreme court: -0.0329457364341
        a: 0 b: 17
md: -0.031007751938
        a: 0 b: 16
retiring: -0.031007751938
        a: 0 b: 16
family: -0.0271317829457
        a: 0 b: 14
endorsement: -0.0271317829457
        a: 0 b: 14
actor: -0.0271317829457
        a: 0 b: 14
wiki: -0.0247465712582
        a: 5 b: 112
email address: -0.0193798449612
        a: 0 b: 10
artist: -0.0193798449612
        a: 0 b: 10
website: -0.0193798449612
        a:

In [None]:
# The Congressional Hispanic Caucus and Congressional Hispanic Conference are similar
# The Caucus is Democrat-controlled, while the Conference is Republican
# The website of the Conference is a little more challenging to parse
# https://hispanicconference-mariodiazbalart.house.gov/membership