# District distances

**Procedure**
1. create a DF of the congress with
    1. all twitter id pairs 
    2. all district pairs
2. use geopandas to calculate distance between districts

In [10]:
import pandas as pd
from congress import Congress
import numpy as np

In [11]:
congress = Congress('qa1qfTzGinGGLcNCbinp2OTlI2SvzBe4KFgxYobm')

### Load the congress data into DF

In [12]:
#load the congress data
df_house = pd.DataFrame(congress.members.filter('house', congress=115)[0]['members'])

In [13]:
#drop rows if twitter_account == None
house_twonly = df_house.dropna(subset=['twitter_account'])
#drop those who are not in office anymore
house_twonly = house_twonly[house_twonly.in_office].reset_index(drop=True)

### Create pairs for each congresswomen

- for each member a (1 row), take another member b and add her information (with concat)

In [14]:
lastcol = house_twonly.shape[0]
member_pairs = pd.DataFrame()
df_list = []

for i in range(lastcol):
    mem = pd.DataFrame([house_twonly.iloc[i,:]]*(lastcol)).reset_index(drop=True) #create df with member i 
    mem = pd.concat([mem, house_twonly], axis=1) #merge member and others
    mem = mem.drop(mem.index[i]).reset_index(drop=True) #remove relf-reference
    df_list.append(mem) #store in list containing dataframes
    
#merge them all together
member_pairs = pd.concat(df_list) 

In [15]:
member_pairs = member_pairs.reset_index(drop=True)

In [131]:
member_pairs.id.tail(3)

Unnamed: 0,id,id.1
178503,Z000017,Y000065
178504,Z000017,Y000066
178505,Z000017,Y000033


#### finally, drop duplicates

In [168]:
#use frozenset() to ignore order of tuples
def create_idtuples(id_a, id_b):
    pair = frozenset((id_a, id_b))
    return pair

In [161]:
#assign id_a and _b for convenience
member_pairs['id_a'] = member_pairs.id.iloc[:,0]
member_pairs['id_b'] = member_pairs.id.iloc[:,1]

In [169]:
member_pairs['id_tuples'] = member_pairs.apply(lambda x: create_idtuples(x.id_a, x.id_b), axis=1)

In [170]:
member_pairs.id_tuples.head(3)

0    (A000370, A000374)
1    (A000374, A000055)
2    (A000374, A000371)
Name: id_tuples, dtype: object

In [184]:
member_pairs= member_pairs.drop_duplicates('id_tuples')

In [186]:
member_pairs.shape

(89253, 92)

### Load voting districts into geopandas

In [2]:
import json
import os
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# spatial stuff
import geopandas as gpd
import fiona
import folium
import shapely

- shapefiles from https://www.census.gov/geo/maps-data/data/cbf/cbf_cds.html

In [188]:
#os.chdir("..\")
os.getcwd()

'C:\\Users\\Philipp\\GDrive\\Coding\\Python\\SocialDataScience\\Project_tsds\\Philipp'

In [189]:
ppath = 'C:/Users/Philipp/GDrive/Coding/Python/SocialDataScience/Project_tsds/rawdata/congress_disticts2017_500k/'
districts = gpd.read_file(ppath+'cb_2017_us_cd115_500k.shp')

In [190]:
districts.head(3)
#GEOID or STATEFP gives congressional districts

Unnamed: 0,STATEFP,CD115FP,AFFGEOID,GEOID,LSAD,CDSESSN,ALAND,AWATER,geometry
0,13,9,5001500US1309,1309,C2,115,13497964615,411754565,"POLYGON ((-84.65622499999999 34.730984, -84.65..."
1,19,4,5001500US1904,1904,C2,115,58937921470,264842664,"POLYGON ((-96.63970399999999 42.737071, -96.63..."
2,17,10,5001500US1710,1710,C2,115,777307694,31695461,"POLYGON ((-88.19882 42.41557, -88.198601 42.41..."


### Match districts via geoid 

In [191]:
#generate centroids
def get_centroids(x):
    return x.centroid

districts['centroids'] = districts.geometry.apply(get_centroids)

In [192]:
districts.centroids[0].distance(districts.centroids[1])

13.634726788599925

In [193]:
districts.geometry[0].distance(districts.geometry[1])

10.771112182700213

>Do we better use centroids or polygons to calculate distances?

1. load data
1. take geoid of member A and member B
    1. there are two columns of geoid, the first is the reference member and the second all other matched members
2. insert centroids
2. calculate distances of centroids

In [194]:
#how to select centroid
districts[districts.GEOID == '3202'].centroids.reset_index(drop=True)

0    POINT (-117.3060329020225 40.64564001854976)
Name: centroids, dtype: object

> Insert centroids for member A and B

In [195]:
def insert_geoinfo(gid):
    #find geoinformation of A and of B in districts df, append to member_pairs df 
    geoinfo = districts[districts.GEOID == gid].centroids.reset_index(drop=True)
    return geoinfo

#df_list = [insert_geoinfo]

member_pairs['geo_a'] = member_pairs.geoid.iloc[:,0].apply(insert_geoinfo) #reference member
member_pairs['geo_b'] = member_pairs.geoid.iloc[:,1].apply(insert_geoinfo) #all other members

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [196]:
member_pairs.geo_a[-2:]

177661    POINT (-94.63734260563108 41.20393250776424)
178083    POINT (-152.2211789507757 64.21754588435303)
Name: geo_a, dtype: object

In [197]:
member_pairs.geo_b[-2:]

177661    POINT (-72.67688914616767 40.90605078664418)
178083    POINT (-72.67688914616767 40.90605078664418)
Name: geo_b, dtype: object

In [198]:
#calculate distance between geo_a and geo_b (both centroids)
def get_dist(centr_a, centr_b):
    dist = centr_a.distance(centr_b)
    return dist

member_pairs['distance'] = member_pairs.apply(lambda x: get_dist(x.geo_a, x.geo_b), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [199]:
member_pairs.to_csv('member_pairs_distances.csv')

In [201]:
member_pairs.shape

(89253, 92)

### 89253 is exactly all possible combinations of 423 members

In [1]:
path = os.getcwd()
df = pd.read_csv(path+"\\member_pairs_distances.csv")
df = df.reset_index(drop=True)

NameError: name 'os' is not defined

In [28]:
print (df.columns)

Index(['Unnamed: 0', 'api_uri', 'at_large', 'contact_form', 'crp_id',
       'cspan_id', 'date_of_birth', 'district', 'dw_nominate',
       'facebook_account', 'fax', 'fec_candidate_id', 'first_name', 'gender',
       'geoid', 'google_entity_id', 'govtrack_id', 'icpsr_id', 'id',
       'ideal_point', 'in_office', 'last_name', 'leadership_role',
       'middle_name', 'missed_votes', 'missed_votes_pct', 'next_election',
       'ocd_id', 'office', 'party', 'phone', 'rss_url', 'seniority',
       'short_title', 'state', 'suffix', 'title', 'total_present',
       'total_votes', 'twitter_account', 'url', 'votes_with_party_pct',
       'votesmart_id', 'youtube_account', 'api_uri.1', 'at_large.1',
       'contact_form.1', 'crp_id.1', 'cspan_id.1', 'date_of_birth.1',
       'district.1', 'dw_nominate.1', 'facebook_account.1', 'fax.1',
       'fec_candidate_id.1', 'first_name.1', 'gender.1', 'geoid.1',
       'google_entity_id.1', 'govtrack_id.1', 'icpsr_id.1', 'id.1',
       'ideal_point.1', 'i

In [29]:
df = df.filter(['id_a', 'id_b', 'twitter_account', 'twitter_account.1', 'distance'])

In [31]:
df = df.rename(index=str, columns={'twitter_account': 'twitter_account_a', 'twitter_account.1': 'twitter_account_b'})

In [32]:
df.head()

Unnamed: 0,id_a,id_b,twitter_account_a,twitter_account_b,distance
0,A000374,A000370,RepAbraham,RepAdams,11.516319
1,A000374,A000055,RepAbraham,Robert_Aderholt,5.188269
2,A000374,A000371,RepAbraham,reppeteaguilar,25.635701
3,A000374,A000372,RepAbraham,reprickallen,9.62533
4,A000374,A000369,RepAbraham,MarkAmodeiNV2,26.983409


In [34]:
df.to_csv('member_pairs_distances_reduced.csv')

## Mess around with centroids

In [69]:
df.columns

Index(['Unnamed: 0', 'api_uri', 'at_large', 'contact_form', 'crp_id',
       'cspan_id', 'date_of_birth', 'district', 'dw_nominate',
       'facebook_account', 'fax', 'fec_candidate_id', 'first_name', 'gender',
       'geoid', 'google_entity_id', 'govtrack_id', 'icpsr_id', 'id',
       'ideal_point', 'in_office', 'last_name', 'leadership_role',
       'middle_name', 'missed_votes', 'missed_votes_pct', 'next_election',
       'ocd_id', 'office', 'party', 'phone', 'rss_url', 'seniority',
       'short_title', 'state', 'suffix', 'title', 'total_present',
       'total_votes', 'twitter_account', 'url', 'votes_with_party_pct',
       'votesmart_id', 'youtube_account', 'api_uri.1', 'at_large.1',
       'contact_form.1', 'crp_id.1', 'cspan_id.1', 'date_of_birth.1',
       'district.1', 'dw_nominate.1', 'facebook_account.1', 'fax.1',
       'fec_candidate_id.1', 'first_name.1', 'gender.1', 'geoid.1',
       'google_entity_id.1', 'govtrack_id.1', 'icpsr_id.1', 'id.1',
       'ideal_point.1', 'i

In [67]:
districts

NameError: name 'districts' is not defined

In [75]:
select = df[df.geo_a]
select.plot(markersize=5,figsize=(10, 10), alpha=.8)

KeyError: "['POINT (-91.82491765269089 31.7677401747569)'\n 'POINT (-91.82491765269089 31.7677401747569)'\n 'POINT (-91.82491765269089 31.7677401747569)' ...\n 'POINT (-94.63734260563108 41.20393250776424)'\n 'POINT (-94.63734260563108 41.20393250776424)'\n 'POINT (-152.2211789507757 64.21754588435303)'] not in index"