# District distances

**Procedure**
1. create a DF of the congress with
    1. all twitter id pairs 
    2. all district pairs
2. use geopandas to calculate distance between districts

In [10]:
import pandas as pd
from congress import Congress
import numpy as np

In [11]:
# Read the Congress API key from the environment instead of committing it to
# the notebook. Set CONGRESS_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded here remains in the file
# history and should be rotated.
import os

congress = Congress(os.environ['CONGRESS_API_KEY'])

### Load the congress data into DF

In [12]:
# Query the 'house' member list for congress=115 and wrap the 'members'
# records of the first result in a DataFrame (one row per member).
df_house = pd.DataFrame(
    congress.members.filter('house', congress=115)[0]['members']
)

In [13]:
# Keep only members that have a Twitter account and are still in office,
# then renumber the remaining rows from 0.
house_twonly = (
    df_house
    .dropna(subset=['twitter_account'])
    .loc[lambda members: members.in_office]
    .reset_index(drop=True)
)

### Create pairs for each member of Congress

- for each member a (1 row), take every other member b and append b's columns to a's row (with concat)

In [14]:
# Build every ordered pair (a, b) of distinct members in one step.
# Each result row holds member a's columns followed by member b's columns, so
# every column name appears twice (first occurrence = a, second = b).
n_members = len(house_twonly)

# Row positions of a and b for all n*n combinations, ordered
# (0,0), (0,1), ..., (0,n-1), (1,0), ... -- the same order a nested loop gives.
idx_a = np.repeat(np.arange(n_members), n_members)
idx_b = np.tile(np.arange(n_members), n_members)

# Remove self-references (a member paired with itself).
not_self = idx_a != idx_b

# Side-by-side concat; both halves get a fresh 0..N-1 index so rows line up.
member_pairs = pd.concat(
    [
        house_twonly.iloc[idx_a[not_self]].reset_index(drop=True),
        house_twonly.iloc[idx_b[not_self]].reset_index(drop=True),
    ],
    axis=1,
)

In [15]:
# Replace the index with a fresh 0..N-1 range; the previous index is discarded.
member_pairs = member_pairs.reset_index(drop=True)

In [16]:
# Peek at the last three pairs. The column name occurs twice
# (reference member first, paired member second), so two columns are shown.
member_pairs['twitter_account'].tail(3)

Unnamed: 0,twitter_account,twitter_account.1
178503,RepLeeZeldin,RepTedYoho
178504,RepLeeZeldin,RepDavidYoung
178505,RepLeeZeldin,RepDonYoung


In [17]:
# Persist the pair table to 'member_pairs.csv' in the current working directory.
member_pairs.to_csv('member_pairs.csv')

### Load voting districts into geopandas

In [1]:
import json
import os
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# spatial stuff
import geopandas as gpd
import fiona
import folium
import shapely

- shapefiles from https://www.census.gov/geo/maps-data/data/cbf/cbf_cds.html

In [2]:
# Show the current working directory (sanity check before reading data files).
os.getcwd()

'C:\\Users\\Philipp\\GDrive\\Coding\\Python\\SocialDataScience\\Project_tsds\\Philipp'

In [3]:
# Folder that holds the congressional-district shapefile.
# Override it with the DISTRICTS_SHP_DIR environment variable; the original
# local folder stays as the default so existing setups keep working.
ppath = os.environ.get(
    'DISTRICTS_SHP_DIR',
    'C:/Users/Philipp/GDrive/Coding/Python/SocialDataScience/Project_tsds/rawdata/congress_disticts2017_500k/',
)

# os.path.join works whether or not the folder ends with a path separator.
districts = gpd.read_file(os.path.join(ppath, 'cb_2017_us_cd115_500k.shp'))

In [4]:
districts.head(3)
# GEOID identifies a congressional district: in the rows shown it is STATEFP
# followed by the district number from CD115FP (e.g. 13 + 09 -> 1309).

Unnamed: 0,STATEFP,CD115FP,AFFGEOID,GEOID,LSAD,CDSESSN,ALAND,AWATER,geometry
0,13,9,5001500US1309,1309,C2,115,13497964615,411754565,"POLYGON ((-84.65622499999999 34.730984, -84.65..."
1,19,4,5001500US1904,1904,C2,115,58937921470,264842664,"POLYGON ((-96.63970399999999 42.737071, -96.63..."
2,17,10,5001500US1710,1710,C2,115,777307694,31695461,"POLYGON ((-88.19882 42.41557, -88.198601 42.41..."


### Match districts via geoid 

In [5]:
#generate centroids
def get_centroids(x):
    """Return the ``centroid`` attribute of a single geometry ``x``."""
    centroid = x.centroid
    return centroid

# Add a 'centroids' column holding the centroid of each district geometry.
# NOTE(review): the coordinates printed by districts.head() look like lon/lat
# degrees, so centroids (and distances derived from them) are presumably in
# degrees rather than metres -- confirm the CRS.
districts['centroids'] = districts.geometry.apply(get_centroids)

In [6]:
# Distance between the centroids of the districts at index 0 and 1.
districts.centroids[0].distance(districts.centroids[1])

13.634726788599925

In [7]:
# Same two districts, but distance between the full geometries instead of the centroids.
districts.geometry[0].distance(districts.geometry[1])

10.771112182700213

> Should we use centroids or polygons to calculate distances?

1. load data
1. take geoid of member A and member B
    1. there are two geoid columns: the first belongs to the reference member, the second to the member paired with them
2. insert centroids
2. calculate distances of centroids

In [83]:
# Example lookup: the centroid of the district whose GEOID is '3202',
# returned as a Series re-indexed from 0.
districts.loc[districts.GEOID == '3202', 'centroids'].reset_index(drop=True)

0    POINT (-117.3060329020225 40.64564001854976)
Name: centroids, dtype: object

> Insert centroids for member A and B

In [90]:
def insert_geoinfo(gid):
    """Return the centroid(s) of the district(s) whose GEOID equals ``gid``.

    The result is a Series taken from ``districts['centroids']`` and
    re-indexed from 0; it is empty when no district matches.
    """
    matches = districts.loc[districts.GEOID == gid, 'centroids']
    return matches.reset_index(drop=True)

# Look up both centroids through one GEOID -> centroid mapping instead of
# re-filtering `districts` once per pair. GEOIDs without a matching district
# end up as missing values.
centroid_by_geoid = districts.set_index('GEOID')['centroids']

# member_pairs.geoid selects two columns (the name is duplicated):
# position 0 is the reference member, position 1 the paired member.
member_pairs['geo_a'] = member_pairs.geoid.iloc[:, 0].map(centroid_by_geoid) #reference member
member_pairs['geo_b'] = member_pairs.geoid.iloc[:, 1].map(centroid_by_geoid) #all other members

In [93]:
# Last two reference-member centroids.
member_pairs['geo_a'].iloc[-2:]

178504    POINT (-72.67688914616767 40.90605078664418)
178505    POINT (-72.67688914616767 40.90605078664418)
Name: geo_a, dtype: object

In [94]:
# Last two paired-member centroids.
member_pairs['geo_b'].iloc[-2:]

178504    POINT (-94.63734260563108 41.20393250776424)
178505    POINT (-152.2211789507757 64.21754588435303)
Name: geo_b, dtype: object

In [109]:
def get_dist(centr_a, centr_b):
    """Return ``centr_a.distance(centr_b)`` -- the distance between two centroids."""
    return centr_a.distance(centr_b)

# Distance between the two centroids of every pair. Iterating over just the two
# centroid columns avoids DataFrame.apply(axis=1), which builds a full row
# object for each of the pairs.
# NOTE(review): the result is in the units of the districts' coordinates; those
# look like lon/lat degrees in the printed output, so these values are
# presumably degrees, not kilometres -- confirm before interpreting them.
member_pairs['distance'] = [
    get_dist(centr_a, centr_b)
    for centr_a, centr_b in zip(member_pairs['geo_a'], member_pairs['geo_b'])
]

In [124]:
# Persist the pair table, now including centroids and distances, to CSV.
member_pairs.to_csv('member_pairs_distances.csv')

In [130]:
# Display the computed distance column (pandas truncates the long output).
member_pairs.distance

0         11.516319
1          5.188269
2         25.635701
3          9.625330
4         26.983409
5          9.507826
6          2.903558
7         10.407425
8         11.428105
9         17.873344
10         9.907373
11        26.496168
12         4.871841
13        26.634646
14        12.082821
15        30.150932
16        15.093971
17        16.279354
18        19.951584
19        10.015735
20         7.413664
21        13.435587
22        21.554851
23         7.494357
24         5.559608
25        10.757532
26        33.294883
27        17.849584
28        34.411386
29         6.666260
            ...    
178476    25.742207
178477    28.273255
178478     1.290382
178479    14.491955
178480    18.004862
178481    11.715521
178482    46.968461
178483     8.376286
178484    13.560138
178485    45.566752
178486    21.238326
178487    16.681015
178488    46.192155
178489     1.968108
178490    25.005443
178491    15.395293
178492     3.169168
178493    11.050747
178494    21.594680
