## Notebook exploring the geodata extracted from adam4adam profiles based in MA
### by Mary Ruth Ngo under supervision of Professor Octavio Gonzalez

1. Using geopy, I can use town data I scraped and convert it to lat/long values

In [2]:
# Install geopy, which provides the Nominatim geocoder imported in the next cell.
!pip install geopy



In [4]:
from geopy.geocoders import Nominatim

# Nominatim's usage policy asks every client to identify itself, and recent
# geopy releases refuse to build the geocoder without a custom user agent.
geolocator = Nominatim(user_agent="adam4adam-ma-geodata-notebook")

# Smoke test: geocode one well-known city and inspect what comes back.
location = geolocator.geocode("Chicago Illinois")
print(location.raw)
help(location)

{u'display_name': u'Chicago, Cook County, Illinois, United States of America', u'importance': 1.0026476104889, u'place_id': u'143390527', u'lon': u'-87.6244211', u'lat': u'41.8755546', u'osm_type': u'relation', u'licence': u'Data \xa9 OpenStreetMap contributors, ODbL 1.0. http://www.openstreetmap.org/copyright', u'osm_id': u'122604', u'boundingbox': [u'41.643919', u'42.0230219', u'-87.9401009', u'-87.523984'], u'type': u'city', u'class': u'place', u'icon': u'https://nominatim.openstreetmap.org/images/mapicons/poi_place_city.p.20.png'}
Help on Location in module geopy.location object:

class Location(__builtin__.object)
 |  Contains a parsed geocoder response. Can be iterated over as
 |  (location<String>, (latitude<float>, longitude<Float)). Or one can access
 |  the properties `address`, `latitude`, `longitude`, or `raw`. The last
 |  is a dictionary of the geocoder's response for this item.
 |  
 |  .. versionadded:: 0.98
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |

In [5]:
import pandas as pd
import plotly

The CSV file `ma_plus_race.csv` stores the scraped profile data (most recently updated to include a race/ethnicity field for each profile).

In [6]:
# DataFrame.from_csv is deprecated (and gone in pandas 1.0). This is its
# documented read_csv equivalent: first column as the index, dates parsed.
ma = pd.read_csv("ma_plus_race.csv", index_col=0, parse_dates=True)

In [7]:
# Unique towns written as a single name (no ", "-separated second part), with
# missing values dropped and the state name appended to each.
one_city_list = list({
    town + " Massachusetts"
    for town in ma["town"].values.tolist()
    if str(town) != "nan" and len(str(town).split(", ")) == 1
})

In [8]:
# Geocode every single-name town. lats/lons stay positionally aligned with
# one_city_list: a town the geocoder cannot resolve gets NaN coordinates.
lats = []
lons = []
for i in one_city_list:
    location = geolocator.geocode(i, timeout=10)
    if location is None:
        # geocode() returns None when there is no match; report the town
        # instead of crashing on `None.latitude`.
        print(i)
        lats.append(float("nan"))
        lons.append(float("nan"))
        continue
    lats.append(location.latitude)
    lons.append(location.longitude)

In [9]:
# Unique towns written as exactly two ", "-separated parts, with missing
# values dropped and the state name appended to each.
two_city_list = list({
    town + " Massachusetts"
    for town in ma["town"].values.tolist()
    if str(town) != "nan" and len(str(town).split(", ")) == 2
})

In [10]:
# Drop every entry mentioning "Other". Building a new list avoids calling
# remove() on the list being iterated, which skips the element that follows
# each removal and can leave "Other" entries behind.
two_city_list = [i for i in two_city_list if "Other" not in i]

In [11]:
# Keep only the second ", "-separated part of each two-part entry.
second_parts = []
for entry in two_city_list:
    pieces = entry.split(", ")
    second_parts.append(pieces[1])
two_city_list = second_parts

In [12]:
# Single-name towns first, then the names taken from the two-part entries.
all_cities = list(one_city_list)
all_cities.extend(two_city_list)
len(all_cities)

134

**Edge cases:**

In [None]:
# Rewrite the names the geocoder needs help with and drop "Brickell" entries.
# A fresh list is built because removing from all_cities while enumerating it
# shifts the remaining items and leaves the next entry unchecked.
cleaned_cities = []
for v in all_cities:
    if "Leather" in v:
        cleaned_cities.append("Leather District, Boston")
    elif "Bay Village" in v:
        cleaned_cities.append("Bay Village")
    elif "Brickell" in v:
        continue
    elif "Boston Airport" in v:
        cleaned_cities.append("Boston Logan International Airport, Boston")
    else:
        cleaned_cities.append(v)
all_cities = cleaned_cities

Store conversion data in dictionary and pandas frame

In [None]:
# Map each city name to its geocoded {"lat": ..., "lon": ...}.
geo_dict = {}
for i in all_cities:
    location = geolocator.geocode(i, timeout=10)
    if location is None:
        # No match: report the name and skip it. Falling through would store
        # the previous city's coordinates under this name (or raise NameError
        # on the very first city).
        print(i)
        continue
    geo_dict[i] = {"lat": location.latitude, "lon": location.longitude}

In [None]:
# Hand-entered coordinates for two places, added under their short names.
geo_dict.update({
    "Boston Logan Airport": {"lat": 42.366828, "lon": -71.027330},
    "Leather District": {"lat": 42.350807, "lon": -71.057969},
})

In [None]:
# Transpose so each place name becomes a row with "lat" and "lon" columns.
geo_frame = pd.DataFrame(geo_dict).T

In [None]:
# Show the row labels (place names) of geo_frame.
geo_frame.index

Use the conversion dictionary to add a latitude and longitude value to each profile in the pandas DataFrame.

In [None]:
# For every profile, work out a cleaned town name plus its coordinates.
#
# Places that have no "<name> Massachusetts" key in geo_dict fall back to
# hand-entered coordinates: (substring to look for, cleaned name, lat, lon).
# They are tried in this order and the first match wins.
manual_places = [
    ("Bay Village", "Bay Village", 42.349176, -71.069591),
    ("Leather", "Leather District", 42.350807, -71.057969),
    ("Airport", "Boston Logan Airport", 42.366828, -71.027330),
]

# Unknown coordinates are a real float NaN so the lat/lon lists stay numeric.
# The town label keeps the "nan" string placeholder.
missing_coord = float("nan")

clean_towns = []
lats_list = []
lons_list = []
ma_city_values = [str(i) for i in ma["town"].values.tolist()]
for raw_town in ma_city_values:
    town, lat, lon = "nan", missing_coord, missing_coord
    if raw_town != "nan" and ":" not in raw_town:
        # "A, B" entries keep their second part, everything else is used whole.
        # Checking the split result (rather than `"," in ...`) avoids an
        # IndexError on values with a comma but no following space.
        parts = raw_town.split(", ")
        val = parts[1] if len(parts) > 1 else raw_town
        key = val + " Massachusetts"
        if key in geo_dict:
            town, lat, lon = key, geo_dict[key]["lat"], geo_dict[key]["lon"]
        else:
            for needle, name, place_lat, place_lon in manual_places:
                if needle in val:
                    town, lat, lon = name, place_lat, place_lon
                    break
    clean_towns.append(town)
    lats_list.append(lat)
    lons_list.append(lon)

In [None]:
# Attach the cleaned town name and its coordinates to every profile row,
# then show how many rows there are.
for column, values in (("towns", clean_towns), ("lat", lats_list), ("lon", lons_list)):
    ma[column] = values
len(ma["towns"])

In [None]:
from itertools import groupby

# Length of each run of identical *consecutive* entries in clean_towns.
# groupby only merges neighbours and the list is not sorted first.
# NOTE(review): the name looks like a typo for "lengths"; confirm nothing else
# relies on it before renaming.
lengtsh = [sum(1 for member in run) for town_name, run in groupby(clean_towns)]

**Check out the frequency breakdown for users around the state**

In [None]:
import collections

# Tally how often each cleaned town label occurs, then keep the 134 most
# common (label, count) pairs.
counter = collections.Counter()
counter.update(clean_towns)
frequencies = counter.most_common(134)

In [None]:
# Spot-check the stored latitude for one town.
geo_dict[u'Westborough Massachusetts']['lat']

In [None]:
# Build [lat, lon, count] rows for every counted label that has coordinates.
freq = []
for v in frequencies:
    try:
        coords = geo_dict[v[0]]
        freq.append([coords["lat"], coords["lon"], v[1]])
    except KeyError:
        # No coordinates stored under this label: report the (label, count)
        # pair. print(...) with one argument behaves the same on Python 2 and 3.
        print(v)

In [None]:
# Cleaned town label of every profile whose HIV status is "on PrEP".
on_prep = ma["hiv_status"] == "HIV Negative, on PrEP"
prep_towns = ma.loc[on_prep, "towns"].values.tolist()

In [None]:
# Same tally as before, restricted to the PrEP profiles: keep the 58 most
# common labels and build [lat, lon, count, label] rows for those that have
# coordinates.
counter = collections.Counter(prep_towns)
frequencies = counter.most_common(58)
freq = []
for v in frequencies:
    try:
        coords = geo_dict[v[0]]
        freq.append([coords["lat"], coords["lon"], v[1], v[0]])
    except KeyError:
        # No coordinates stored under this label: report the (label, count)
        # pair. print(...) with one argument behaves the same on Python 2 and 3.
        print(v)

In [None]:
# Display the rows collected in freq.
freq

Use plotly's geoscatter plot to try to visualize that frequency

In [None]:
import os

import plotly.tools as tls

# Credentials are read from the environment so the API key is never stored in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting Jupyter;
# a missing variable fails loudly with a KeyError.
# NOTE(review): the key that used to be written in this cell has been exposed
# and should be regenerated in the plotly account settings.
tls.set_credentials_file(
    username=os.environ["PLOTLY_USERNAME"],
    api_key=os.environ["PLOTLY_API_KEY"],
)

## Biggest issue:
The map data in plotly doesn't scale down to city scope within a state; for the United States it only goes to the state/country level. An alternative mapping tool must be found.

In [None]:
import plotly.plotly as py
import pandas as pd

# Profiles whose town could not be resolved carry a "nan" placeholder instead
# of coordinates. Coerce to numbers and drop those rows so that only
# plottable points remain.
located = pd.DataFrame({
    "town": ma["towns"],
    "lat": pd.to_numeric(ma["lat"], errors="coerce"),
    "lon": pd.to_numeric(ma["lon"], errors="coerce"),
}).dropna(subset=["lat", "lon"])

# Colour every marker by the number of located profiles that share its town.
# This cell used to colour by the `cnt` column of plotly's airport-traffic
# example dataset, which is unrelated to, and not aligned with, `ma`.
profiles_per_town = located["town"].map(located["town"].value_counts())

scl = [ [0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
    [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"] ]

# Points are placed by lat/lon, so no `locationmode` is set (it only applies
# to traces that use `locations`, and 'MA-cities' is not one of its values).
data = [ dict(
        type = 'scattergeo',
        lon = located['lon'],
        lat = located['lat'],
        text = located['town'],
        mode = 'markers',
        marker = dict(
            size = 8,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'circle',
            line = dict(
                width=1,
                # three components, so this is rgb(...), not rgba(...)
                color='rgb(102, 102, 102)'
            ),
            colorscale = scl,
            cmin = 0,
            color = profiles_per_town,
            cmax = profiles_per_town.max(),
            colorbar=dict(
                title="Profiles per town"
            )
        ))]

layout = dict(
        title = 'adam4adam profiles in Massachusetts<br>(Hover for town names)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )

fig = dict( data=data, layout=layout )
# NOTE(review): the filename is left over from the airport example; rename it
# once it is confirmed that nothing links to the existing plot.
py.iplot( fig, validate=False, filename='d3-airports' )