## Notebook exploring the geodata extracted from adam4adam profiles based in MA
### by Mary Ruth Ngo under supervision of Professor Octavio Gonzalez

1. Using geopy, I can convert the town names I scraped into latitude/longitude values.

In [2]:
!pip install geopy



In [4]:
from geopy.geocoders import Nominatim

# Nominatim's usage policy asks every client to identify itself, and newer
# geopy releases refuse to run with the default agent, so pass an explicit
# user_agent instead of relying on the library default.
geolocator = Nominatim(user_agent="adam4adam-ma-geodata-notebook")

# Smoke test: geocode one well-known place and inspect what comes back.
location = geolocator.geocode("Chicago Illinois")
print(location.raw)
help(location)

{u'display_name': u'Chicago, Cook County, Illinois, United States of America', u'importance': 1.0026476104889, u'place_id': u'143390527', u'lon': u'-87.6244211', u'lat': u'41.8755546', u'osm_type': u'relation', u'licence': u'Data \xa9 OpenStreetMap contributors, ODbL 1.0. http://www.openstreetmap.org/copyright', u'osm_id': u'122604', u'boundingbox': [u'41.643919', u'42.0230219', u'-87.9401009', u'-87.523984'], u'type': u'city', u'class': u'place', u'icon': u'https://nominatim.openstreetmap.org/images/mapicons/poi_place_city.p.20.png'}
Help on Location in module geopy.location object:

class Location(__builtin__.object)
 |  Contains a parsed geocoder response. Can be iterated over as
 |  (location<String>, (latitude<float>, longitude<Float)). Or one can access
 |  the properties `address`, `latitude`, `longitude`, or `raw`. The last
 |  is a dictionary of the geocoder's response for this item.
 |  
 |  .. versionadded:: 0.98
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |

In [5]:
import pandas as pd
import plotly

The CSV file `ma_plus_race.csv` stores the profile data (last updated to include a section on race/ethnicity for each profile).

In [6]:
# DataFrame.from_csv is deprecated (and removed in later pandas releases).
# read_csv with index_col=0 and parse_dates=True reproduces its defaults.
ma = pd.read_csv("ma_plus_race.csv", index_col=0, parse_dates=True)

In [7]:
# Distinct town values that have no ", " separator, each suffixed with the
# state name so the geocoder is given "<town> Massachusetts".
single_name_towns = set()
for town in ma["town"].values.tolist():
    town_text = str(town)
    if town_text == "nan" or len(town_text.split(", ")) != 1:
        continue
    single_name_towns.add(town + " Massachusetts")
one_city_list = list(single_name_towns)

In [8]:
# Geocode every single-name town. lats and lons stay index-aligned with
# one_city_list.
lats = []
lons = []
for i in one_city_list:
    location = geolocator.geocode(i, timeout=10)
    if location is None:
        # geocode() returns None when nothing matches; report the name and
        # keep a placeholder so the lists stay aligned instead of crashing
        # on location.latitude.
        print(i)
        lats.append(None)
        lons.append(None)
        continue
    lats.append(location.latitude)
    lons.append(location.longitude)

In [9]:
# Distinct town values made of exactly two ", "-separated parts, each
# suffixed with the state name.
two_part_towns = set()
for town in ma["town"].values.tolist():
    town_text = str(town)
    if town_text == "nan" or len(town_text.split(", ")) != 2:
        continue
    two_part_towns.add(town + " Massachusetts")
two_city_list = list(two_part_towns)

In [10]:
# Drop the "Other ..." placeholder entries. A new list is built because
# calling remove() on a list while iterating over it skips the element right
# after each removal, so back-to-back "Other" entries would survive.
two_city_list = [i for i in two_city_list if "Other" not in i]

In [11]:
# Keep only the part after the ", " separator. Every entry has exactly two
# parts at this point, so [-1] selects the same element as [1] did; unlike
# [1], it does not raise IndexError if this cell is run a second time on the
# already-shortened names.
two_city_list = [i.split(", ")[-1] for i in two_city_list]

In [12]:
# Concatenate the single-name towns with the town parts of the two-part
# entries; the bare len(...) is shown as the cell output.
all_cities = one_city_list + two_city_list
len(all_cities)

134

**Edge cases:**

In [13]:
# Rewrite the few names the geocoder needs spelled differently and drop the
# "Brickell" entry. The result is collected in a fresh list: removing from
# all_cities while enumerating it would shift the remaining items and skip
# the element that follows the removed one.
cleaned_cities = []
for v in all_cities:
    if "Leather" in v:
        cleaned_cities.append("Leather District, Boston")
    elif "Bay Village" in v:
        cleaned_cities.append("Bay Village")
    elif "Brickell" in v:
        continue
    elif "Boston Airport" in v:
        cleaned_cities.append("Boston Logan International Airport, Boston")
    else:
        cleaned_cities.append(v)
# Slice-assign so all_cities is still updated in place, as before.
all_cities[:] = cleaned_cities

In [134]:
# Append the state name to every area label.
# NOTE(review): new_areas is only assigned in a cell that appears further
# down this notebook, so on a fresh top-to-bottom run this line raises
# NameError. Re-running this cell also appends the suffix a second time.
new_areas = [i + " Massachusetts" for i in new_areas]

In [136]:
# Full list of names to geocode: towns plus the state-suffixed area labels.
everything = all_cities + new_areas

Store conversion data in dictionary and pandas frame

In [137]:
# Map each place name to {"lat": ..., "lon": ...}.
geo_dict = {}
for i in everything:
    location = geolocator.geocode(i, timeout=10)
    if location is None:
        # No match: report the name and skip it. Falling through would store
        # the previous iteration's coordinates under this name (or raise
        # NameError if the very first lookup failed).
        print(i)
        continue
    geo_dict[i] = {"lat": location.latitude, "lon": location.longitude}

Boston Logan International Airport, Boston
Leather District, Boston


In [138]:
# Hand-entered coordinates for the two names the geocoding loop printed as
# unresolved. They are stored under shorter keys than the query strings.
geo_dict["Boston Logan Airport"] = {"lat": 42.366828, "lon": -71.027330}
geo_dict["Leather District"] = {"lat": 42.350807, "lon": -71.057969}

In [139]:
# Transpose so each place name becomes a row with "lat" and "lon" columns.
geo_frame = pd.DataFrame(geo_dict).T

In [140]:
# Display the place names that ended up in the frame.
geo_frame.index

Index([u'Abington Massachusetts', u'Acton Massachusetts',
       u'Acushnet Massachusetts', u'Adams Massachusetts',
       u'Agawam Massachusetts', u'Allston/Brighton Massachusetts',
       u'Amesbury Massachusetts', u'Amherst Center Massachusetts',
       u'Amherst Massachusetts', u'Andover Massachusetts',
       ...
       u'Williamstown Massachusetts', u'Wilmington Massachusetts',
       u'Winchendon Massachusetts', u'Winchester Massachusetts',
       u'Woburn Massachusetts', u'Worcester Massachusetts',
       u'Wrentham Massachusetts', u'Yarmouth Massachusetts',
       u'boxborough Massachusetts', u'winthrop Massachusetts'],
      dtype='object', length=279)

Use the conversion dictionary to add a latitude and longitude value to each profile in the pandas DataFrame.

In [18]:
# Build three index-aligned lists (cleaned town label, latitude, longitude)
# with one entry per row of ma["town"]. Rows that cannot be resolved get the
# string "nan" in all three lists.
clean_towns = []
lats_list = []
lons_list = []
ma_city_values = [str(i) for i in ma["town"].values.tolist()]

# Hand-entered coordinates for places that are not keyed in geo_dict as
# "<name> Massachusetts": (substring to match, label, latitude, longitude).
# They are checked in this order.
manual_places = [
    ("Bay Village", "Bay Village", 42.349176, -71.069591),
    ("Leather", "Leather District", 42.350807, -71.057969),
    ("Airport", "Boston Logan Airport", 42.366828, -71.027330),
]

for i in ma_city_values:
    town, lat, lon = "nan", "nan", "nan"
    if i != "nan" and ":" not in i:
        # Entries with a ", " separator keep their second component. The test
        # uses the same ", " delimiter as the split, so a comma without a
        # following space no longer raises IndexError.
        val = i.split(", ")[1] if ", " in i else i
        key = val + " Massachusetts"
        try:
            # Resolve the entry once and only then commit all three values,
            # so a partial entry cannot leave the lists misaligned.
            coords = geo_dict[key]
            town, lat, lon = key, coords["lat"], coords["lon"]
        except KeyError:
            for marker, label, manual_lat, manual_lon in manual_places:
                if marker in val:
                    town, lat, lon = label, manual_lat, manual_lon
                    break
    clean_towns.append(town)
    lats_list.append(lat)
    lons_list.append(lon)

In [19]:
# Attach the cleaned labels and coordinates as new columns. Unresolved rows
# carry the string "nan", not a float NaN. The bare len(...) is the cell
# output.
ma["towns"] = clean_towns
ma["lat"] = lats_list
ma["lon"] = lons_list
len(ma["towns"])

22805

In [20]:
from itertools import groupby
# Lengths of runs of identical consecutive labels. itertools.groupby only
# merges adjacent equal items, so on the unsorted clean_towns these are run
# lengths, not per-town totals.
# NOTE(review): `lengtsh` (presumably a typo for "lengths") is not referenced
# by any other visible cell.
lengtsh = [len(list(group)) for key, group in groupby(clean_towns)]

**Check out the frequency breakdown for users around the state**

In [21]:
import collections

# Tally how many profiles fall under each cleaned town label and keep the
# 134 most common (label, count) pairs.
counter = collections.Counter(clean_towns)
frequencies = counter.most_common(134)

In [22]:
# Spot-check one entry of the conversion dictionary.
geo_dict[u'Westborough Massachusetts']['lat']

42.2694308

In [23]:
# Turn the (label, count) pairs into [lat, lon, count] rows. Labels without
# coordinates in geo_dict (for example the "nan" placeholder) are printed and
# left out.
freq = []
for v in frequencies:
    town, count = v
    try:
        coords = geo_dict[town]
        freq.append([coords["lat"], coords["lon"], count])
    except KeyError:
        # Function-call form of print, matching the other cells. With a
        # single tuple argument the output is the same as `print v`.
        print(v)

('nan', 9216)


In [24]:
# Cleaned town labels of the profiles whose hiv_status is "HIV Negative, on PrEP".
on_prep_rows = ma["hiv_status"] == "HIV Negative, on PrEP"
prep_towns = ma.loc[on_prep_rows, "towns"].values.tolist()

In [88]:
# Area labels of the profiles whose hiv_status is "HIV Negative, on PrEP".
on_prep_rows = ma["hiv_status"] == "HIV Negative, on PrEP"
prep_areas = ma.loc[on_prep_rows, "area"].values.tolist()

In [83]:
geolocater = []
# Element-wise "<towns>, <area>" string for every profile row.
town_and_area = ma["towns"] + ", " + ma["area"]
together = town_and_area

In [84]:
# Peek at the first ten combined "towns, area" strings.
for sample in together[0:10]:
    print(sample)

nan, Other in: Massachusetts
Malden Massachusetts, Boston Metro
Framingham Massachusetts, MetroWest
Cambridge Massachusetts, Boston Metro
nan, Boston Metro
North Andover Massachusetts, Merrimack Valley
Wellfleet Massachusetts, Cape Cod
Dorchester Massachusetts, Boston Metro
Lynn Massachusetts, Boston Metro
Quincy Massachusetts, Boston Metro


In [108]:
# For every profile choose the most specific usable label: the town when it
# is known, otherwise the area unless the area is an "Other ..." bucket.
# new_areas collects each distinct area label used as such a fallback, in
# first-seen order.
geo = []
new_areas = []
for ind, i in enumerate(together):
    try:
        splt = i.split(", ")
    except AttributeError:
        # Non-string element (the recorded output shows one printed as
        # "nan"). Only this expected failure is caught now; the former bare
        # `except:` hid every other error as well.
        print("%s %s" % (ind, i))
        geo.append("nan")
        continue
    if splt[0] != "nan":
        geo.append(splt[0])
    elif "Other" in splt[1]:
        geo.append("nan")
    else:
        geo.append(splt[1])
        if splt[1] not in new_areas:
            new_areas.append(splt[1])
print(new_areas)

11170 nan
['Boston Metro', 'Nantucket', 'Chicopee', 'New Bedford', 'Worcester', 'Bedford', 'Gardner', 'Westport', 'Leominster', 'Springfield', 'Amherst', 'South Yarmouth', 'Dartmouth', 'Plymouth', 'Whitinsville', 'Holyoke', 'Brockton', 'Danvers', 'Somerset', 'Bridgewater', 'North Adams', 'Abington', 'Randolph', 'Leicester', 'Pembroke', 'Clinton', 'Westfield', 'Fitchburg', 'Topsfield', 'Needham', 'Merrimack Valley', 'Agawam', 'Lynnfield', 'Northampton', 'Pittsfield', 'Rockport', 'Great Barrington', 'Fall River', 'North Attleborough', 'Taunton', 'Rutland', 'Wareham', 'Douglas', 'Burlington', 'Attleboro', 'West Springfield', 'Ware', 'Sturbridge', 'MetroWest', 'Shirley', 'Canton', 'Lunenburg', 'Reading', 'Ayer', 'Ipswich', 'Dighton', 'Auburn', 'Williamstown', 'Norwood', 'Millbury', 'Belchertown', 'Wilbraham', 'Southbridge', 'Greenfield', 'Rockland', 'Rehoboth', 'Easton', 'Marshfield', 'Ludlow', 'Norwell', 'Holbrook', 'Stoughton', 'Middleton', 'Fairhaven', 'Palmer', 'Freetown', 'Lancaster',

In [155]:
# One-element list wrapping the area column with " Massachusetts" appended.
# NOTE(review): add_mass is not referenced by any other visible cell, and the
# recorded output below does not appear to come from this statement.
add_mass = [ma["area"] + " Massachusetts"]

<type 'str'>


In [156]:
# Rebuild the element-wise "<towns>, <area>" string for every profile row.
together = ma["towns"] + ", " + ma["area"]

In [148]:
# One label per profile: the town part unless it contains "nan", in which
# case the area part is used, or "nan" when the area is an "Other ..." bucket.
# Non-string elements also become "nan".
important_geos = []
for combined in together:
    try:
        parts = combined.split(", ")
    except AttributeError:
        important_geos.append("nan")
        continue
    if "nan" not in parts[0]:
        chosen = parts[0]
    elif "Other" in parts[1]:
        chosen = "nan"
    else:
        chosen = parts[1]
    important_geos.append(chosen)

In [149]:
# Display the per-profile labels. This dumps the whole list (one entry per
# profile row), so a slice would keep the output short.
important_geos

['nan',
 'Malden Massachusetts',
 'Framingham Massachusetts',
 'Cambridge Massachusetts',
 'Boston Metro',
 'North Andover Massachusetts',
 'Wellfleet Massachusetts',
 'Dorchester Massachusetts',
 'Lynn Massachusetts',
 'Quincy Massachusetts',
 'Beacon Hill Massachusetts',
 'Dorchester Massachusetts',
 'Barnstable Massachusetts',
 'Dorchester Massachusetts',
 'Watertown Massachusetts',
 'Nantucket',
 'Nantucket',
 'Chicopee',
 'New Bedford',
 'Worcester',
 'Salem Massachusetts',
 'Methuen Massachusetts',
 'Bedford',
 'South End Massachusetts',
 'Framingham Massachusetts',
 'Cambridge Massachusetts',
 'Downtown Crossing Massachusetts',
 'Boston Metro',
 'Gardner',
 'Boston Metro',
 'South End Massachusetts',
 'Somerville Massachusetts',
 'Lowell Massachusetts',
 'Dorchester Massachusetts',
 'Westport',
 'Leominster',
 'nan',
 'Boston Metro',
 'Back Bay Massachusetts',
 'Springfield',
 'Amherst',
 'Downtown Crossing Massachusetts',
 'Boston Metro',
 'Boston Metro',
 'Boston Massachusetts

In [159]:
# Add coordinates for every distinct place in important_geos that geo_dict
# does not already hold.
# - important_geos has one entry per profile, so only distinct values are
#   looked up, and names already cached are skipped. This avoids thousands
#   of repeated network calls.
# - The " Massachusetts" suffix is added only when missing, so keys match the
#   "<place> Massachusetts" form used elsewhere instead of doubling it.
# - A failed lookup is reported and skipped rather than stored with the
#   previous iteration's coordinates.
for i in sorted(set(important_geos)):
    if i == "nan":
        # Placeholder for an unknown location; there is nothing to geocode.
        continue
    key = i if "Massachusetts" in i else i + " Massachusetts"
    if key in geo_dict:
        continue
    location = geolocator.geocode(key, timeout=10)
    if location is None:
        print(i)
        continue
    geo_dict[key] = {"lat": location.latitude, "lon": location.longitude}

KeyboardInterrupt: 

In [128]:
# Remove every entry whose key lacks the state name. The check is made on the
# key itself; the earlier version tested `i`, a loop variable left over from
# another cell. Iterating over a snapshot of the keys keeps deletion safe.
# Note that hand-entered keys without "Massachusetts" are removed as well.
for key in list(geo_dict):
    if "Massachusetts" not in key:
        del geo_dict[key]

In [178]:
# Normalise each label to the form used as a geo_dict key. A label is kept
# unchanged when it contains any of the markers below; every other label
# gets " Massachusetts" appended.
keep_unchanged_markers = ("Massachusetts", "nan", "Logan", "Leather", "Bay")
updated = [
    place
    if any(marker in place for marker in keep_unchanged_markers)
    else place + " Massachusetts"
    for place in important_geos
]

In [187]:
# Store the normalised per-profile location label as a new column.
ma["geos"] = updated

In [162]:
# Display the full name -> {"lat", "lon"} mapping for inspection.
geo_dict

{'Abington Massachusetts': {'lat': 42.1048228, 'lon': -70.9453217},
 'Acton Massachusetts': {'lat': 42.4850931, 'lon': -71.4328399},
 'Acushnet Massachusetts': {'lat': 41.6806593, 'lon': -70.9078159},
 'Adams Massachusetts': {'lat': 42.6242495, 'lon': -73.1176029},
 'Agawam Massachusetts': {'lat': 42.0695391, 'lon': -72.6148116},
 'Allston/Brighton Massachusetts': {'lat': 42.3547903, 'lon': -71.1181804},
 'Amesbury Massachusetts': {'lat': 42.8579536, 'lon': -70.930092},
 'Amherst Center Massachusetts': {'lat': 42.3583961, 'lon': -71.0956777766393},
 'Amherst Massachusetts': {'lat': 42.3803676, 'lon': -72.5231429},
 'Andover Massachusetts': {'lat': 42.65717, 'lon': -71.1408775},
 'Arlington Massachusetts': {'lat': 42.4153739, 'lon': -71.1564427},
 'Ashland Massachusetts': {'lat': 42.2612067, 'lon': -71.4633955},
 'Athol Massachusetts': {'lat': 42.5959203, 'lon': -72.2267496},
 'Attleboro Massachusetts': {'lat': 41.9445441, 'lon': -71.2856081},
 'Auburn Massachusetts': {'lat': 42.1945385

In [189]:
# Location labels of the profiles whose hiv_status is "HIV Negative, on PrEP".
on_prep_rows = ma["hiv_status"] == "HIV Negative, on PrEP"
prep_geos = ma.loc[on_prep_rows, "geos"]

In [192]:
# Location labels split by the remaining hiv_status values.
hiv_status = ma["hiv_status"]
und_geos = ma.loc[hiv_status == "HIV Undetectable", "geos"]
pos_geos = ma.loc[hiv_status == "HIV Positive", "geos"]
neg_geos = ma.loc[hiv_status == "HIV Negative", "geos"]

In [207]:
def print_geos(geos_list, geo_lookup=None, top_n=300):
    """Count location labels and attach coordinates to the most common ones.

    Parameters
    ----------
    geos_list : iterable of str
        One location label per profile.
    geo_lookup : dict, optional
        Mapping of label -> {"lat": ..., "lon": ...}. Defaults to the
        notebook-level ``geo_dict``.
    top_n : int, optional
        How many of the most frequent labels to keep (default 300).

    Returns
    -------
    list of [lat, lon, count, label]
        Rows ordered from most to least frequent. Labels without an entry in
        the lookup are printed as ``(label, count)`` and left out.
    """
    if geo_lookup is None:
        geo_lookup = geo_dict
    freq = []
    for v in collections.Counter(geos_list).most_common(top_n):
        label, count = v
        try:
            coords = geo_lookup[label]
            freq.append([coords["lat"], coords["lon"], count, label])
        except KeyError:
            # Function-call form of print, matching the other cells. With a
            # single tuple argument the output is the same as `print v`.
            print(v)
    return freq

In [208]:
# Run the tally once and reuse it. Calling print_geos twice repeated the work
# and printed the unmatched labels twice.
neg_freq = print_geos(neg_geos)
print(len(neg_freq))
neg_freq

('nan', 527)
272
('nan', 527)


[[42.3084007, -71.1081289, 1410, 'Boston Metro Massachusetts'],
 [42.3750997, -71.1056156, 803, 'Cambridge Massachusetts'],
 [42.2625932, -71.8022933, 753, 'Worcester Massachusetts'],
 [42.2973205, -71.0744951, 747, 'Dorchester Massachusetts'],
 [42.0959276, -72.5828662, 682, 'South End Massachusetts'],
 [42.1014831, -72.5898109, 631, 'Springfield Massachusetts'],
 [42.3507067, -71.0797296, 519, 'Back Bay Massachusetts'],
 [42.3547903, -71.1181804, 404, 'Allston/Brighton Massachusetts'],
 [42.3875968, -71.0994967, 326, 'Somerville Massachusetts'],
 [42.3098201, -71.1203298, 324, 'Jamaica Plain Massachusetts'],
 [42.058436, -70.1786374, 310, 'Provincetown Massachusetts'],
 [42.3750973, -71.0392172, 269, 'East Boston Massachusetts'],
 [42.3334312, -71.0494948, 267, 'South Boston Massachusetts'],
 [42.6334247, -71.3161717, 262, 'Lowell Massachusetts'],
 [42.34406895, -71.0945223672873, 243, 'Fenway/Kenmore Massachusetts'],
 [42.2528772, -71.0022704, 240, 'Quincy Massachusetts'],
 [41.7010

In [191]:
# Display the current notebook-level `freq`.
# NOTE(review): the recorded output has four-element rows, while the visible
# cell that builds `freq` appends three-element rows. This output presumably
# came from a cell that is no longer in the notebook. TODO confirm.
freq

[[42.3084007, -71.1081289, 45, 'Boston Metro Massachusetts'],
 [42.2973205, -71.0744951, 37, 'Dorchester Massachusetts'],
 [42.0959276, -72.5828662, 16, 'South End Massachusetts'],
 [42.058436, -70.1786374, 13, 'Provincetown Massachusetts'],
 [42.3750997, -71.1056156, 11, 'Cambridge Massachusetts'],
 [42.3098201, -71.1203298, 10, 'Jamaica Plain Massachusetts'],
 [42.3334312, -71.0494948, 10, 'South Boston Massachusetts'],
 [42.3875968, -71.0994967, 9, 'Somerville Massachusetts'],
 [42.3547903, -71.1181804, 8, 'Allston/Brighton Massachusetts'],
 [42.1014831, -72.5898109, 8, 'Springfield Massachusetts'],
 [42.2625932, -71.8022933, 8, 'Worcester Massachusetts'],
 [42.4084302, -71.0119947, 8, 'Revere Massachusetts'],
 [42.3551473, -71.0599538, 6, 'Downtown Crossing Massachusetts'],
 [42.3750973, -71.0392172, 5, 'East Boston Massachusetts'],
 [42.3756401, -71.2358003, 5, 'Waltham Massachusetts'],
 [42.3507067, -71.0797296, 5, 'Back Bay Massachusetts'],
 [42.34406895, -71.0945223672873, 5, '

Use plotly's geoscatter plot to try to visualize that frequency

In [27]:
import os
import plotly.tools as tls

# Credentials are read from the environment so they never live in the
# notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel. The API key that used to be hard-coded here must be treated as
# leaked and regenerated in the plotly account settings.
tls.set_credentials_file(
    username=os.environ["PLOTLY_USERNAME"],
    api_key=os.environ["PLOTLY_API_KEY"],
)

## Biggest issue:
The map data in plotly doesn't scale down to a city- or state-level scope; within the United States it only renders at the state/country level. I must find an alternative.

In [28]:
import plotly.plotly as py
import pandas as pd

# Sample airport-traffic dataset from the plotly example this cell is based
# on. It is still used below for the marker colours.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')
df.head()

# ma['text'] = df['town'] + '' + df['city'] + ', ' + df['state'] + '' + 'Arrivals: ' + df['cnt'].astype(str)

# Colour scale as [position in 0..1, colour] stops.
scl = [ [0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
    [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"] ]

# NOTE(review): lon/lat come from the profile frame `ma` (one point per
# profile, with the string "nan" where no coordinates were found), while the
# marker colours, cmax and the titles still come from the airport dataset
# `df`. The two are unrelated and not the same length. The colour source and
# titles need to be replaced with profile-derived values. TODO
data = [ dict(
        type = 'scattergeo',
        # 'MA-cities' is not a value plotly accepts for locationmode. Points
        # are placed by lon/lat here, so use the valid 'USA-states' setting
        # from the original example.
        locationmode = 'USA-states',
        lon = ma['lon'],
        lat = ma['lat'],
        mode = 'markers',
        marker = dict(
            size = 8,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'circle',
            line = dict(
                width=1,
                # Was 'rgba(102, 102, 102)': an rgba() colour needs a fourth
                # (alpha) component, so the three-component form is written
                # as rgb().
                color='rgb(102, 102, 102)'
            ),
            colorscale = scl,
            cmin = 0,
            color = df['cnt'],
            cmax = df['cnt'].max(),
            colorbar=dict(
                title="Incoming flightsFebruary 2011"
            )
        ))]

layout = dict(
        title = 'Most trafficked US airports<br>(Hover for airport names)',
        colorbar = True,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )

fig = dict( data=data, layout=layout )
# validate=False skips plotly's client-side attribute checks, so invalid
# attribute values are not reported here.
py.iplot( fig, validate=False, filename='d3-airports' )

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~mrngos/0 or inside your plot.ly account where it is named 'd3-airports'
