<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Get-map-data-for-JS-map" data-toc-modified-id="Get-map-data-for-JS-map-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Get map data for JS map</a></span></li></ul></div>

# Get map data for JS map

In [1]:
import collections
from json import dumps as json_dumps  # aliased: a later cell binds the name `json` to a list

import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated dump of individual sample records.
individuals_path = "../mediaflux_query_dump/output/individuals_2018-05-30.tsv"
data = pd.read_csv(individuals_path, sep="\t")

In [3]:
# List the available columns so the ones needed for the map can be picked out.
data.columns

Index([u'KDR_assay', u'asset_id', u'barcode_reference_list',
       u'common_name_species', u'country', u'date_collection',
       u'date_current_update', u'dev_stage_collected', u'dev_stage_stored',
       u'dna_tube1_conc', u'dna_tube1_label', u'dna_tube1_location',
       u'dna_tube2_conc', u'dna_tube2_label', u'dna_tube2_location',
       u'individual_code', u'individual_code_alias', u'individual_description',
       u'initial_storage_solution', u'isolate', u'isolation_source',
       u'latitude', u'longitude', u'mol_type', u'p1_barcode_code_sequence',
       u'p2_barcode_code_sequence', u'person_collection',
       u'person_current_update', u'person_identified', u'publication_id',
       u'radseq_library_alias', u'radseq_library_name',
       u'raw_sequence_filename', u'sampling_scheme', u'sampling_type',
       u'scientific_name_species', u'sex', u'specific_location', u'subregion',
       u'wolbachia_assay'],
      dtype='object')

In [4]:
# Keep only the columns the map needs; copy so later edits do not touch `data`.
map_columns = [
    "asset_id",
    "common_name_species",
    "country",
    "specific_location",
    "latitude",
    "longitude",
]
df = data[map_columns].copy()

In [5]:
# Collect the records where either coordinate is absent.
latitude_is_nan = np.isnan(df["latitude"])
longitude_is_nan = np.isnan(df["longitude"])
missing_latlong = df[latitude_is_nan | longitude_is_nan]
missing_latlong.head()

Unnamed: 0,asset_id,common_name_species,country,specific_location,latitude,longitude
333,35547183,Yellow Fever Mosquito,australia,,,
334,35547201,Yellow Fever Mosquito,australia,,,
335,35547202,Yellow Fever Mosquito,australia,,,
336,35547203,Yellow Fever Mosquito,australia,,,
337,35547204,Yellow Fever Mosquito,australia,,,


In [6]:
# Count the records without coordinates per country.
countries_without_coords = missing_latlong["country"]
collections.Counter(countries_without_coords)

Counter({'australia': 67, 'brazil': 50, 'vietnam': 99})

In [7]:
# Count the records without coordinates per specific location.
locations_without_coords = missing_latlong["specific_location"]
collections.Counter(locations_without_coords)

Counter({nan: 216})

In [8]:
# For records with missing lat/long, fall back to the country-level
# latitude/longitude values published at:
# https://developers.google.com/public-data/docs/canonical/countries_csv

In [9]:
# Load the country-level coordinate table (columns: country, latitude, longitude, name).
countries_csv_path = "countries.csv"
countries_latlong = pd.read_csv(countries_csv_path)
countries_latlong.head()

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [10]:
# Lower-case the country names so they match the lower-case country values
# seen in the records that lack coordinates.
# The vectorised .str accessor leaves a missing name as NaN instead of raising
# AttributeError, which the per-element x.lower() call would do.
countries_latlong["name"] = countries_latlong["name"].str.lower()

In [11]:
# Report, for each country that has records without coordinates, whether the
# country-level lookup table contains an entry for it.
countries = collections.Counter(missing_latlong["country"]).keys()
known_names = list(countries_latlong["name"])  # built once rather than per iteration
for country in countries:
    suffix = ": OK" if country in known_names else ": MISSING"
    print(country + suffix)

brazil: OK
australia: OK
vietnam: OK


In [12]:
# Replace missing coordinates with the country-level coordinates from
# `countries_latlong`, matching on the exact country name.
#
# As before, a record that lacks either coordinate gets BOTH replaced, and the
# first lookup row wins when a name occurs more than once.

# One row per country name, indexed by name, so it can be used as a mapping.
country_coords = (
    countries_latlong
    .dropna(subset=["name"])
    .drop_duplicates(subset="name")
    .set_index("name")
)

needs_fill = df["latitude"].isnull() | df["longitude"].isnull()

if needs_fill.any():
    fill_countries = df.loc[needs_fill, "country"]

    # Fail with a message that names the offending countries, rather than an
    # opaque IndexError from indexing an empty selection.
    unmatched = fill_countries[~fill_countries.isin(country_coords.index)]
    if len(unmatched) > 0:
        raise KeyError(
            "No country-level coordinates available for: {}".format(
                list(unmatched.unique())
            )
        )

    # .values assigns positionally, so this also works if df's index has duplicates.
    df.loc[needs_fill, "latitude"] = fill_countries.map(country_coords["latitude"]).values
    df.loc[needs_fill, "longitude"] = fill_countries.map(country_coords["longitude"]).values

In [13]:
# Build the bracketed asset id, e.g. "(35280960)", used in the map popup text.
wrap_in_parens = "({})".format
df["asset_id_str"] = df["asset_id"].map(wrap_in_parens)

In [14]:
# Preview the first rows to check the new asset_id_str column.
df.head(5)

Unnamed: 0,asset_id,common_name_species,country,specific_location,latitude,longitude,asset_id_str
0,35280960,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35280960)
1,35281030,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35281030)
2,35281033,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35281033)
3,35281798,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35281798)
4,35282328,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35282328)


In [15]:
# Popup text: species common name followed by the bracketed asset id.
species_names = df["common_name_species"]
bracketed_ids = df["asset_id_str"]
df["text"] = species_names + " " + bracketed_ids

In [16]:
# Preview the first rows to check the new text column.
df.head(5)

Unnamed: 0,asset_id,common_name_species,country,specific_location,latitude,longitude,asset_id_str,text
0,35280960,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35280960),Asian Tiger Mosquito (35280960)
1,35281030,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35281030),Asian Tiger Mosquito (35281030)
2,35281033,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35281033),Asian Tiger Mosquito (35281033)
3,35281798,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35281798),Asian Tiger Mosquito (35281798)
4,35282328,Asian Tiger Mosquito,China,Jiuwei Village,23.106714,113.445278,(35282328),Asian Tiger Mosquito (35282328)


In [17]:
# Coarsen both coordinates to three decimal places: some of the original
# coordinates pinpoint particular buildings, and rounding preserves some anonymity.
for coord_column in ("latitude", "longitude"):
    df[coord_column] = [round(value, 3) for value in df[coord_column]]

In [18]:
# Preview the first rows to check the rounded coordinates.
df.head(5)

Unnamed: 0,asset_id,common_name_species,country,specific_location,latitude,longitude,asset_id_str,text
0,35280960,Asian Tiger Mosquito,China,Jiuwei Village,23.107,113.445,(35280960),Asian Tiger Mosquito (35280960)
1,35281030,Asian Tiger Mosquito,China,Jiuwei Village,23.107,113.445,(35281030),Asian Tiger Mosquito (35281030)
2,35281033,Asian Tiger Mosquito,China,Jiuwei Village,23.107,113.445,(35281033),Asian Tiger Mosquito (35281033)
3,35281798,Asian Tiger Mosquito,China,Jiuwei Village,23.107,113.445,(35281798),Asian Tiger Mosquito (35281798)
4,35282328,Asian Tiger Mosquito,China,Jiuwei Village,23.107,113.445,(35282328),Asian Tiger Mosquito (35282328)


In [19]:
# Convert to list of objects
# Each record becomes one JavaScript object literal of the form
#   {"latlng":[<lat>,<lng>],"text":"<popup text>"}
# with both coordinates printed to three decimal places.
#
# The popup text is emitted through json_dumps so that quotes, backslashes and
# control characters are escaped and the literal stays valid JavaScript.
# "%s" % (text,) reproduces the string conversion the previous "%s" placeholder
# applied to non-string values. Plain ASCII text yields exactly the same
# literal as before; non-ASCII characters are written as \uXXXX escapes, which
# JavaScript reads back as the same characters.
#
# Iterating the columns with zip avoids per-row df.loc lookups, which are slow
# and return a Series (breaking the format) if the index has duplicates.
json = [
    '{"latlng":[%.03f,%.03f],"text":%s}' % (lat, lng, json_dumps("%s" % (text,)))
    for lat, lng, text in zip(df["latitude"], df["longitude"], df["text"])
]

In [20]:
# Preview the first five generated object strings.
json[0:5]

['{"latlng":[23.107,113.445],"text":"Asian Tiger Mosquito (35280960)"}',
 '{"latlng":[23.107,113.445],"text":"Asian Tiger Mosquito (35281030)"}',
 '{"latlng":[23.107,113.445],"text":"Asian Tiger Mosquito (35281033)"}',
 '{"latlng":[23.107,113.445],"text":"Asian Tiger Mosquito (35281798)"}',
 '{"latlng":[23.107,113.445],"text":"Asian Tiger Mosquito (35282328)"}']

In [21]:
# Join the object strings into one JavaScript statement that assigns the
# array to `sampleData`.
joined_records = ", ".join(json)
js_string_array = "var sampleData = [{}]".format(joined_records)

In [22]:
# js_string_array

In [23]:
# Write the popup text and coordinates to the JSON file read by the map.
export_columns = ["text", "latitude", "longitude"]
df[export_columns].to_json("resources/data/data.json")