# Surveillance survey indexes

TODO:
1. Load data from CSV into DataFrame
2. Visualize data statistics

In [1]:
# Notebook-wide setup: display tweaks, plotting backends and shared imports.

# Inject CSS so the notebook container spans the full browser width.
# NOTE(review): newer IPython releases expose `display`/`HTML` from
# `IPython.display`; confirm this import path before upgrading IPython.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Echo every top-level expression of a cell, not only the last one.
# Cells below rely on this to show both `head()` and `tail()` outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Plotly (offline mode). The wildcard import supplies the bare names
# `Data`, `Scattermapbox`, `Marker` and `Layout` used by the map cell;
# the other cells go through the `go` alias.
from plotly.graph_objs import *
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Matplotlib is only used for colour normalisation / colormaps in the map cell.
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from collections import Counter
import numpy as np
import pandas as pd
import os

## 1. Load data from CSV into DataFrame

In [2]:
# Build the survey frame in a single pipeline:
#   1. read the raw larval-survey CSV,
#   2. treat zeros as missing and drop every row with any missing value,
#   3. keep only the rows of one province, then drop the province column,
#   4. parse the 'YYYY-MM' date strings and use them as a sorted index.
df = (
    pd.read_csv('../data/breeding-sites/larval-survey.csv')
    .replace(0, np.nan)
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
    .loc[lambda frame: frame['province'] == 'นครศรีธรรมราช']
    .drop('province', axis=1)
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m'))
    .set_index('date')
    .sort_index()
)
# Optional restriction to a year range (left disabled):
# df = df['2015':'2018']

# Both previews are rendered because ast_node_interactivity is set to "all".
df.head()
df.tail()
print('Total:', len(df))

Unnamed: 0_level_0,district,subdist,village,hi,ci,bi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-02-01,ทุ่งสง,นาหลวงเสน,ประดู่,25.0,2.77,25.0
2011-02-01,ฉวาง,นาแว,ทุ่งกระจูด,15.0,3.06,15.0
2011-02-01,ฉวาง,นาแว,ควนสวรรค์,17.5,2.86,17.5
2011-02-01,ฉวาง,นาแว,ควนยูง,22.5,3.35,22.5
2011-02-01,ทุ่งสง,นาไม้ไผ่,บอมอง,20.0,2.91,22.5


Unnamed: 0_level_0,district,subdist,village,hi,ci,bi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-06-01,ร่อนพิบูลย์,หินตก,ปลายราง,10.0,4.33,37.5
2017-06-01,ร่อนพิบูลย์,หินตก,พรุชิง,11.25,4.99,45.0
2017-06-01,ร่อนพิบูลย์,หินตก,คีรีใหม่,10.0,3.98,36.25
2017-06-01,บางขัน,บางขัน,พรุเตย,10.0,1.61,10.0
2017-06-01,ฉวาง,ห้วยปริก,ปากกา,17.0,3.18,17.0


Total: 873


##  2. Visualize data statistics

In [3]:
# Bar chart of how many survey records fall in each calendar year.
count = dict(Counter(df.index.year))
key = list(count.keys())      # years, in order of first appearance in the index
val = list(count.values())    # number of records for the matching year

trace_bar_actual = go.Bar(
    x=key,
    y=val,
    text=val,                 # print the count on each bar
    textposition='auto',
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.8,
)
layout = go.Layout(
    title='Data Points for each Year',
    height=550,
    width=750,
    yaxis=dict(title='Frequency'),
    xaxis=dict(title='Year'),
)
fig = go.Figure(data=[trace_bar_actual], layout=layout)
iplot(fig)

In [4]:
# Bar chart of how many survey records fall in each calendar month
# (all years pooled together).
count = dict(Counter(df.index.month))
key = list(count.keys())      # month numbers, in order of first appearance
val = list(count.values())    # number of records for the matching month

trace_bar_actual = go.Bar(
    x=key,
    y=val,
    text=val,                 # print the count on each bar
    textposition='auto',
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.8,
)
layout = go.Layout(
    title='Data Points for each Month',
    height=550,
    width=750,
    yaxis=dict(title='Frequency'),
    xaxis=dict(title='month'),
)
fig = go.Figure(data=[trace_bar_actual], layout=layout)
iplot(fig)

In [5]:
# One box per subdistrict showing the distribution of its `bi` values.
# `boxmean=True` additionally draws the mean of each box.
# NOTE(review): `bi` is presumably the Breteau Index of the larval survey;
# confirm and spell it out in the labels if so.
data = []
# sort=False keeps subdistricts in order of first appearance, matching
# what iterating over `df['subdist'].unique()` would produce.
for subdist, group in df.groupby('subdist', sort=False):
    data.append(
        go.Box(
            y=group['bi'].values,
            name=subdist,
            boxmean=True,
        )
    )

# The previous title/axis labels were template leftovers ("Frequency" /
# "Year"); the plot actually shows BI values (y) per subdistrict (x).
layout = go.Layout(
    title='BI Distribution for each Subdistrict',
    height=600,
    yaxis=dict(title='BI'),
    xaxis=dict(title='Subdistrict'),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [6]:
# Number of survey records per subdistrict, as a two-column frame
# ('subdist', 'freq'). sort=False keeps subdistricts in order of first
# appearance; groupby/size replaces the manual filter-and-count loop and
# also copes with an empty frame.
arr = (
    df.groupby('subdist', sort=False)
    .size()
    .reset_index(name='freq')
)
# arr = arr.sort_values('freq', ascending=0)

trace = go.Bar(
    x=arr['subdist'],
    y=arr['freq'],
    text=arr['freq'],         # print the count on each bar
    textposition='auto',
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.8,
)
layout = go.Layout(
    title='Data Points for each Subdistrict',
    height=550,
    width=1700,
    yaxis=dict(title='Frequency'),
    # The x axis lists subdistricts; it was previously mislabelled 'Year'.
    xaxis=dict(title='Subdistrict'),
)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [7]:
# Per subdistrict, keep only the rows whose `bi` lies within one standard
# deviation of that subdistrict's mean `bi`, then stack the survivors.
# Groups are visited in order of first appearance (sort=False).
# A subdistrict with a single row has an undefined (NaN) standard deviation,
# so its comparison is False and the row is dropped.
df_filtered = pd.concat(
    [
        group[(group['bi'] - group['bi'].mean()).abs() <= group['bi'].std()]
        for _, group in df.groupby('subdist', sort=False)
    ],
    axis=0,
)
df_filtered.head()
df_filtered.tail()

Unnamed: 0_level_0,district,subdist,village,hi,ci,bi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-02-01,ทุ่งสง,นาหลวงเสน,ประดู่,25.0,2.77,25.0
2011-07-01,ทุ่งสง,นาหลวงเสน,คอกช้าง,30.0,6.65,62.5
2012-02-01,ทุ่งสง,นาหลวงเสน,ใต้,18.0,4.59,30.0
2012-05-01,ทุ่งสง,นาหลวงเสน,ใต้,12.0,3.54,23.0
2012-08-01,ทุ่งสง,นาหลวงเสน,นาหลวงเสน,16.0,2.76,33.0


Unnamed: 0_level_0,district,subdist,village,hi,ci,bi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-03-01,ทุ่งสง,นาโพธิ์,นาโพธิ์,22.5,5.65,32.5
2017-03-01,ทุ่งสง,นาโพธิ์,บนควน,32.5,6.93,35.0
2017-06-01,ทุ่งสง,นาโพธิ์,เกาะปราง,15.0,8.95,42.5
2017-02-01,พรหมคีรี,อินคีรี,น้ำแคบ,12.5,7.02,52.5
2017-04-01,พรหมคีรี,อินคีรี,น้ำแคบ,10.0,6.51,47.5


In [8]:
# df_meta = []
# DIR = '../tensorflow/object_detection/GSV/นครศรีธรรมราช/'
# districts = os.listdir(DIR)
# for district in districts:
#     subdists = os.listdir(os.path.join(DIR, district))
    
#     for subdist in subdists:     
#         path = os.path.join(DIR, district, subdist, 'original')
#         if not os.path.exists(path):
#             continue
        
#         files = os.listdir(path)
#         for file in files:
#             file = file[:-4]
#             lat, lng, degree, date = file.split('_')
#             df_meta.append([lat, lng, degree, date])
            
# df_meta = pd.DataFrame.from_records(df_meta)
# df_meta.columns = ['lat', 'lng', 'degree', 'date']
# df_meta['date'] = pd.to_datetime(df_meta['date'], format='%Y-%m')

In [9]:
# Load the image metadata (lat, lng, degree, date) and index it by date.
df_meta = pd.read_csv('../data/gsv_meta.csv')
df_meta['date'] = pd.to_datetime(df_meta['date'], format='%Y-%m')
df_meta = df_meta.set_index('date')
df_meta = df_meta.sort_index()

# Keep only the rows dated 2016. Row selection by partial date string must go
# through `.loc`: `df_meta['2016']` is treated as a column lookup by current
# pandas (the row-selection shortcut was deprecated and then removed).
df_meta = df_meta.loc['2016']
df_meta.head()
len(df_meta)

Unnamed: 0_level_0,lat,lng,degree
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01,9.116177,99.670922,72
2016-01-01,9.281042,99.733532,0
2016-01-01,9.312261,99.710716,216
2016-01-01,9.125009,99.731638,72
2016-01-01,9.105161,99.716666,216


957717

In [10]:
# Three-letter month labels; index 0 is January, so use month[number - 1].
month = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()

In [11]:
# Bar chart of how many `df_meta` rows fall in each calendar month.
count = dict(Counter(df_meta.index.month))

# The x axis is categorical (month names), so bars are drawn in list order.
# Emit the months in calendar order explicitly instead of relying on the
# order in which they happen to appear in the index.
month_numbers = sorted(count)
key = [month[k - 1] for k in month_numbers]
val = [count[k] for k in month_numbers]

trace_bar_actual = go.Bar(
    x=key,
    y=val,
    text=val,                 # print the count on each bar
    textposition='auto',
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.8,
)
# Title and x-axis label previously said "Year" although the bars are months.
layout = go.Layout(
    title='Data Points for each Month',
    height=550,
    yaxis=dict(title='Frequency'),
    xaxis=dict(title='Month'),
)
fig = go.Figure(data=[trace_bar_actual], layout=layout)
iplot(fig)

In [12]:
import json
import urllib.request

# GeoJSON file for the province, hosted on GitHub. The file name is the
# percent-encoded province name; the pieces below concatenate to one URL.
URL = (
    'https://raw.githubusercontent.com/pcrete/Mosquito_Breeding_Sites_Detector/'
    'master/geojson/province/'
    '%E0%B8%99%E0%B8%84%E0%B8%A3%E0%B8%A8%E0%B8%A3%E0%B8%B5'
    '%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%A3%E0%B8%B2%E0%B8%8A'
    '.geojson'
)

# Download the document and parse it into a plain dict; later code reads its
# 'features' list.
with urllib.request.urlopen(URL) as url:
    geojson_text = url.read().decode()
data_polygon = json.loads(geojson_text)

In [13]:
# Choropleth of the mean `bi` per subdistrict, drawn as filled GeoJSON layers
# on a Mapbox base map.

# Prefer a token supplied through the MAPBOX_ACCESS_TOKEN environment variable
# so the credential does not have to live in the notebook. The literal is kept
# only as a backward-compatible fallback.
# NOTE(review): a token committed to source control should be rotated.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoiYWxpc2hvYmVpcmkiLCJhIjoiY2ozYnM3YTUxMDAxeDMzcGNjbmZyMmplZiJ9.ZjmQ0C2MNs1AzEBC_Syadg',
)

mean, sd = df.bi.mean(), df.bi.std()
print(mean, sd)

# Colour scale spans one standard deviation either side of the overall mean.
norm = mpl.colors.Normalize(vmin=mean-sd, vmax=mean+sd)
cmap = cm.Blues
# The mapper does not depend on the feature, so build it once.
m = cm.ScalarMappable(norm=norm, cmap=cmap)

# Mean `bi` for every (district, subdistrict) pair present in the survey,
# computed once instead of re-filtering the frame for every polygon.
bi_by_area = df.groupby(['district', 'subdist'])['bi'].mean().to_dict()

# Fill used for subdistricts without any survey rows. Without this branch the
# NaN mean maps to the colormap's "bad" colour (0, 0, 0) and, with alpha
# forced to 1.0, those areas were painted solid black.
NO_DATA_COLOR = 'rgba(200,200,200,0.4)'

polygons = []
for feature in data_polygon['features']:
    prop = feature['properties']
    district = prop['AP_TN']
    subdist = prop['TB_TN']

    value = bi_by_area.get((district, subdist))
    if value is None or np.isnan(value):
        rgba = NO_DATA_COLOR
    else:
        r, g, b, _ = m.to_rgba(value)
        # Opaque fill: the colormap's own alpha is replaced by 1.0.
        rgba = 'rgba({},{},{},{})'.format(int(r*255), int(g*255), int(b*255), 1.0)

    polygons.append(
        dict(
            sourcetype='geojson',
            source=feature,
            type='fill',
            color=rgba,
        )
    )

# Invisible placeholder trace (marker size 0); presumably present so that the
# mapbox subplot, and with it the polygon layers, gets drawn.
# `lat`/`lon` are array properties, so they are passed as lists, and the
# deprecated `Data` / `Marker` wrappers are replaced by a list and a dict.
data = [
    go.Scattermapbox(
        lat=[1],
        lon=[1],
        mode='markers',
        marker=dict(size=0),
        text=['Montreal'],
    )
]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    width=1500,
    height=800,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Centre on the first metadata row. `.iloc[0]` is explicitly
        # positional; plain `[0]` on a date-indexed Series relies on a
        # deprecated positional fallback.
        center=dict(
            lat=df_meta.lat.iloc[0],
            lon=df_meta.lng.iloc[0],
        ),
        pitch=0,
        zoom=8,
        style='light', # dark,satellite,streets,light
        layers=polygons,
    ),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='Montreal Mapbox')

28.453699885452462 31.437258193014088
