In [1]:
import pandas as pd
from datetime import date
import numpy as np

import bokeh as bk
from bokeh.plotting import figure
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Slider, TextInput
from ipywidgets import interact
from bokeh.io import push_notebook, output_notebook, show
from bokeh.layouts import gridplot

In [2]:
df = pd.read_csv('derived_data/clean_obesity_risk_factors.csv')
df.head()

Unnamed: 0,yearstart,yearend,locationabbr,locationdesc,datasource,class,topic,question,data_value_unit,data_value_type,...,geolocation,classid,topicid,questionid,datavaluetypeid,locationid,stratificationcategory1,stratification1,stratificationcategoryid1,stratificationid1
0,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Race/Ethnicity,Hispanic,RACE,RACEHIS
1,2014,2014,GU,Guam,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(13.444304, 144.793731)",OWS,OWS1,Q036,VALUE,66,Education,High school graduate,EDU,EDUHSGRAD
2,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q036,VALUE,59,Income,"$50,000 - $74,999",INC,INC5075
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q037,VALUE,59,Income,Data not reported,INC,INCNR
4,2015,2015,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 300 min...,,Value,...,,PA,PA1,Q045,VALUE,59,Income,"Less than $15,000",INC,INCLESS15


In [3]:
df.columns

Index(['yearstart', 'yearend', 'locationabbr', 'locationdesc', 'datasource',
       'class', 'topic', 'question', 'data_value_unit', 'data_value_type',
       'data_value', 'data_value_alt', 'data_value_footnote_symbol',
       'data_value_footnote', 'low_confidence_limit', 'high_confidence_limit',
       'sample_size', 'total', 'age_years_', 'education', 'gender', 'income',
       'race_ethnicity', 'geolocation', 'classid', 'topicid', 'questionid',
       'datavaluetypeid', 'locationid', 'stratificationcategory1',
       'stratification1', 'stratificationcategoryid1', 'stratificationid1'],
      dtype='object')

In [4]:
formatted = df[['yearstart', 'locationabbr', 'questionid', 'data_value', 'stratification1']]
formatted = pd.pivot_table(
    formatted
    , values='data_value'
    , index=['yearstart', 'locationabbr']
    , columns=['questionid', 'stratification1']
    , fill_value=0
)

In [5]:
formatted.values

array([[ 0. ,  0. ,  0. , ..., 35.6, 21.2, 22. ],
       [ 0. ,  0. ,  0. , ...,  0. , 31.6, 32.6],
       [ 0. ,  0. ,  0. , ...,  0. , 26.8, 30.9],
       ...,
       [ 0. ,  0. ,  0. , ...,  0. , 20.9, 20.5],
       [ 0. ,  0. ,  0. , ...,  0. , 24.7, 29.7],
       [ 0. ,  0. ,  0. , ...,  0. , 22.6, 22.4]])

In [6]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(formatted.values)
print(pca.explained_variance_ratio_)

[7.06367774e-01 1.22453195e-01 3.00546561e-02 2.49185747e-02
 2.02314277e-02 1.42014141e-02 7.50823333e-03 7.15960396e-03
 6.28179912e-03 6.00602033e-03 4.90753323e-03 4.08771633e-03
 3.24399248e-03 2.85721201e-03 2.39869322e-03 1.98753763e-03
 1.98358501e-03 1.75048073e-03 1.62386757e-03 1.50150867e-03
 1.38706379e-03 1.33543066e-03 1.27970145e-03 1.12820174e-03
 9.60265066e-04 9.48300407e-04 8.51617726e-04 7.85768413e-04
 7.22159725e-04 6.87645234e-04 6.62376114e-04 6.23827172e-04
 5.92498248e-04 5.72255385e-04 5.26220059e-04 5.19076262e-04
 5.00552988e-04 4.77990625e-04 4.57231710e-04 4.49364598e-04
 4.05348202e-04 3.99108293e-04 3.73889097e-04 3.57019185e-04
 3.40111378e-04 3.36741150e-04 3.29875501e-04 3.06926944e-04
 2.95958018e-04 2.83435637e-04 2.75242243e-04 2.62645007e-04
 2.50185430e-04 2.47243432e-04 2.31820323e-04 2.25619549e-04
 2.19125308e-04 2.13416473e-04 2.04901380e-04 2.00331642e-04
 1.97005048e-04 1.92614212e-04 1.84471566e-04 1.82252669e-04
 1.79510568e-04 1.733607

In [44]:
pca.components_[0]

array([ 5.44343040e-02,  5.34313602e-02,  5.22597103e-02,  5.08010221e-02,
        4.65481962e-02,  5.68812286e-02,  5.06760852e-02,  5.36474769e-02,
        5.08626721e-02,  5.40083117e-02,  5.14823570e-02,  4.34171272e-02,
        3.73834978e-02,  3.08125754e-02,  4.18887705e-02,  5.12252733e-02,
        4.66966318e-02,  3.41866119e-03,  5.70812184e-02,  4.53076576e-02,
        6.04267796e-02,  5.71594698e-02,  5.57154516e-02,  4.60674067e-02,
        5.01402819e-02,  1.43062179e-02,  5.21297025e-02,  5.11090857e-02,
        3.36640618e-02,  3.01450569e-02,  2.53051157e-02,  2.21160084e-02,
        1.73001156e-02,  3.42444466e-02,  2.40859156e-02,  2.64425041e-02,
        2.36617999e-02,  2.49024079e-02,  2.48197362e-02,  2.39137250e-02,
        1.77514734e-02,  1.48143207e-02,  1.70445156e-02,  3.06487814e-02,
        2.27212897e-02,  2.05382139e-03,  3.06852740e-02,  3.84689290e-02,
        4.07305967e-02,  4.26949509e-02,  2.91966930e-02,  2.90123896e-02,
        2.12735851e-02,  

In [7]:
formatted.shape

(535, 252)

In [8]:
print(pca.explained_variance_ratio_.cumsum())

[0.70636777 0.82882097 0.85887562 0.8837942  0.90402563 0.91822704
 0.92573527 0.93289488 0.93917668 0.9451827  0.95009023 0.95417795
 0.95742194 0.96027915 0.96267785 0.96466538 0.96664897 0.96839945
 0.97002332 0.97152482 0.97291189 0.97424732 0.97552702 0.97665522
 0.97761549 0.97856379 0.97941541 0.98020117 0.98092333 0.98161098
 0.98227336 0.98289718 0.98348968 0.98406194 0.98458816 0.98510723
 0.98560779 0.98608578 0.98654301 0.98699237 0.98739772 0.98779683
 0.98817072 0.98852774 0.98886785 0.98920459 0.98953447 0.98984139
 0.99013735 0.99042079 0.99069603 0.99095867 0.99120886 0.9914561
 0.99168792 0.99191354 0.99213267 0.99234608 0.99255098 0.99275132
 0.99294832 0.99314094 0.99332541 0.99350766 0.99368717 0.99386053
 0.99402615 0.99418637 0.99434332 0.99449481 0.99464364 0.99478791
 0.99492934 0.99506553 0.99519735 0.99532365 0.99544574 0.9955672
 0.99568268 0.99579574 0.99590588 0.99601578 0.99612227 0.99622541
 0.9963254  0.99642068 0.99651426 0.99660596 0.99669647 0.996782

In [26]:
len(pca.components_[0])

252

In [45]:
pca.n_components_

252

In [46]:
len(pca.components_)

252

In [48]:
len(pca.components_[251])

252

In [27]:
formatted.shape

(535, 252)

In [9]:
plot_data = formatted.reset_index()[['locationabbr', 'yearstart']]\
    .join(
        pd.DataFrame(
            {'PC1': pca.components_[0], 'PC2': pca.components_[1]}
        )
    )
plot_data.head()

  plot_data = formatted.reset_index()[['locationabbr', 'yearstart']]\


Unnamed: 0,"(locationabbr, )","(yearstart, )",PC1,PC2
0,AK,2011,0.054434,0.175641
1,AL,2011,0.053431,0.171641
2,AR,2011,0.05226,0.168841
3,AZ,2011,0.050801,0.164946
4,CA,2011,0.046548,0.150727


In [10]:
plot_data.columns

Index([('locationabbr', ''), ('yearstart', ''), 'PC1', 'PC2'], dtype='object')

In [11]:
plot_data.columns = ['locationabbr', 'yearstart', 'PC1', 'PC2']
plot_data.head()

Unnamed: 0,locationabbr,yearstart,PC1,PC2
0,AK,2011,0.054434,0.175641
1,AL,2011,0.053431,0.171641
2,AR,2011,0.05226,0.168841
3,AZ,2011,0.050801,0.164946
4,CA,2011,0.046548,0.150727


In [12]:
from bokeh.palettes import inferno

In [13]:
n_colors = plot_data.locationabbr.nunique() 
color_map = {l:c for l,c in zip(plot_data.locationabbr.unique(), inferno(n=n_colors))}

In [14]:
plot_data['color'] = plot_data.locationabbr.map(color_map)

In [15]:
plot_data.head()

Unnamed: 0,locationabbr,yearstart,PC1,PC2,color
0,AK,2011,0.054434,0.175641,#000003
1,AL,2011,0.053431,0.171641,#010109
2,AR,2011,0.05226,0.168841,#040314
3,AZ,2011,0.050801,0.164946,#08061F
4,CA,2011,0.046548,0.150727,#0D0828


In [16]:
from bokeh.models import Label, LabelSet

In [24]:
plot_data[plot_data.yearstart == 2017].head()

Unnamed: 0,locationabbr,yearstart,PC1,PC2,color
320,AK,2017,,,#000003
321,AL,2017,,,#010109
322,AR,2017,,,#040314
323,AZ,2017,,,#08061F
324,CA,2017,,,#0D0828


In [20]:
source = ColumnDataSource(data=dict(
    PC1=plot_data.PC1
    , PC2=plot_data.PC2
    , locationabbr=plot_data.locationabbr
    , color=plot_data.color
))

plot = figure(width=400, 
              height=400, 
              tools="crosshair,pan,reset,save,wheel_zoom,box_zoom", 
              x_axis_label="PC1", 
              y_axis_label="PC2", 
              title="PCA Components");
plot.scatter('PC1','PC2',source=source, fill_color='color', fill_alpha=0.6);
labels = LabelSet(x='PC1', y='PC2', text='locationabbr',
              x_offset=5, y_offset=5, source=source)
plot.add_layout(labels)

p = gridplot([plot],ncols=2, merge_tools=True)
    
def update(y):
    filtered_data = plot_data[plot_data.yearstart == y]
    source.data = dict(
        PC1=filtered_data.PC1
        , PC2=filtered_data.PC2
        , locationabbr=filtered_data.locationabbr
        , color=filtered_data.color
    )
    output_notebook();
    show(p);
    push_notebook();

interact(update, y=(2011,2020), continuous_update=False)

interactive(children=(IntSlider(value=2015, description='y', max=2020, min=2011), Output()), _dom_classes=('wi…

<function __main__.update(y)>

Notes:
- In 2011, they are in like a diagonal line? What does that mean? I'm guessing it means I'm including a variable I shouldn't be...
- In 2012 we've got a few clusters,
       - one includes NC, NJ, DC, FL, MT, CA, ID
       - another includes OK, OH, ME, NY, NM, TX, VA
       - AR, AK far from the rest
- 2013 more clusters
    - one has NC, NM, HI, LA, NH, MT
    - another has SD TX WV, PA
    - third has AK, AR, CA, DC, AZ
- 2014 has a cluster with NJ, NH, HI, NY and more
- 2015 has one with NC, NJ, LA, MD, I wouldn't be surprised if this is the same one we saw in 2014...
- what happens in 2016???