# 2.0 Define Upgraded Vis
Define 
- data-requirements to reproduce the visualizations
- the output formats for the data and the visualization objects

_Paulo G. Martinez_ 9/21/2020

In [1]:
# import packages
# for json loading and dumping
import json
# for os/platform independent path handling
from pathlib import Path
# for table manipulations
import pandas as pd
# for html friendly interactive visualizations
import plotly.graph_objects as go
# for pretty printing
import pprint
# for string manipulations
import re

## Target Visualizations to Enhance:
![Monthly Milk Production 24 Selected States](pictures/national-milk-cows-prod-2019-2020.png)

## Get data from QS UI like so
Commodity: MILK, CATTLE

Data Item: 
- CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED IN HEAD
- MILK - PRODUCTION, MEASURED IN LB
- MILK - PRODUCTION, MEASURED IN LB / HEAD

Geographic Level: National

Year: (All)

Period Type: MONTHLY

Period: JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, DEC
![QS National Monthly Milk Prod in Lbs, Milk Prod in Lbs per Head, Milk Cows](pictures/qs-nat-mo-mlkLbs-mlkLbsPerHead-mlkHead-allYears.png)
**QS UI Bug** If you select "Commodity: CATTLE" before command+selecting "Commodity: MILK" then the "Data Item:" Menu seems to "truncate" or filter out some of the "Commodity: MILK" "Data Item:"s. 
- I was able to "get around" this bug by selecting "Commodity: MILK" before command+selecting "Commodity: CATTLE"

## Read csv into data frame

In [2]:
# Location of the raw Quick Stats (QS) export, relative to the notebook's
# working directory; Path keeps the separator handling OS-independent.
path_to_data = Path('data/nat-mo-mlkLbs-mlkLbsPerHead-mlkHead-allYears-F0ACA575-6D37-3687-AD58-E3B7F165E3A3.csv')
# Read the raw QS table into pandas with default parsing; the comma-formatted
# "Value" column therefore arrives as strings and is cast in a later cell.
qs_df = pd.read_csv(path_to_data)
# Show the first three rows as a quick sanity check of the load.
qs_df.head(3)

Unnamed: 0,Program,Year,Period,Week Ending,Geo Level,State,State ANSI,Ag District,Ag District Code,County,...,Zip Code,Region,watershed_code,Watershed,Commodity,Data Item,Domain,Domain Category,Value,CV (%)
0,SURVEY,2020,JAN,,NATIONAL,US TOTAL,,,,,...,,,0,,CATTLE,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED ...",TOTAL,NOT SPECIFIED,9361000,
1,SURVEY,2020,JAN,,NATIONAL,US TOTAL,,,,,...,,,0,,MILK,"MILK - PRODUCTION, MEASURED IN LB",TOTAL,NOT SPECIFIED,18860000000,
2,SURVEY,2020,JAN,,NATIONAL,US TOTAL,,,,,...,,,0,,MILK,"MILK - PRODUCTION, MEASURED IN LB / HEAD",TOTAL,NOT SPECIFIED,2015,


# NOTE: _TECHNICALLY_ THIS IS THE MINIMUM AMOUNT OF "PREPROCESSING" THAT DATA-MART REQUIRES DATA-BASE TO SERVE
# ----------------------------------------------
## PROVIDED THAT DATA-MART HAS ITS OWN COMPUTE RESOURCES FOR THE REMAINING ETL

In [3]:
# Summarize column names, non-null counts and dtypes of the raw table.
qs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2797 entries, 0 to 2796
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Program           2797 non-null   object 
 1   Year              2797 non-null   int64  
 2   Period            2797 non-null   object 
 3   Week Ending       0 non-null      float64
 4   Geo Level         2797 non-null   object 
 5   State             2797 non-null   object 
 6   State ANSI        0 non-null      float64
 7   Ag District       0 non-null      float64
 8   Ag District Code  0 non-null      float64
 9   County            0 non-null      float64
 10  County ANSI       0 non-null      float64
 11  Zip Code          0 non-null      float64
 12  Region            0 non-null      float64
 13  watershed_code    2797 non-null   int64  
 14  Watershed         0 non-null      float64
 15  Commodity         2797 non-null   object 
 16  Data Item         2797 non-null   object 


### correct the datatype for the 'Value' column from strings to numeric

In [4]:
# Peek at the first raw "Value" entries as a NumPy array to see how they are formatted.
qs_df.Value.head().values

array(['9,361,000', '18,860,000,000', '2,015', '9,375,000',
       '17,890,000,000'], dtype=object)

In [5]:
# try removing the thousands separators from a single entry first
qs_df.Value[0].replace(',', '')

'9361000'

In [6]:
# Preview the comma stripping on the whole column (the result is displayed,
# not assigned). The vectorized .str accessor replaces the per-row lambda, and
# regex=False treats ',' as a literal character rather than a pattern.
qs_df.Value.str.replace(',', '', regex=False)

0           9361000
1       18860000000
2              2015
3           9375000
4       17890000000
           ...     
2792     7091000000
2793            315
2794       22562000
2795     7391000000
2796            328
Name: Value, Length: 2797, dtype: object

**Check if there are any non-numeric values that need to be handled**

In [7]:
# Validate the "Value" column and cast it from comma-formatted strings to numbers.
print('Checking for non-numeric values in "Value" column...')
# Strip the thousands separators. astype(str) keeps this cell safe to re-run
# after the column has already been cast to a numeric dtype.
values_without_commas = qs_df.Value.astype(str).str.replace(',', '', regex=False)
# Let pandas decide what is a number: str.isnumeric() rejects signs and
# decimal points and accepts characters such as '½' that cannot be cast.
# Entries that cannot be parsed become NaN instead of raising.
values_as_numbers = pd.to_numeric(values_without_commas, errors='coerce')
# collect the raw entries that failed to parse
non_numerics = set(qs_df.Value[values_as_numbers.isna()])
if len(non_numerics) == 0:
    # the resulting dtype is int64 when every entry is a whole number
    print('Good to go! Casting "Value" column to int64')
    qs_df['Value'] = values_as_numbers
else:
    # leave the column untouched and report what needs manual handling
    print('Need to handle:')
    print(non_numerics)

Checking for non-numeric values in "Value" column...
Good to go! Casting "Value" column to int64


# Declutter the Pivoted Data
### Drop empty columns

In [8]:
# remove every column that holds no data at all (all cells are NaN)
qs_df = qs_df.dropna(axis = 1, how = 'all')

### Split non-varying columns into a metadata object

In [9]:
# start with an empty metadata object
metadata_dct = {}

# names of the columns that hold a single unique value across all rows
constant_columns = [name for name in qs_df.columns if len(qs_df[name].unique()) == 1]

for name in constant_columns:
    only_value = qs_df[name].unique()[0]
    # report the column header together with its one value
    print(name, ":", only_value)
    # record the pair in the metadata
    metadata_dct[name] = only_value

# remove all constant columns from the data frame in one go
qs_df = qs_df.drop(columns = constant_columns)

Program : SURVEY
Geo Level : NATIONAL
State : US TOTAL
watershed_code : 0
Domain : TOTAL
Domain Category : NOT SPECIFIED


#### drop null-metadata

**get dictionary of known null-sentinel-values**

In [10]:
# location of the persisted dictionary of known null-sentinel values
path_to_sentinel_nulls_dct = Path('sentinel-nulls.json')
# start from an empty dict when nothing has been written yet
if not path_to_sentinel_nulls_dct.is_file():
    print('Did not find', path_to_sentinel_nulls_dct, 'in active directory. Initializing file...')
    sentinel_nulls_dct = {}
# otherwise load what was documented on a previous run
else:
    print('Found', path_to_sentinel_nulls_dct, 'in active directory. Reading file...')
    with open(path_to_sentinel_nulls_dct, 'r') as file_path:
        sentinel_nulls_dct = json.load(file_path)
    assert type(sentinel_nulls_dct) == dict

# document the known sentinel values, leaving already documented entries alone
print('Adding undocumented sentinel-null value entries, if any...')
for attribute, sentinel in [
    ('watershed_code', 0),
    ('Domain Category', 'NOT SPECIFIED'),
    ('State', 'US TOTAL'),
]:
    sentinel_nulls_dct.setdefault(attribute, sentinel)

# persist the (possibly extended) null-sentinel-dictionary
print('Writing', path_to_sentinel_nulls_dct, 'to active directory...')
with open(path_to_sentinel_nulls_dct, 'w') as file_path:
    json.dump(sentinel_nulls_dct, file_path)
print('Done.')

Found sentinel-nulls.json in active directory. Reading file...
Adding undocumented sentinel-null value entries, if any...
Writing sentinel-nulls.json to active directory...
Done.


**drop values known to be sentinels for NULL from the metadata**

In [11]:
# metadata entries whose value is the documented NULL sentinel for that attribute
null_attributes = [
    attribute for attribute in sentinel_nulls_dct
    if attribute in metadata_dct and sentinel_nulls_dct[attribute] == metadata_dct[attribute]
]
# they carry no information, so remove them from the metadata
for attribute in null_attributes:
    del metadata_dct[attribute]

**display the metadata as dictionary (json)**

In [12]:
# heading line, followed by a dashed rule of the same length
title = 'Auto-detected Meta-Data: (formatted as JSON)\n'
underline = '-' * len(title)
feedback = title + underline + '\n'
print(feedback)
# pretty-print the metadata dictionary itself
pp = pprint.PrettyPrinter()
pp.pprint(metadata_dct)

Auto-detected Meta-Data: (formatted as JSON)
---------------------------------------------

{'Domain': 'TOTAL', 'Geo Level': 'NATIONAL', 'Program': 'SURVEY'}


**Display the metadata as a table**

In [13]:
# heading line, followed by a dashed rule of the same length
title = 'Auto-detected Meta-Data: (formatted as table)\n'
underline = '-' * len(title)
feedback = title + underline + '\n'
print(feedback)
# one-row table: every metadata attribute becomes a column holding its single value
metadata_df = pd.DataFrame({key: [value] for key, value in metadata_dct.items()})
metadata_df

Auto-detected Meta-Data: (formatted as table)
----------------------------------------------



Unnamed: 0,Program,Geo Level,Domain
0,SURVEY,NATIONAL,TOTAL


**Display the first few rows of the pivoted data**

In [14]:
# Show the first five rows left after the empty and constant columns were removed.
qs_df.head()

Unnamed: 0,Year,Period,Commodity,Data Item,Value
0,2020,JAN,CATTLE,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED ...",9361000
1,2020,JAN,MILK,"MILK - PRODUCTION, MEASURED IN LB",18860000000
2,2020,JAN,MILK,"MILK - PRODUCTION, MEASURED IN LB / HEAD",2015
3,2020,FEB,CATTLE,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED ...",9375000
4,2020,FEB,MILK,"MILK - PRODUCTION, MEASURED IN LB",17890000000


**Drop "Commodity" column, if redundant**
- notice its value is the first component of 'Data Item'
- notice it is not an attribute of an observation, but rather an attribute of the 'Data Item' (i.e. of the variable), thus it belongs in a data dictionary (which appropriately describes the variables and not the observations.)

In [15]:
print('Checking "Commodity" column for redundancy and droppability...')
# count the rows whose 'Data Item' does not begin with that row's 'Commodity'
mismatch_count = 0
for commodity, data_item in zip(qs_df['Commodity'], qs_df['Data Item']):
    if not data_item.startswith(commodity):
        mismatch_count += 1

if mismatch_count:
    # at least one row carries information that 'Data Item' does not repeat
    print('Found', mismatch_count, 'non-redundant rows that need to be handled.')
else:
    # every commodity is already spelled out at the start of its data item
    print('column is redundant, dropped from data')
    qs_df = qs_df.drop(columns = ['Commodity'])

Checking "Commodity" column for redundancy and droppability...
column is redundant, dropped from data


**Display Decluttered Data**

In [16]:
# Show the first five rows of the decluttered long-format table.
qs_df.head()

Unnamed: 0,Year,Period,Data Item,Value
0,2020,JAN,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED ...",9361000
1,2020,JAN,"MILK - PRODUCTION, MEASURED IN LB",18860000000
2,2020,JAN,"MILK - PRODUCTION, MEASURED IN LB / HEAD",2015
3,2020,FEB,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED ...",9375000
4,2020,FEB,"MILK - PRODUCTION, MEASURED IN LB",17890000000


## "Practice example" of how I'll reshape the data into "tidy data format."

In [17]:
# a small long-format practice frame: one row per (year, month, variable) observation
foo_rows = [
    ('2019', 'J', 'twos', 2),
    ('2020', 'J', 'twos', 4),
    ('2019', 'J', 'threes', 3),
    ('2020', 'J', 'threes', 6),
    ('2019', 'J', 'fives', 5),
    ('2020', 'J', 'fives', 10),
]
foo_df = pd.DataFrame(foo_rows, columns = ['yr', 'mo', 'var', 'val'])
foo_df

Unnamed: 0,yr,mo,var,val
0,2019,J,twos,2
1,2020,J,twos,4
2,2019,J,threes,3
3,2020,J,threes,6
4,2019,J,fives,5
5,2020,J,fives,10


**practice splitting the df by variable and joining them back into tidy data format**

In [18]:
# Reshape foo_df from long format (one row per yr/mo/var) into tidy format
# (one row per yr/mo, one column per variable).
# accumulator for the tidy result; None until the first variable is processed
buzz_df = None
# for each unique variable in the variable column
for v in foo_df['var'].unique():
    # rows for this variable, with 'val' renamed after the variable and the
    # now redundant 'var' column dropped
    variable_df = foo_df[foo_df['var'] == v].rename(
        columns = {'val': v}
    ).drop(
        columns = ['var']
    )
    if buzz_df is None:
        # the first variable seeds the accumulator; it must not also be merged
        # with itself, which is what a second `if` on the just-flipped flag did
        buzz_df = variable_df
    else:
        # full outer join on the columns both frames share (pd.merge's default
        # when `on` is omitted), so no yr/mo combination is lost
        buzz_df = pd.merge(
            left = buzz_df,
            right = variable_df,
            how = 'outer'
        )
buzz_df

Unnamed: 0,yr,mo,twos,threes,fives
0,2019,J,2,3,5
1,2020,J,4,6,10


**alternative way to tidy the data**

In [19]:
# spread each variable into its own column, keyed by year and month
baz_df = foo_df.pivot(index = ['yr', 'mo'], columns = 'var', values = 'val')
# turn the (yr, mo) index back into proper columns
baz_df = baz_df.reset_index()
# remove the leftover 'var' label from the column index
baz_df.columns.name = None
baz_df

Unnamed: 0,yr,mo,fives,threes,twos
0,2019,J,5,3,2
1,2020,J,10,6,4


## Reshape the table into "Tidy Data Format"
Note: There are many ways to do this; I picked the approach I thought easiest to interpret. There may be more efficient but less transparent ways to accomplish this with pivoting.

In [20]:
# Reshape qs_df from long format (one row per Year/Period/'Data Item') into
# tidy format (one row per Year/Period, one column per 'Data Item').
# accumulator for the tidy result; None until the first variable is processed
tidy_df = None
# for each unique variable in the variable column
for v in qs_df['Data Item'].unique():
    # rows for this variable, with 'Value' renamed after the variable and the
    # now redundant 'Data Item' column dropped
    variable_df = qs_df[qs_df['Data Item'] == v].rename(
        columns = {'Value': v}
    ).drop(
        columns = ['Data Item']
    )
    if tidy_df is None:
        # the first variable seeds the accumulator; it must not also be merged
        # with itself, which is what a second `if` on the just-flipped flag did
        tidy_df = variable_df
    else:
        # full outer join on the columns both frames share (pd.merge's default
        # when `on` is omitted); periods missing for one variable are kept as NaN
        tidy_df = pd.merge(
            left = tidy_df,
            right = variable_df,
            how = 'outer'
        )
tidy_df.head(20)

Unnamed: 0,Year,Period,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED IN HEAD","MILK - PRODUCTION, MEASURED IN LB","MILK - PRODUCTION, MEASURED IN LB / HEAD"
0,2020,JAN,9361000.0,18860000000,2015.0
1,2020,FEB,9375000.0,17890000000,1908.0
2,2020,MAR,9385000.0,19380000000,2065.0
3,2020,APR,9375000.0,18675000000,1992.0
4,2020,MAY,9360000.0,18955000000,2025.0
5,2020,JUN,9350000.0,18367000000,1964.0
6,2020,JUL,9360000.0,18735000000,2002.0
7,2020,AUG,9360000.0,18600000000,1987.0
8,2019,JAN,9354000.0,18612000000,1990.0
9,2019,FEB,9352000.0,16966000000,1814.0


Alternative tidying applied to qs_df (the output loses the chronological order implicit in the original qs_df).

In [21]:
# NOTE: this cell is a disabled alternative. The triple-quoted string below is
# only evaluated as a string expression (and echoed as the cell output); the
# pivot code inside it is never executed, so tidy_df is not modified here.
'''# reshape the qs data frame into tidy data format
tidy_df = qs_df.pivot(
    # preserve the tidy columns by storing them into the index
    index = ['Year', 'Period'],
    # pivot on the 'Data Item' column
    columns = ['Data Item'],
    # declare the values to pivot into the 'Data Item' column
    values = ['Value']
)[
    # drop down a level in the resulting columns index by selecting the Value column
    'Value'
].reset_index(
    # convert the pivotted index back into propper columns
    drop = False
)
# drop the Value column name from the resulting index
tidy_df.columns.name = None
# sort the resulting tidy_df by Year descending and reset the index to avoid confusion
tidy_df = tidy_df.sort_values('Year', ascending = False).reset_index(drop = True)
# display the results
tidy_df.head(12)'''

"# reshape the qs data frame into tidy data format\ntidy_df = qs_df.pivot(\n    # preserve the tidy columns by storing them into the index\n    index = ['Year', 'Period'],\n    # pivot on the 'Data Item' column\n    columns = ['Data Item'],\n    # declare the values to pivot into the 'Data Item' column\n    values = ['Value']\n)[\n    # drop down a level in the resulting columns index by selecting the Value column\n    'Value'\n].reset_index(\n    # convert the pivotted index back into propper columns\n    drop = False\n)\n# drop the Value column name from the resulting index\ntidy_df.columns.name = None\n# sort the resulting tidy_df by Year descending and reset the index to avoid confusion\ntidy_df = tidy_df.sort_values('Year', ascending = False).reset_index(drop = True)\n# display the results\ntidy_df.head(12)"

### add a numeric month column

In [22]:
# month abbreviations in calendar order, as they appear in the 'Period' column
month_abbreviations = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]
# pair each abbreviation with its month number (JAN -> 1, ..., DEC -> 12)
month_to_int_dct = dict(zip(month_abbreviations, range(1, 13)))
# translate every period into its month number
tidy_df['Month'] = tidy_df['Period'].map(month_to_int_dct)
tidy_df.head()

Unnamed: 0,Year,Period,"CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED IN HEAD","MILK - PRODUCTION, MEASURED IN LB","MILK - PRODUCTION, MEASURED IN LB / HEAD",Month
0,2020,JAN,9361000.0,18860000000,2015.0,1
1,2020,FEB,9375000.0,17890000000,1908.0,2
2,2020,MAR,9385000.0,19380000000,2065.0,3
3,2020,APR,9375000.0,18675000000,1992.0,4
4,2020,MAY,9360000.0,18955000000,2025.0,5


## Init data dictionary

In [23]:
# Summarize the tidy table's columns, non-null counts and dtypes before documenting them.
tidy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 935 entries, 0 to 934
Data columns (total 6 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Year                                                   935 non-null    int64  
 1   Period                                                 935 non-null    object 
 2   CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED IN HEAD  931 non-null    float64
 3   MILK - PRODUCTION, MEASURED IN LB                      935 non-null    int64  
 4   MILK - PRODUCTION, MEASURED IN LB / HEAD               931 non-null    float64
 5   Month                                                  935 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 51.1+ KB


In [24]:
# hand-written data dictionary: one record per column of the tidy table
data_dict_fields = ['Estimate', 'Description', 'Units', 'Aggregation', 'Data Item', 'Commodity']
data_dict_records = [
    (
        'Year',
        'Calendar year of survey estimate',
        'Calendar year',
        None,
        'Year',
        None,
    ),
    (
        'Period',
        'Calendar month of survey estimate',
        'Calendar month',
        None,
        'Period',
        None,
    ),
    (
        'Milk Cows',
        'Heads of milk-producing cattle. It is unclear why the "Data Item" includes the aggregation "AVG"',
        'Heads of cattle',
        'Unclear, presumably sum of state counts but the "Data Item" name icludes "AVG" which is confusing',
        'CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED IN HEAD',
        'CATTLE',
    ),
    (
        'Milk Production Lbs',
        "Total pounds of milk produced",
        'Lbs',
        'Total',
        'MILK - PRODUCTION, MEASURED IN LB',
        'MILK',
    ),
    (
        'Milk Per Cow',
        "Total pounds of milk produced divided by number of milk producing cows",
        'Lbs per cattle-head',
        'Average',
        'MILK - PRODUCTION, MEASURED IN LB / HEAD',
        'MILK',
    ),
    (
        'Month',
        "Calendar month as integer",
        'Calendar month',
        None,
        'Month',
        None,
    ),
]
# transpose the records into one list per field and build the table from them
tidy_data_dict = pd.DataFrame({
    field: list(column) for field, column in zip(data_dict_fields, zip(*data_dict_records))
})
tidy_data_dict

Unnamed: 0,Estimate,Description,Units,Aggregation,Data Item,Commodity
0,Year,Calendar year of survey estimate,Calendar year,,Year,
1,Period,Calendar month of survey estimate,Calendar month,,Period,
2,Milk Cows,Heads of milk-producing cattle. It is unclear ...,Heads of cattle,"Unclear, presumably sum of state counts but th...","CATTLE, COWS, MILK - INVENTORY, AVG, MEASURED ...",CATTLE
3,Milk Production Lbs,Total pounds of milk produced,Lbs,Total,"MILK - PRODUCTION, MEASURED IN LB",MILK
4,Milk Per Cow,Total pounds of milk produced divided by numbe...,Lbs per cattle-head,Average,"MILK - PRODUCTION, MEASURED IN LB / HEAD",MILK
5,Month,Calendar month as integer,Calendar month,,Month,


## Rename columns to be more user-friendly


In [25]:
# map every raw 'Data Item' header to its user-friendly 'Estimate' label,
# keeping the first label listed for a header
friendly_labels = {}
for data_item, estimate in zip(tidy_data_dict['Data Item'], tidy_data_dict['Estimate']):
    friendly_labels.setdefault(data_item, estimate)
# apply the labels to the tidy table's columns
tidy_df = tidy_df.rename(columns = friendly_labels)
tidy_df.head()

Unnamed: 0,Year,Period,Milk Cows,Milk Production Lbs,Milk Per Cow,Month
0,2020,JAN,9361000.0,18860000000,2015.0,1
1,2020,FEB,9375000.0,17890000000,1908.0,2
2,2020,MAR,9385000.0,19380000000,2065.0,3
3,2020,APR,9375000.0,18675000000,1992.0,4
4,2020,MAY,9360000.0,18955000000,2025.0,5


# NOTE: IDEALLY DB WOULD SERVE DATA-MART THE FOLLOWING THREE OBJECTS:
# -------------------------------
- ## DATA DICTIONARY (see above)
- ## TIDY DATA (see above)
- ## META DATA (see below)

This is only the first instance of what is likely to be a more generalizable workflow, but if the data mart is defined by containing the data necessary to replicate the three time series plots in this milk report, then it needs to contain the tidy data (and should contain the metadata and data dictionary to provide helpful context to its clients). 

# -------------------------------

In [26]:
# Display the one-row metadata table again, next to the note describing the objects to serve.
metadata_df

Unnamed: 0,Program,Geo Level,Domain
0,SURVEY,NATIONAL,TOTAL


In [27]:
# Save the three tables defining the data mart's data requirements.
output_dir = Path('data-mart-tech-specs')
# to_csv does not create missing folders, so make sure the target exists first
output_dir.mkdir(parents = True, exist_ok = True)
# index = False keeps the positional row index out of the files
metadata_df.to_csv(output_dir / 'meta-data.csv', index = False)
tidy_data_dict.to_csv(output_dir / 'data-dict.csv', index = False)
tidy_df.to_csv(output_dir / 'tidy-data.csv', index = False)

# DEFINE PLOT OBJECTS FOR DATA MART TO SERVE TO API
# ------------------------------------------------

## Replicate the plots

**Milk Production Lbs**

In [28]:
# Column of tidy_df that the next plotting and saving cells read through `col`.
col = 'Milk Production Lbs'

In [29]:
# start from an empty interactive figure
fig = go.FigureWidget()
# one trace per year found in the tidy table
for yr in tidy_df.Year.unique():
    # this year's rows in calendar order
    yr_df = tidy_df[tidy_df.Year == yr].sort_values(by = ['Month'])
    # options shared by every trace
    trace_options = {'mode': 'lines+markers', 'name': str(yr)}
    # every year other than 2020 and 2019 starts toggled off (legend only)
    if yr not in {2020, 2019}:
        trace_options['visible'] = 'legendonly'
    fig.add_trace(
        go.Scatter(x = yr_df['Period'], y = yr_df[col], **trace_options)
    )
# title the figure and label the y axis after the plotted column
plot_title = 'Monthly ' + col + ', National'
fig.update_layout(title = plot_title, yaxis_title = col)
# display the figure
fig

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': '2020',
              'type': 'sca…

**save vis objects**

In [30]:
# Persist the current figure as HTML, JSON and PNG.
# init extensionless file path: lower-case the column name and swap spaces for
# dashes, e.g. 'Milk Production Lbs' -> 'data-mart-tech-specs/milk-production-lbs'
fp = 'data-mart-tech-specs/' + re.sub(' ', '-', col.lower())
print(fp)
# save to html; utf-8 is set explicitly because the default text encoding is
# platform dependent and may be unable to encode the exported document
with open(Path(fp + '.html'), 'w', encoding = 'utf-8') as file_path:
    file_path.write(fig.to_html())
# save to json, with the same explicit encoding
with open(Path(fp + '.json'), 'w', encoding = 'utf-8') as file_path:
    file_path.write(fig.to_json())
# save to png (binary mode, so no text encoding is involved)
with open(Path(fp + '.png'), 'wb') as file_path:
    file_path.write(fig.to_image('png'))

data-mart-tech-specs/milk-production-lbs


In [31]:
# Echo the path the most recently opened file handle was created with;
# `file_path` is still bound to the handle from the last `with` block that ran.
file_path.name

'data-mart-tech-specs/milk-production-lbs.png'

**Milk Cows**

In [32]:
# Column of tidy_df that the next plotting and saving cells read through `col`.
col = 'Milk Cows'

In [33]:
# start from an empty interactive figure
fig = go.FigureWidget()
# one trace per year found in the tidy table
for yr in tidy_df.Year.unique():
    # this year's rows in calendar order
    yr_df = tidy_df[tidy_df.Year == yr].sort_values(by = ['Month'])
    # options shared by every trace
    trace_options = {'mode': 'lines+markers', 'name': str(yr)}
    # every year other than 2020 and 2019 starts toggled off (legend only)
    if yr not in {2020, 2019}:
        trace_options['visible'] = 'legendonly'
    fig.add_trace(
        go.Scatter(x = yr_df['Period'], y = yr_df[col], **trace_options)
    )
# title the figure and label the y axis after the plotted column
plot_title = 'Monthly ' + col + ', National'
fig.update_layout(title = plot_title, yaxis_title = col)
# display the figure
fig

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': '2020',
              'type': 'sca…

**save the vis objects**

In [34]:
# Persist the current figure as HTML, JSON and PNG.
# init extensionless file path: lower-case the column name and swap spaces for
# dashes, e.g. 'Milk Cows' -> 'data-mart-tech-specs/milk-cows'
fp = 'data-mart-tech-specs/' + re.sub(' ', '-', col.lower())
print(fp)
# save to html; utf-8 is set explicitly because the default text encoding is
# platform dependent and may be unable to encode the exported document
with open(Path(fp + '.html'), 'w', encoding = 'utf-8') as file_path:
    file_path.write(fig.to_html())
# save to json, with the same explicit encoding
with open(Path(fp + '.json'), 'w', encoding = 'utf-8') as file_path:
    file_path.write(fig.to_json())
# save to png (binary mode, so no text encoding is involved)
with open(Path(fp + '.png'), 'wb') as file_path:
    file_path.write(fig.to_image('png'))

data-mart-tech-specs/milk-cows


**Milk Per Cow**

In [35]:
# Column of tidy_df that the next plotting and saving cells read through `col`.
col = 'Milk Per Cow'

In [36]:
# start from an empty interactive figure
fig = go.FigureWidget()
# one trace per year found in the tidy table
for yr in tidy_df.Year.unique():
    # this year's rows in calendar order
    yr_df = tidy_df[tidy_df.Year == yr].sort_values(by = ['Month'])
    # options shared by every trace
    trace_options = {'mode': 'lines+markers', 'name': str(yr)}
    # every year other than 2020 and 2019 starts toggled off (legend only)
    if yr not in {2020, 2019}:
        trace_options['visible'] = 'legendonly'
    fig.add_trace(
        go.Scatter(x = yr_df['Period'], y = yr_df[col], **trace_options)
    )
# title the figure and label the y axis after the plotted column
plot_title = 'Monthly ' + col + ', National'
fig.update_layout(title = plot_title, yaxis_title = col)
# display the figure
fig

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': '2020',
              'type': 'sca…

**save vis objects**

In [37]:
# Persist the current figure as HTML, JSON and PNG.
# init extensionless file path: lower-case the column name and swap spaces for
# dashes, e.g. 'Milk Per Cow' -> 'data-mart-tech-specs/milk-per-cow'
fp = 'data-mart-tech-specs/' + re.sub(' ', '-', col.lower())
print(fp)
# save to html; utf-8 is set explicitly because the default text encoding is
# platform dependent and may be unable to encode the exported document
with open(Path(fp + '.html'), 'w', encoding = 'utf-8') as file_path:
    file_path.write(fig.to_html())
# save to json, with the same explicit encoding
with open(Path(fp + '.json'), 'w', encoding = 'utf-8') as file_path:
    file_path.write(fig.to_json())
# save to png (binary mode, so no text encoding is involved)
with open(Path(fp + '.png'), 'wb') as file_path:
    file_path.write(fig.to_image('png'))

data-mart-tech-specs/milk-per-cow
