# Intro

This is a template for data visualization with altair. 

# Boilerplate

We first have to load in the modules we need. We need pandas and numpy for working with the data and altair for the visualization.

In [56]:
import pandas as pd
import numpy as np
import altair as alt

# Recession Data
Here we create the data for the recession regions so that we can highlight the recession sections

In [68]:
recessions = [
    {
            "start": 1970,
            "end": 1971,
            "event": "Recesion of 1969-1970"
    },
    {
            "start": 1973,
            "end": 1975,
            "event": "Recession of 1973"
    },
    {
            "start": 1980,
            "end": 1981,
            "event": "1980 Recession"
    },
    {
            "start": 1990,
            "end": 1991,
            "event": "Early 1990s Recession"
    },
    {
            "start": 2001,
            "end": 2002,
            "event": "Early 2000s Recession"
    },
    {
            "start": 2007,
            "end": 2009,
            "event": "The Great Recession"
    }
]
df_rec = pd.DataFrame(recessions)

highlight = alt.Chart(df_rec).mark_rect(fillOpacity=.4).encode(
    x='start:O',
    x2='end:O',
    color=alt.Color('event:N', legend=alt.Legend(title='Recession'))
)

# Loading data and sanity check

We now load in the data and check the data

In [58]:
df = pd.read_csv('agriculture-and-rural-development.csv')
df.head()

Unnamed: 0,Country Name,Country Code,Year,"Access to electricity, rural (% of rural population)",Agricultural irrigated land (% of total agricultural land),Agricultural land (% of land area),Agricultural land (sq. km),"Agricultural machinery, tractors","Agricultural machinery, tractors per 100 sq. km of arable land",Agricultural methane emissions (% of total),...,Rural land area (sq. km),Rural land area where elevation is below 5 meters (% of total land area),Rural land area where elevation is below 5 meters (sq. km),Rural population,Rural population (% of total population),Rural population growth (annual %),Rural population living in areas where elevation is below 5 meters (% of total population),Rural poverty gap at national poverty lines (%),Rural poverty headcount ratio at national poverty lines (% of rural population),Surface area (sq. km)
0,Albania,ALB,1977,0.0,0.0,40.474453,11090.0,10000,172.413793,54.990065,...,0.0,0.0,0.0,1681437,66.895,1.919234,0.0,0.0,0.0,28750.0
1,Azerbaijan,AZE,2013,100.0,29.651138,57.704545,47698.0,0,0.0,0.0,...,0.0,0.0,0.0,4318074,45.855,0.723698,0.0,0.0,0.0,86600.0
2,Azerbaijan,AZE,2015,100.0,30.001258,57.701753,47698.0,0,0.0,0.0,...,0.0,0.0,0.0,4369801,45.286,0.548506,0.0,0.0,0.0,86600.0
3,Burkina Faso,BFA,1993,0.6,0.0,34.722222,95000.0,1380,4.023324,70.446194,...,0.0,0.0,0.0,8158483,85.407,2.402177,0.0,0.0,0.0,274220.0
4,Bulgaria,BGR,1989,0.0,0.0,55.753412,61680.0,53653,139.430873,36.030421,...,0.0,0.0,0.0,3015064,33.965,-2.174805,0.0,0.0,0.0,110990.0


In [59]:
df.dtypes

Country Name                                                                                   object
Country Code                                                                                   object
Year                                                                                            int64
Access to electricity, rural (% of rural population)                                          float64
Agricultural irrigated land (% of total agricultural land)                                    float64
Agricultural land (% of land area)                                                            float64
Agricultural land (sq. km)                                                                    float64
Agricultural machinery, tractors                                                                int64
Agricultural machinery, tractors per 100 sq. km of arable land                                float64
Agricultural methane emissions (% of total)                                       

In [60]:
df_usa = df[df['Country Code']=='USA'].sort_values(by=['Year'])
print(len(df_usa))

58


In [70]:
#df_temp = df_usa.copy()
#df_temp['ptemp'] = df_temp['Patent applications, residents'].diff()

col = list(df_usa.columns[3:])

dft = df_usa[(df_usa['Year'] != 2017) & (df_usa['Year'] > 1960)]

#22, 24
#27
cur = col[27]
print(cur)
temp = alt.Chart(dft).mark_point().encode(
    x='Year:O',
    y=cur,
    #color='Third Dim',
    #tooltip=['Item1', 'Item2', 'Item3', 'Item4']
).properties(
    title='Food production index by year (2004-2006 = 100)',
    width=760,
    height=400
)#.interactive()
temp + highlight

Food production index (2004-2006 = 100)


In [62]:
df.dtypes

Country Name                                                                                   object
Country Code                                                                                   object
Year                                                                                            int64
Access to electricity, rural (% of rural population)                                          float64
Agricultural irrigated land (% of total agricultural land)                                    float64
Agricultural land (% of land area)                                                            float64
Agricultural land (sq. km)                                                                    float64
Agricultural machinery, tractors                                                                int64
Agricultural machinery, tractors per 100 sq. km of arable land                                float64
Agricultural methane emissions (% of total)                                       

# Making some real charts

We first melt the data so that we can graph trademarks by origin

## Trademarks

In [25]:
df_melted = pd.melt(df_usa, id_vars=['Year'], value_vars=['Trademark applications, direct nonresident',
                                                          'Trademark applications, direct resident',
                                                          'Trademark applications, total'],
                   var_name='Trademark Origin', value_name='Trademark Applications')

# Remove 2017 year
df_melted = df_melted[df_melted['Year'] != 2017]

KeyError: "The following 'value_vars' are not present in the DataFrame: ['Trademark applications, direct nonresident', 'Trademark applications, direct resident', 'Trademark applications, total']"

In [16]:
tdmk = alt.Chart(df_melted).mark_line().encode(
    x='Year:O',
    y='Trademark Applications',
    color=alt.Color('Trademark Origin', legend=alt.Legend(title='Trademark'))
    #tooltip=['Item1', 'Item2', 'Item3', 'Item4']
).properties(
    title='Title',
    width=760,
    height=400,
)#.interactive()

(tdmk + highlight).resolve_scale(color='independent')

NameError: name 'df_melted' is not defined

## Patents

In [33]:
df_pat = pd.melt(df_usa, id_vars=['Year'], value_vars=col[41:45],
                   var_name='Unemployment', value_name='Unemployment Rate')

# Remove 2017 year
df_pat = df_pat[(df_pat['Year'] != 2017) & (df_pat['Year'] > 1964)]


pat = alt.Chart(df_pat).mark_line().encode(
    x='Year:O',
    y='Unemployment Rate',
    color=alt.Color('Unemployment', legend=alt.Legend(title='Unemployment'))
    #tooltip=['Item1', 'Item2', 'Item3', 'Item4']
).properties(
    title='Title',
    width=760,
    height=400,
)#.interactive()

(pat + highlight).resolve_scale(color='independent')