## Information Visualization II

## School of Information, University of Michigan

-Interactive Graphs in Altair

****

In [1]:
# pip install altair==4.2.2

In [25]:
# start with the setup
import pandas as pd
import numpy as np
import altair as alt
import warnings
warnings.filterwarnings("ignore")

In [3]:
alt.__version__

'4.2.2'

In [4]:
# Select Altair's 'default' renderer for chart output (enables correct rendering here).
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [5]:
# Switch Altair to the 'json' data transformer: it uses intermediate json
# files to speed things up.
alt.data_transformers.enable('json')

# Style every chart in the notebook with the FiveThirtyEight ('538') theme.
alt.themes.enable('fivethirtyeight')

ThemeRegistry.enable('fivethirtyeight')

## Create an interactive graph that allows selection across the top charts and filters the bottom charts

In [6]:
#sample function - provided
def createComicFrame(DCDataFile='data/dc-wikia-data.csv',marvelDatafile='data/marvel-wikia-data.csv'):
    """Load the DC and Marvel character CSVs and stack them into one frame.

    Each source is tagged with a ``publisher`` column ('DC' or 'Marvel'), the
    Marvel ``Year`` column is renamed to ``YEAR`` so it lines up with the DC
    column, and rows with a missing ``YEAR`` are dropped from the result.

    Parameters
    ----------
    DCDataFile : path of the DC CSV file.
    marvelDatafile : path of the Marvel CSV file.

    Returns
    -------
    pandas.DataFrame with the DC rows followed by the Marvel rows. Row labels
    are the ones read from each file (the index is not reset), so labels can
    repeat across publishers.
    """
    # DC already uses the 'YEAR' spelling, so it only needs the publisher tag.
    dc_frame = pd.read_csv(DCDataFile).assign(publisher='DC')

    # Marvel needs the tag plus the column rename to match DC.
    marvel_frame = (
        pd.read_csv(marvelDatafile)
        .assign(publisher='Marvel')
        .rename(columns={'Year': 'YEAR'})
    )

    stacked = pd.concat([dc_frame, marvel_frame])

    # Characters without a year cannot be placed on any per-year chart.
    return stacked.dropna(subset=['YEAR'])

In [7]:
# Build the combined DC + Marvel character frame from the default CSV paths.
comic = createComicFrame()

In [8]:
comic.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR,publisher
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0,DC
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0,DC
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0,DC
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0,DC
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0,DC


In [9]:
def genStaticBars(comicDF):
    """Build the two static "characters introduced per year" bar charts.

    Parameters
    ----------
    comicDF : DataFrame with at least ``YEAR`` and ``publisher`` columns
        (e.g. the frame returned by ``createComicFrame``).

    Returns
    -------
    (bar_dc, bar_marvel) : tuple of Altair charts, one filtered to
        publisher 'DC' and one to publisher 'Marvel'. Both count rows per
        YEAR on a fixed 0-500 y scale.
    """
    # Kept in one place because the Marvel chart re-declares its mark below.
    bar_size = 2.5

    def _year_axis(axis_title):
        # X encoding shared by both charts; only the axis title differs.
        # NOTE(review): the negative titlePadding presumably lifts the axis
        # title above the plot area -- confirm visually before changing it.
        return alt.X('YEAR:N',
                     axis=alt.Axis(values=[1940, 1960, 1980, 2000],
                                   labels=True,
                                   ticks=False,
                                   grid=True,
                                   title=axis_title,
                                   titlePadding=-347,
                                   labelAngle=360,
                                   labelFontWeight="bold",
                                   labelFontSize=15))

    p1_bar_base = alt.Chart(comicDF).mark_bar(size=bar_size).encode(
        alt.Y('count():Q',
              axis=alt.Axis(values=[0, 100, 200, 300, 400, 500],
                            title=None,
                            labelFontWeight="bold",
                            labelFontSize=15),
              scale=alt.Scale(domain=[0, 500]))
    ).properties(
        width=240,
        height=300
    )

    # DC: Altair's filter keeps only the DC rows for this chart.
    bar_dc = p1_bar_base.encode(
        _year_axis("DC, New Earth continuity")
    ).transform_filter(
        alt.datum.publisher == 'DC'
    )

    # Marvel: calling mark_bar() again replaces the whole mark definition, so
    # the bar size has to be repeated here; otherwise the Marvel bars lose
    # size=2.5 and no longer match the DC bars.
    bar_marvel = p1_bar_base.mark_bar(size=bar_size, color='#f6573f').encode(
        _year_axis("Marvel, Earth-616 continuity")
    ).transform_filter(
        # keep only the Marvel rows for this chart
        alt.datum.publisher == 'Marvel'
    )

    return (bar_dc, bar_marvel)

In [10]:
def genStaticLines(comicDF):
    """Build the two static per-year line charts, one line per SEX value.

    Parameters
    ----------
    comicDF : DataFrame with at least ``YEAR``, ``SEX`` and ``publisher``
        columns.

    Returns
    -------
    (line_dc, line_marvel) : tuple of Altair charts, filtered to publisher
        'DC' and 'Marvel' respectively.
    """
    # Y: number of rows (characters) in each year.
    count_y = alt.Y('count():Q',
                    axis=alt.Axis(grid=False,
                                  labelFontWeight="bold",
                                  labelFontSize=15,
                                  title=None))

    # Color: split the data by SEX, with a fixed domain/range pairing.
    sex_color = alt.Color('SEX',
                          scale=alt.Scale(domain=['Female Characters', 'Male Characters'],
                                          range=['#31a354', '#ce6dbd']),
                          legend=alt.Legend(orient="bottom"))

    p1_line_base = alt.Chart(comicDF).mark_line().encode(
        alt.X('YEAR:N'),
        count_y,
        sex_color
    ).properties(
        width=240, height=80
    )

    def _lines_for(publisher_name, axis_title):
        # Same styled year axis for both publishers; only the title and the
        # publisher filter change.
        styled_x = alt.X('YEAR:N',
                         axis=alt.Axis(values=[1940, 1960, 1980, 2000],
                                       grid=True,
                                       labelAngle=360,
                                       labelFontWeight="bold",
                                       labelFontSize=15,
                                       title=axis_title,
                                       titlePadding=-130,
                                       titleFontSize=12))
        return p1_line_base.encode(styled_x).transform_filter(
            alt.datum.publisher == publisher_name
        )

    line_dc = _lines_for('DC', 'Dc, Female and Male characters over time')
    line_marvel = _lines_for('Marvel', 'Marvel, Female and Male characters over time')

    return (line_dc, line_marvel)

In [12]:
# the top two bar charts (DC and Marvel counts per year)
bar_dc, bar_marvel = genStaticBars(comic)

# and the bottom two line charts (DC and Marvel counts per year, split by SEX)
line_dc, line_marvel = genStaticLines(comic)

In [13]:
# let's make a function that puts it all together
def genIntroVis(b_dc, b_marvel, l_dc, l_marvel):
    """Assemble the four charts into a 2x2 figure.

    The two bar charts form the titled top row and the two line charts form
    the bottom row; each row shares its y scale. The view stroke is set to
    width 0 on the combined chart.

    Parameters
    ----------
    b_dc, b_marvel : bar charts for the top row (left, right).
    l_dc, l_marvel : line charts for the bottom row (left, right).

    Returns
    -------
    The vertically concatenated Altair chart.
    """
    bars_row = alt.hconcat(b_dc, b_marvel).resolve_scale(y='shared')
    bars_row = bars_row.properties(
        title='New Comic Book Characters Introduced Per Year'
    )

    lines_row = alt.hconcat(l_dc, l_marvel).resolve_scale(y='shared')

    stacked = alt.vconcat(bars_row, lines_row)
    return stacked.configure_view(strokeWidth=0)

In [26]:
# and now that we have them, let's put them together
genIntroVis(bar_dc, bar_marvel, line_dc, line_marvel)

In [15]:
# Brush object: an interval selection restricted to the x channel (YEAR).
# Without `encodings`, the interval also projects over y (the aggregated
# count), and that predicate is later used to filter the raw rows behind the
# line charts, which have no aggregated count field to test against.
brush = alt.selection_interval(encodings=['x'])

In [16]:
# Color rule for the DC bars: '#2182bd' where the brush condition holds, gray otherwise.
colorConditionDC = alt.condition(brush,alt.value('#2182bd'),alt.value("gray"))

In [18]:
# Interactive DC bars: attach the brush selection to the static DC bar chart
# and color the bars with the DC brush condition.
i_bar_dc = bar_dc.add_selection(brush).encode(color=colorConditionDC)

In [27]:
# if you did the last step correctly, you should be able to see the selection work for the DC bar chart
i_bar_dc

In [21]:
# Same treatment for Marvel: '#f6573f' where the brush condition holds, gray
# otherwise, with the same brush attached to the Marvel bar chart.
colorConditionMarvel = alt.condition(brush,alt.value('#f6573f'),alt.value("gray"))
i_bar_marvel = bar_marvel.add_selection(brush).encode(color=colorConditionMarvel)

In [28]:
# top piece: the two interactive bar charts side by side, sharing a y scale,
# under a common title
top_charts = alt.hconcat(i_bar_dc,i_bar_marvel).resolve_scale(y='shared'
           ).properties(
                    title='New Comic Book Characters Introduced Per Year'
           )
# display the row on its own to check the brush on both bar charts
top_charts

In [23]:
# Interactive line charts: each one is filtered by the brush selection.
# NOTE(review): the brush is also attached to the line charts themselves, so
# dragging on a line chart changes the same selection that filters it --
# confirm this is intended.
i_line_dc = line_dc.add_selection(brush).transform_filter(brush)

i_line_marvel = line_marvel.add_selection(brush).transform_filter(brush)

In [29]:

genIntroVis(i_bar_dc, i_bar_marvel, i_line_dc, i_line_marvel)

In [30]:
#provided code
def generatePercentTable(comicDF, publisher):
    """Build the per-year female-share table for one publisher.

    Parameters
    ----------
    comicDF : DataFrame of characters with ``publisher``, ``SEX`` and
        ``YEAR`` columns (e.g. ``comic``).
    publisher : str, either 'DC' or 'Marvel'.

    Returns
    -------
    DataFrame with one row per YEAR in 1981..2012 and the columns
    ``YEAR``, ``% Female``, ``SEX_Female Characters``,
    ``SEX_Male Characters``, ``total``, ``publisher``,
    ``Year-over-year change in % Female`` and
    ``% Female characters to date``.

    Raises
    ------
    KeyError if the publisher has no 'Female Characters' or no
    'Male Characters' rows, since those count columns are selected by name.
    """
    pub_rows = comicDF[comicDF.publisher == publisher]

    # One-hot encode SEX so that summing per YEAR gives a count per SEX value.
    counts = pd.get_dummies(pub_rows[['SEX', 'YEAR']])
    counts['YEAR'] = counts['YEAR'].astype('int')
    counts = counts.groupby(['YEAR']).sum()

    # Total = sum of the per-SEX counts; rows whose SEX is missing are not
    # counted because they produce no dummy column.
    sex_cols = ["SEX_" + str(v) for v in pub_rows.SEX.unique() if str(v) != 'nan']
    counts['total'] = counts[sex_cols].sum(axis=1).astype('int')

    counts['% Female'] = counts['SEX_Female Characters'] / counts['total']

    table = counts.reset_index()
    table = table[['YEAR', '% Female', 'SEX_Female Characters', 'SEX_Male Characters', 'total']]
    table['publisher'] = publisher

    # Start at 1979 so the year-over-year change is computed from 1979 on;
    # copy() so the new column is written to an independent frame rather than
    # to a filtered slice.
    table = table[table.YEAR >= 1979].copy()
    table['Year-over-year change in % Female'] = table['% Female'].pct_change()

    toret = table[(table.YEAR > 1980) & (table.YEAR < 2013)].copy()

    # Running totals over the reported window. Only the two count columns are
    # accumulated; a frame-wide cumsum would also run over the string
    # 'publisher' column.
    running = toret[['SEX_Female Characters', 'total']].cumsum()
    toret['% Female characters to date'] = list(running['SEX_Female Characters'] / running['total'])
    return toret



In [31]:
# percent tables for each publisher, stacked into one frame (Marvel rows first)
changedata = pd.concat([generatePercentTable(comic,"Marvel"),generatePercentTable(comic,"DC")])

# melt to long form: YEAR and publisher stay as identifiers, and the three
# measures become rows in the 'variable' / 'value' columns
changedata = pd.melt(changedata,id_vars=['YEAR','publisher'],value_vars=['% Female',
                                                             'Year-over-year change in % Female',
                                                             '% Female characters to date'])

In [32]:
# let's see what's inside
changedata.sample(5)

Unnamed: 0,YEAR,publisher,variable,value
16,1997,Marvel,% Female,0.319298
157,2010,Marvel,% Female characters to date,0.291548
161,1982,DC,% Female characters to date,0.276786
27,2008,Marvel,% Female,0.291908
180,2001,DC,% Female characters to date,0.314685


In [34]:
def generateLineChartP21(changeDF):
    """Line chart of ``value`` by ``YEAR`` with a dropdown choosing the variable.

    Parameters
    ----------
    changeDF : long-form DataFrame with ``YEAR``, ``publisher``,
        ``variable`` and ``value`` columns (formatted like ``changedata``).

    Returns
    -------
    Altair chart with one line per publisher, filtered to the variable picked
    in the dropdown; the first variable found in ``changeDF`` is selected
    initially.
    """
    variable_names = list(changeDF['variable'].unique())

    dropdown = alt.binding_select(options=variable_names)
    variable_selection = alt.selection_single(
        name='Data',
        fields=['variable'],
        bind=dropdown,
        init={'variable': variable_names[0]}
    )

    lines = alt.Chart(changeDF).mark_line().encode(
        x='YEAR:O',
        y='value:Q',
        color='publisher:N'
    )

    # Attach the dropdown and keep only the rows of the chosen variable.
    return lines.add_selection(variable_selection).transform_filter(variable_selection)



In [35]:
#line chart with filter
generateLineChartP21(changedata)

In [36]:
def generateLineChartP22(changeDF):
    """Dropdown-filtered line chart with a hover read-out.

    Parameters
    ----------
    changeDF : long-form DataFrame with ``YEAR``, ``publisher``,
        ``variable`` and ``value`` columns (formatted like ``changedata``).

    Returns
    -------
    Layered Altair chart: the publisher lines, invisible hover targets, a
    point and a percent label at the hovered year, and a gray vertical rule
    at that year. The whole layer is filtered to the variable chosen in the
    dropdown.
    """
    # Hover selection keyed on YEAR; empty="none" means nothing is selected
    # (so no points or labels are shown) until the mouse moves over the chart.
    nearest = alt.selection_single(on='mouseover', nearest=True, fields=['YEAR'], empty="none")

    variable_names = list(changeDF['variable'].unique())
    variable_selection = alt.selection_single(
        name='Data',
        fields=['variable'],
        bind=alt.binding_select(options=variable_names),
        init={'variable': variable_names[0]}
    )

    # Base lines, one per publisher.
    line = alt.Chart(changeDF).mark_line().encode(
        x='YEAR:O',
        y='value:Q',
        color='publisher:N'
    )

    # Fully transparent points across the x axis; they exist only to carry
    # the hover selection.
    selectors = alt.Chart(changeDF).mark_point().encode(
        x='YEAR:O',
        opacity=alt.value(0),
    ).add_selection(nearest)

    # Points on the lines, visible only where the hover selection matches.
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )

    # Percent-formatted value label next to the hovered point, blank
    # elsewhere; dx/dy offset the label from the point.
    text = line.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, alt.Text('value:Q', format='.0%'), alt.value(' '))
    )

    # Gray vertical rule at the hovered year.
    rules = alt.Chart(changeDF).mark_rule(color='gray').encode(
        x='YEAR:O',
    ).transform_filter(nearest)

    layered = alt.layer(line, selectors, points, rules, text)

    # Add the dropdown back so only one variable is displayed at a time.
    return layered.add_selection(variable_selection).transform_filter(variable_selection)


In [37]:
#line chart with mouse over
generateLineChartP22(changedata)

## Additional interactive chart

In [38]:
# Keep only characters with more than 1250 appearances. copy() gives an
# independent frame, so adding a column below does not write to a slice of
# `comic` (the SettingWithCopy hazard, which the global warnings filter hides).
comic_filtered = comic[comic['APPEARANCES'] > 1250].copy()

# Short display name: the text before the first "(" in `name`.
# expand=False returns a Series, which is what a single-column assignment expects.
comic_filtered['Name'] = comic_filtered['name'].str.extract(r'^(.*?)(?:\s*\()', expand=False)

In [39]:
# Use pandas to get the earliest and latest YEAR among the filtered characters.
# NOTE: despite the `appearances_` prefix, these hold YEAR values (the slider
# bounds), not appearance counts.
appearances_min = comic_filtered['YEAR'].min()
appearances_max = comic_filtered['YEAR'].max()
print(appearances_min,appearances_max)

1939.0 1987.0


In [41]:
# Slider widget covering the range of first-appearance years, one year per step.
slider = alt.binding_range(
    min=appearances_min,
    max=appearances_max,
    step=1,
    name="year of first appearances"
)

# Single-value selection driven by the slider; it starts at the latest year.
selector = alt.selection_single(
    bind=slider,
    fields=["year of first appearances"],
    init={"year of first appearances": appearances_max}
)

# Base bar chart: total appearances by year of first appearance, colored by
# character name.
bar_chart = alt.Chart(comic_filtered).mark_bar().encode(
    x=alt.X('YEAR:O', title='Year of First Appearance'),
    y=alt.Y('APPEARANCES:Q', title='Total Appearances'),
    color='Name:N'
).properties(
    height=400,
    width=300
)

# Interactive variant, built in one chain: slider cutoff on YEAR, tooltip
# (name and appearances), title, then title and axis styling.
bar_chart_t = (
    bar_chart
    .add_selection(selector)
    .transform_filter(alt.datum.YEAR <= selector['year of first appearances'])
    .encode(tooltip=['Name:N', 'APPEARANCES:Q'])
    .properties(title='Total Appearances (Minimum 1250)')
    .configure_title(
        fontSize=20,
        font='Helvetica',
        anchor='middle',
        color='black'
    )
    .configure_axis(
        labelFontSize=14,
        titleFontSize=16
    )
)

bar_chart_t

## Chart Notes

The chart adds a slider that sets a cutoff on the year of first appearance. For example, if you set the slider to 1963, you can see which superheroes in the data had appeared by that time. I prefiltered the data to characters with more than 1250 appearances so that the chart would read better. Color encodes the name of the superhero. We can see that 1961-1963 saw a boom in high-appearance superheroes compared with other years. With the slider set to 1960, it is easy to see that only 4 superheroes pass the 1250-appearance cutoff. These are some of the most famous (Batman, Captain America, Thor, and Green Lantern).

## Additional chart below

In [44]:
# Start from the base (non-interactive) bar chart; the calls below build the
# interactive variant under the name bar_chart2.
bar_chart2 = bar_chart

# Radio-button selection on the 'publisher' field; the first publisher found
# in comic_filtered is selected initially.
selectOrigin=alt.selection_single(
    fields=['publisher'],
    init={"publisher":list(comic_filtered['publisher'].unique())[0]},
    # binding_radio renders one radio button per publisher value
    bind=alt.binding_radio(options=list(comic_filtered['publisher'].unique()),name="publisher"),
    name="publisher"
)

# Bars matching the radio selection are colored by publisher; the rest are
# drawn in light gray.
bar_chart2 = bar_chart2.add_selection(
    selectOrigin
).encode(
    color=alt.condition(selectOrigin,alt.Color("publisher:N"),alt.value('lightgray'))
)

# Also attach the year slider and keep only the rows whose YEAR is at or
# before the slider value.
bar_chart2 = bar_chart2.add_selection(
        selector
).transform_filter(alt.datum.YEAR <= selector['year of first appearances'])

In [45]:
# Show the base bar chart (left) next to the radio/slider variant (right)
# under one title, with the same title and axis styling as the single chart.
alt.hconcat(bar_chart, bar_chart2).properties(title='Total Appearances (Minimum 1250)').configure_title(
    fontSize=20,  
    font='Helvetica',  
    anchor='middle',  
    color='black'  
).configure_axis(
    labelFontSize=14,  
    titleFontSize=16 
)

## Chart Notes

This additional version adds a radio button. There are likely additional ways to perfect this display. Here, color is used in both charts. The radio button adjusts the selection on the right bar chart. We can see that Marvel had a period of high-appearance heroes from 1960 to 1975 - perhaps a golden age for Marvel comic books.