In [1]:
# Import all the things

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ipywidgets import interactive
from IPython.display import display
import ipywidgets as widgets
from ipywidgets import interact, widgets


# %matplotlib inline

In [2]:
# Load the workshops table from workshops.csv (path is relative to the working directory)

workshops = pd.read_csv('workshops.csv')

## Preview the whole dataframe

In [3]:
# Preview data: as the last expression in the cell, the frame is rendered as the cell's output

workshops

Unnamed: 0,slug,tag_name,yr,attendance,fullname,administrator_id
0,2011-11-07-toronto,SWC,2011,22.0,University of Toronto,297.0
1,2012-01-18-stsci,SWC,2012,14.0,Space Telescope Science Institute,173.0
2,2012-02-20-itcp,SWC,2012,50.0,International Centre for Theoretical Physics,306.0
3,2012-02-23-toronto,SWC,2012,28.0,University of Toronto,297.0
4,2012-03-07-indiana,SWC,2012,39.0,Indiana University,173.0
5,2012-03-26-mbari,SWC,2012,38.0,Monterey Bay Aquarium Research Institute,173.0
6,2012-03-28-nersc,SWC,2012,35.0,NERSC,173.0
7,2012-04-02-chicago,SWC,2012,35.0,University of Chicago,173.0
8,2012-04-14-utahstate,SWC,2012,32.0,Utah State University,173.0
9,2012-04-30-ucl,SWC,2012,44.0,University College London,278.0


In [4]:
def workshops_by_year(data, year):
    """Read the workshops CSV and keep only the rows for a single year.

    Parameters
    ----------
    data : str or file-like
        Passed straight to ``pd.read_csv`` (the notebook uses 'workshops.csv').
    year : int
        Value compared against the 'yr' column.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'yr' equals ``year``, with their original index labels.
    """
    all_workshops = pd.read_csv(data)
    in_year = all_workshops['yr'].eq(year)
    return all_workshops.loc[in_year]



## Select a year to see a dataframe of all workshops (across Carpentries) that year

In [5]:
# interact() takes the function first, then one keyword per argument of that function.
# widgets.fixed() hands `data` over as a constant; a bare string would be turned into an
# editable text box, and editing it would re-run the function with a partial file path.
interact(workshops_by_year, data = widgets.fixed('workshops.csv'), year = range(2011, 2018))

A Jupyter Widget

<function __main__.workshops_by_year>

In [6]:
def workshops_by_carpentry(data, carpentry):
    """Read the workshops CSV and keep only the rows tagged with one Carpentry.

    Parameters
    ----------
    data : str or file-like
        Passed straight to ``pd.read_csv`` (the notebook uses 'workshops.csv').
    carpentry : str
        Pattern searched for inside the 'tag_name' column, e.g. 'SWC', 'DC' or 'LC'.
        It is handed to ``Series.str.contains`` and therefore treated as a regular
        expression, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'tag_name' contains ``carpentry``. Rows without a tag are
        never returned.
    """
    df = pd.read_csv(data)
    # na=False counts a missing tag as "no match". Without it the mask can contain NaN,
    # and boolean indexing with such a mask raises instead of filtering.
    has_tag = df['tag_name'].str.contains(carpentry, na=False)
    return df[has_tag]

## Select a Carpentry to see a dataframe of all workshops for that Carpentry (across all years)

In [7]:
# widgets.fixed() passes the CSV path through as a constant instead of rendering an
# editable text box for it; only the Carpentry dropdown is shown.
interact(workshops_by_carpentry, data = widgets.fixed('workshops.csv'), carpentry = ['SWC', 'DC', 'LC'])

A Jupyter Widget

<function __main__.workshops_by_carpentry>

In [8]:
def attendance_by_carpentry_and_year(data, carpentry):
    """Sum attendance per year for one Carpentry and draw it as a labelled bar chart.

    Parameters
    ----------
    data : str or file-like
        Passed straight to ``pd.read_csv`` (the notebook uses 'workshops.csv').
    carpentry : str
        Pattern searched for inside the 'tag_name' column, e.g. 'SWC', 'DC' or 'LC'.

    Returns
    -------
    pandas.Series
        Attendance totals indexed by 'yr'. Missing attendance values are skipped by
        the sum. The series is empty (and no chart is drawn) when no workshop matches.
    """
    df = pd.read_csv(data)
    # na=False: a workshop without a tag simply does not match.
    df = df[df['tag_name'].str.contains(carpentry, na=False)]
    attendance_by_year = df.groupby('yr')['attendance'].sum()

    if attendance_by_year.empty:
        # Nothing matched: there is no maximum to derive the label offset from and
        # nothing to draw, so hand back the empty totals.
        return attendance_by_year

    # Lift each value label slightly above its bar (2.5% of the tallest bar).
    offset = attendance_by_year.max() * .025
    ax = attendance_by_year.plot(kind='bar')
    for position, total in enumerate(attendance_by_year):
        # Bars of a pandas bar chart sit at x = 0, 1, 2, ...; centre the label on the bar.
        ax.text(position, total + offset, str(int(total)), ha='center')
    ax.set_xlabel('Year')
    ax.set_ylabel('Attendance')
    ax.set_title('{} attendance by year'.format(carpentry))
    plt.show()

    return attendance_by_year

## Select a Carpentry to see a bar chart of attendance by year for that Carpentry

This does not account for workshops with missing attendance values.

In [9]:
# widgets.fixed() passes the CSV path through as a constant instead of rendering an
# editable text box for it; only the Carpentry dropdown is shown.
interact(attendance_by_carpentry_and_year, data = widgets.fixed('workshops.csv'), carpentry = ['SWC', 'DC', 'LC'])

A Jupyter Widget

<function __main__.attendance_by_carpentry_and_year>

In [10]:
def workshops_by_carpentry_and_year(data, carpentry="All", yr="All", stacked = True, agg="workshop count"):
    """Aggregate workshops by year, Carpentry and organizer type, and chart the result.

    Parameters
    ----------
    data : str or file-like
        Passed straight to ``pd.read_csv`` (the notebook uses 'workshops.csv').
    carpentry : str, default "All"
        "All" keeps every workshop; otherwise a pattern searched for in 'tag_name'
        after the tags have been collapsed to SWC / DC / LC / TTT.
    yr : int or "All", default "All"
        "All" keeps every year; otherwise only rows whose 'yr' equals this value.
    stacked : bool, default True
        Forwarded to the bar plot: stacked bars when True, side by side when False.
    agg : {"workshop count", "attendance sum"}, default "workshop count"
        "workshop count" counts 'slug' values per group; "attendance sum" sums
        'attendance' per group (missing attendance is skipped by the sum).

    Returns
    -------
    pandas.DataFrame
        One row per (yr, tag_name, administrator_id) group, where administrator_id
        is 'self' or 'centrally'. Empty (and no chart is drawn) when nothing matches.

    Raises
    ------
    ValueError
        If ``agg`` is not one of the two supported values.
    """
    if agg not in ("workshop count", "attendance sum"):
        raise ValueError(
            'agg must be "workshop count" or "attendance sum", got {!r}'.format(agg)
        )

    df = pd.read_csv(data)

    # Administrator 343 marks a self-organized workshop; every other value, including
    # a missing one, counts as centrally organized. The label column is built in one
    # step because writing strings into the numeric column row by row is an
    # incompatible-dtype assignment that recent pandas versions deprecate.
    df['administrator_id'] = df['administrator_id'].eq(343).map({True: 'self', False: 'centrally'})

    # Collapse composite tags to a single Carpentry label. The order matters: once a
    # row has been relabelled, it no longer matches the later patterns.
    for tag in ("SWC", "DC", "LC", "TTT"):
        df.loc[df["tag_name"].str.contains(tag, na=False), 'tag_name'] = tag

    if carpentry != "All":
        df = df[df['tag_name'].str.contains(carpentry, na=False)]
    if yr != "All":
        df = df[df['yr'] == yr]

    groups = df.groupby(['yr', 'tag_name', 'administrator_id'])
    if agg == "workshop count":
        totals = groups['slug'].count()
    else:
        totals = groups['attendance'].sum()

    attendance_by_year = totals.to_frame()
    if attendance_by_year.empty:
        # No workshop survived the filters, so there is nothing to chart.
        return attendance_by_year

    # Unstacking the series (not the one-column frame) yields plain 'centrally' / 'self'
    # columns, which makes the legend entries readable.
    ax = totals.unstack().plot(kind='bar', stacked=stacked)
    ax.set_ylabel(agg)
    ax.legend(title='Organized')

    return attendance_by_year


## Select a Carpentry and year to see a bar chart of workshop count OR attendance total for that Carpentry/year, together with the chart's source dataframe, split into self-organized and centrally organized workshops.  Select "stacked" to toggle between the stacked view and the side-by-side view.

This does not account for workshops with missing attendance values. See here for how to fill NA values with means:
https://chrisalbon.com/python/data_wrangling/pandas_missing_data/

Need to figure out how to make the legends and labels more human readable.

In [11]:
# widgets.fixed() passes the CSV path through as a constant instead of rendering an
# editable text box for it; the remaining keywords each become a dropdown.
interact(
    workshops_by_carpentry_and_year,
    data = widgets.fixed('workshops.csv'),
    stacked = [True, False],
    carpentry = ['All', 'SWC', 'DC', 'LC', 'TTT'],
    yr = ["All"] + list(range(2011, 2019)),
    agg = ["attendance sum", "workshop count"],
)

A Jupyter Widget

<function __main__.workshops_by_carpentry_and_year>