In [34]:
import pandas as pd
import numpy as np

df_cigarette = pd.read_csv('cigarette.csv', index_col = 0)

In [35]:
dict_states = {
    'AL': ['Alabama', '亚拉巴马州'],
    'AK': ['Alaska', '阿拉斯加州'],
    'AZ': ['Arizona', '亚利桑那州'],
    'AR': ['Arkansas', '阿肯色州'],
    'CA': ['California', '加利福尼亚州'],
    'CO': ['Colorado', '科罗拉多州'],
    'CT': ['Connecticut', '康涅狄格州'],
    'DE': ['Delaware', '特拉华州'],
    'FL': ['Florida', '佛罗里达州'],
    'GA': ['Georgia', '佐治亚州'],
    'HI': ['Hawaii', '夏威夷州'],
    'ID': ['Idaho', '爱达荷州'],
    'IL': ['Illinois', '伊利诺伊州'],
    'IN': ['Indiana', '印第安纳州'],
    'IA': ['Iowa', '艾奥瓦州'],
    'KS': ['Kansas', '堪萨斯州'],
    'KY': ['Kentucky', '肯塔基州'],
    'LA': ['Louisiana', '路易斯安那州'],
    'ME': ['Maine', '缅因州'],
    'MD': ['Maryland', '马里兰州'],
    'MA': ['Massachusetts', '马萨诸塞州'],
    'MI': ['Michigan', '密歇根州'],
    'MN': ['Minnesota', '明尼苏达州'],
    'MS': ['Mississippi', '密西西比州'],
    'MO': ['Missouri', '密苏里州'],
    'MT': ['Montana', '蒙大拿州'],
    'NE': ['Nebraska', '内布拉斯加州'],
    'NV': ['Nevada', '内华达州'],
    'NH': ['New Hampshire', '新罕布什尔州'],
    'NJ': ['New Jersey', '新泽西州'],
    'NM': ['New Mexico', '新墨西哥州'],
    'NY': ['New York', '纽约州'],
    'NC': ['North Carolina', '北卡罗来纳州'],
    'ND': ['North Dakota', '北达科他州'],
    'OH': ['Ohio', '俄亥俄州'],
    'OK': ['Oklahoma', '俄克拉何马州'],
    'OR': ['Oregon', '俄勒冈州'],
    'PA': ['Pennsylvania', '宾夕法尼亚州'],
    'RI': ['Rhode Island', '罗得岛州'],
    'SC': ['South Carolina', '南卡罗来纳州'],
    'SD': ['South Dakota', '南达科他州'],
    'TN': ['Tennessee', '田纳西州'],
    'TX': ['Texas', '得克萨斯州'],
    'UT': ['Utah', '犹他州'],
    'VT': ['Vermont', '佛蒙特州'],
    'VA': ['Virginia', '弗吉尼亚州'],
    'WA': ['Washington', '华盛顿州'],
    'WV': ['West Virginia', '西弗吉尼亚州'],
    'WI': ['Wisconsin', '威斯康星州'],
    'WY': ['Wyoming', '怀俄明州']
}

In [36]:
df_cigarette['state_fn'] = df_cigarette['state'].apply(lambda x: dict_states[x][0])
df_cigarette['year_str'] = df_cigarette['year'].astype(str)

In [37]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook
from bokeh.models import ColumnDataSource
output_notebook()

In [38]:
def generate_box_plot_data(df, category, column):
    groups = df[[column, category]].groupby(by=category, sort=False)
    cats = []
    q1 = groups.quantile(q=0.25)
    q2 = groups.quantile(q=0.5)
    q3 = groups.quantile(q=0.75)
    iqr = q3-q1
    upper = q3+1.5*iqr
    lower = q1-1.5*iqr
    def outliers(group):
        cats.append(cat)
        return group[(group[column] > upper.loc[cat][column]) | (group[column] < lower.loc[cat][column])][column]
    out = groups.apply(outliers).dropna()

    if not out.empty:
        out_index = []
        out_category = []
        out_value = []
        for keys in out.index:
            out_index.append(keys[1])
            out_category.append(keys[0])
            out_value.append(out.loc[keys[0]].loc[keys[1]])

    qmin = groups.quantile(q=0.00)
    qmax = groups.quantile(q=1.00)

    upper[column] = [min([x,y]) for (x,y) in zip(list(qmax.loc[:,column]),upper[column])]
    lower[column] = [max([x,y]) for (x,y) in zip(list(qmin.loc[:,column]),lower[column])]

    data_box_plot = {
        'category': cats,
        'qmin': qmin[column].tolist(),
        'q1' : q1[column].tolist(),
        'q2' : q2[column].tolist(),
        'q3' : q3[column].tolist(),
        'iqr': iqr[column].tolist(),
        'qmax': qmax[column].tolist(),
        'upper': upper[column].tolist(),
        'lower': lower[column].tolist()
    }

    data_box_plot_outlier = {
        'index': out_index,
        'category' : out_category,
        'value': out_value
    }
    
    return data_box_plot, data_box_plot_outlier

def bokeh_box_plot(p, data_box_plot, data_box_plot_out, all_dot=False):
    box, out = data_box_plot, data_box_plot_out
    box = ColumnDataSource(data=box)

    p.segment(x0='upper', y0='category', x1='q3', y1='category', line_color='black', source=box)
    p.segment(x0='lower', y0='category', x1='q1', y1='category', line_color='black', source=box)

    p.hbar(y='category', height=0.7, right='q2', left='q3', fill_color='navy', alpha=0.5, line_color='black', source=box)
    p.hbar(y='category', height=0.7, right='q1', left='q2', fill_color='navy', alpha=0.3, line_color='black', source=box)

    p.circle(x='value', y='category', size=6, color="#F38630", fill_alpha=1, source=out)
    
    return box

In [39]:
class box_plot_app_by_year():
    def __init__(self, df):
        self.df = df.copy()
        self.df['year_str'] = self.df['year'].astype(str)
        self.df.sort_values(by='year', inplace=True, ascending=False)
        self.data_box_plot, self.data_box_plot_out = generate_box_plot_data(df_cigarette, 'year_str', 'packpc')
        self.data_box_plot['category'] = [int(d) for d in self.data_box_plot['category']]
        self.data_box_plot_out['category'] = [int(d) for d in self.data_box_plot_out['category']]
        self.data_box_plot_out['text'] = self.df.loc[self.data_box_plot_out['index'], 'state'].tolist()

        p = figure(title='美国香烟消费 1985～1995', tools="save", 
                   # background_fill_color="#efefef", 
                   x_range=[45, 205], 
                   y_range=[1996, 1984],
                   plot_width=800, plot_height=450, 
                   toolbar_location='right', )
        self.data_source = bokeh_box_plot(p, self.data_box_plot, self.data_box_plot_out, all_dot=False)
        p.yaxis.axis_label = '<--- 年 --->'
        p.xaxis.axis_label = '香烟消费（平均每人每年的包数）'
        p.title.text_font_size = '16pt'
        p.axis.axis_label_text_font_size = "12pt"
        p.axis.major_label_text_font_size = "10pt"
        p.xaxis.ticker.min_interval = 50
        p.xaxis.ticker.num_minor_ticks = 5
        p.yaxis.ticker = [_ for _ in range(1995, 1984, -1)]
        p.text(x='value', y='category', text='text',
               text_font_size='10px', text_align='center', text_baseline='top', y_offset=5, 
               source=self.data_box_plot_out)
        p.line(x=[100, 100], y=['1996', '1984'], line_color='red', line_dash='dashed')
        self.plot_handler = show(p, notebook_handle=True)
        self.figure = p
        
box_plot_app_by_year(df_cigarette)

NameError: name 'cat' is not defined