# Altair. 

https://altair-viz.github.io/gallery/index.html  


https://matthewkudija.com/blog/2018/06/22/altair-interactive/#building-interactive-altair-charts

https://vallandingham.me/altair_intro.html



In [1]:
import altair as alt
import numpy as np
import pandas as pd

# Evaluate the squared distance from the origin, x^2 + y^2, at every point
# of a 10x10 integer lattice spanning -5..4 on both axes.
x, y = np.meshgrid(np.arange(-5, 5), np.arange(-5, 5))
z = np.square(x) + np.square(y)

# Altair expects long-form data (one row per observation), so flatten each
# 2-D array into a single column.
source = pd.DataFrame(
    {label: grid.reshape(-1) for label, grid in zip('xyz', (x, y, z))}
)

alt.Chart(source).mark_rect().encode(
    x='x:O',
    y='y:O',
    color='z:Q'
)

In [2]:
import altair as alt
from vega_datasets import data

source = data.movies.url

alt.Chart(source).mark_bar().encode(
    alt.X("IMDB_Rating:Q", bin=True),
    y='count()',
).interactive()

In [3]:
import altair as alt

# load a simple dataset as a pandas DataFrame
from vega_datasets import data
cars = data.cars()

alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
).interactive()

In [4]:
import altair as alt
from vega_datasets import data

source = data.movies.url

alt.Chart(source).mark_circle().encode(
    alt.X('IMDB_Rating:Q', bin=True),
    alt.Y('Rotten_Tomatoes_Rating:Q', bin=True),
    size='count()'
).interactive()


<a id="Simple_strip_plot"></a>
# Simple_strip_plot

[Simple_strip_plot](#Simple_strip_plot) 

'Simple_strip_plot'.replace(' ','_')
'Simple_strip_plot'.replace('_',' ').title()

[Top](#Top) &nbsp; [Bottom](#Bottom) 


    

In [5]:
import altair as alt
from vega_datasets import data

source = data.cars()

alt.Chart(source).mark_tick().encode(
    x='Horsepower:Q',
    y='Cylinders:O'
)


In [6]:
# Probe the type of the year component of the first row's 'Year' value.
# The result is discarded: only the last expression of a cell is displayed.
type(source.loc[0, 'Year'].year)

# Add a plain year column derived from the date-like 'Year' column.
source['Year_int'] = source['Year'].map(lambda stamp: stamp.year)

# Shorten the fuel-economy column name. This mutates `source` in place, so
# 'Miles_per_Gallon' is no longer available on `source` afterwards.
source.rename(columns={'Miles_per_Gallon': 'MPG'}, inplace=True)

source.head()

Unnamed: 0,Name,MPG,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin,Year_int
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA,1970
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA,1970
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA,1970
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA,1970
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA,1970


In [7]:
from tabulate import _table_formats, tabulate
print(tabulate(source[['Name', 'MPG', 'Cylinders', 'Horsepower',
       'Weight_in_lbs', 'Acceleration', 'Year_int']].head(5),headers='keys'))   

    Name                         MPG    Cylinders    Horsepower    Weight_in_lbs    Acceleration    Year_int
--  -------------------------  -----  -----------  ------------  ---------------  --------------  ----------
 0  chevrolet chevelle malibu     18            8           130             3504            12          1970
 1  buick skylark 320             15            8           165             3693            11.5        1970
 2  plymouth satellite            18            8           150             3436            11          1970
 3  amc rebel sst                 16            8           150             3433            12          1970
 4  ford torino                   17            8           140             3449            10.5        1970


In [8]:
cars.head()

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA


In [9]:
alt.Chart(source).mark_tick().encode(
    x='Horsepower:Q',
    y='Cylinders:O'
)



In [10]:
cars.head()

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA


In [11]:
import altair as alt

# load a simple dataset as a pandas DataFrame
from vega_datasets import data
cars = data.cars()

alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
).interactive()

# Weather

In [12]:
from vega_datasets import data

df = data.seattle_weather()
df.head()


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [13]:
alt.Chart(df).mark_tick().encode(
    x='precipitation',
)


In [14]:
alt.Chart(df).mark_bar().encode(
    alt.X('precipitation', bin=True),
    y='count()'
)

In [15]:
alt.Chart(df).mark_line().encode(
    x='month(date):T',
    y='average(precipitation)'
)


In [16]:
alt.Chart(df).mark_line().encode(
    x='yearmonth(date):T',
    y='max(temp_max)',
)

In [17]:
alt.Chart(df).mark_line().encode(
    x='year(date):T',
    y='mean(temp_max)',
)

In [18]:
alt.Chart(df).mark_bar().encode(
    x='mean(temp_max)',
    y='year(date):O'
)

In [19]:
alt.Chart(df).mark_bar().encode(
    x='mean(temp_range):Q',
    y='year(date):O'
).transform_calculate(
    temp_range="datum.temp_max - datum.temp_min"
)


In [20]:
alt.Chart(df).mark_bar().encode(
    x='month(date):N',
    y='count()',
    color='weather',
)

In [21]:
scale = alt.Scale(domain=['sun', 'fog', 'drizzle', 'rain', 'snow'],
                  range=['#e7ba52', '#c7c7c7', '#aec7e8', '#1f77b4', '#9467bd'])


In [22]:
alt.Chart(df).mark_bar().encode(
    x=alt.X('month(date):N', title='Month of the year'),
    y='count()',
    color=alt.Color('weather', legend=alt.Legend(title='Weather type'), scale=scale),
)


In [23]:
alt.Chart(df).mark_point().encode(
    alt.X('temp_max', title='Maximum Daily Temperature (C)'),
    alt.Y('temp_range:Q', title='Daily Temperature Range (C)'),
    alt.Color('weather', scale=scale),
    alt.Size('precipitation', scale=alt.Scale(range=[1, 200]))
).transform_calculate(
    "temp_range", "datum.temp_max - datum.temp_min"
).properties(
    width=600,
    height=400
).interactive()


In [24]:
alt.Chart(df).mark_bar().encode(
    x='count()',
    y='weather:N',
    color=alt.Color('weather:N', scale=scale),
)


And now we can vertically concatenate this histogram to the points plot above, and add a brush selection tool such that the histogram reflects the content of the selection (for more information on selections, see Bindings, Selections, Conditions: Making Charts Interactive):  

https://altair-viz.github.io/user_guide/interactions.html#user-guide-interactions



In [25]:
# Interval ("brush") selection: a rectangle dragged on the chart it is
# attached to. It is attached to `points` and consumed by `bars` below.
brush = alt.selection(type='interval')

# Scatter plot of daily maximum temperature against daily temperature range.
# No data is passed to alt.Chart() here; both charts receive `df` from the
# alt.vconcat call at the end of the cell.
points = alt.Chart().mark_point().encode(
    alt.X('temp_max:Q', title='Maximum Daily Temperature (C)'),
    alt.Y('temp_range:Q', title='Daily Temperature Range (C)'),
    # Points inside the brush are coloured by weather type (using `scale`,
    # defined in an earlier cell); everything else is drawn light gray.
    color=alt.condition(brush, 'weather:N', alt.value('lightgray'), scale=scale),
    size=alt.Size('precipitation:Q', scale=alt.Scale(range=[1, 200]))
).transform_calculate(
    # temp_range is computed per row from the two temperature columns.
    "temp_range", "datum.temp_max - datum.temp_min"
).properties(
    width=600,
    height=400
).add_selection(
    brush
)

# Horizontal bar chart counting rows per weather type, restricted to the
# rows currently selected by the brush.
bars = alt.Chart().mark_bar().encode(
    x='count()',
    y='weather:N',
    color=alt.Color('weather:N', scale=scale),
).transform_calculate(
    # NOTE(review): temp_range is not encoded by this chart; it is presumably
    # recomputed here so the brush filter below, which selects on the scatter
    # plot's temp_max/temp_range axes, can test it — confirm before removing.
    "temp_range", "datum.temp_max - datum.temp_min"
).transform_filter(
    brush
).properties(
    width=600
)

# Stack the scatter plot above the bar chart and bind the shared data once.
alt.vconcat(points, bars, data=df)


### Multiple Interactions

This example shows how several kinds of user input can each be bound to a chart. It builds four small charts, one per input:

Slider: Filters the movies by release year  
Dropdown: Filters the movies by genre  
Radio Buttons: Highlights the films that have the selected MPAA rating  
Checkbox: Switches point size between a fixed size and sizing by whether the production budget exceeds $100 million  


https://altair-viz.github.io/gallery/multiple_interactions.html?highlight=dropdown

In [26]:
import altair as alt
from vega_datasets import data

movies = alt.UrlData(
    data.movies.url,
    format=alt.DataFormat(parse={"Release_Date":"date"})
)
ratings = ['G', 'NC-17', 'PG', 'PG-13', 'R']
genres = ['Action', 'Adventure', 'Black Comedy', 'Comedy',
       'Concert/Performance', 'Documentary', 'Drama', 'Horror', 'Musical',
       'Romantic Comedy', 'Thriller/Suspense', 'Western']

base = alt.Chart(movies, width=200, height=200).mark_point(filled=True).transform_calculate(
    Rounded_IMDB_Rating = "floor(datum.IMDB_Rating)",
    Hundred_Million_Production =  "datum.Production_Budget > 100000000.0 ? 100 : 10",
    Release_Year = "year(datum.Release_Date)"
).transform_filter(
    alt.datum.IMDB_Rating > 0
).transform_filter(
    alt.FieldOneOfPredicate(field='MPAA_Rating', oneOf=ratings)
).encode(
    x=alt.X('Worldwide_Gross:Q', scale=alt.Scale(domain=(100000,10**9), clamp=True)),
    y='IMDB_Rating:Q',
    tooltip="Title:N"
)

# A slider filter
year_slider = alt.binding_range(min=1969, max=2018, step=1)
slider_selection = alt.selection_single(bind=year_slider, fields=['Release_Year'], name="Release Year_")


filter_year = base.add_selection(
    slider_selection
).transform_filter(
    slider_selection
).properties(title="Slider Filtering")

# A dropdown filter
genre_dropdown = alt.binding_select(options=genres)
genre_select = alt.selection_single(fields=['Major_Genre'], bind=genre_dropdown, name="Genre")

filter_genres = base.add_selection(
    genre_select
).transform_filter(
    genre_select
).properties(title="Dropdown Filtering")

#color changing marks
rating_radio = alt.binding_radio(options=ratings)

rating_select = alt.selection_single(fields=['MPAA_Rating'], bind=rating_radio, name="Rating")
rating_color_condition = alt.condition(rating_select,
                      alt.Color('MPAA_Rating:N', legend=None),
                      alt.value('lightgray'))

highlight_ratings = base.add_selection(
    rating_select
).encode(
    color=rating_color_condition
).properties(title="Radio Button Highlighting")

# Boolean selection for format changes
input_checkbox = alt.binding_checkbox()
checkbox_selection = alt.selection_single(bind=input_checkbox, name="Big Budget Films")

size_checkbox_condition = alt.condition(checkbox_selection,
                                        alt.SizeValue(25),
                                        alt.Size('Hundred_Million_Production:Q')
                                       )

budget_sizing = base.add_selection(
    checkbox_selection
).encode(
    size=size_checkbox_condition
).properties(title="Checkbox Formatting")

( filter_year | filter_genres) &  (highlight_ratings | budget_sizing  )


In [27]:
import altair as alt
from vega_datasets import data

movies = alt.UrlData(
    data.movies.url,
    format=alt.DataFormat(parse={"Release_Date":"date"})
)
ratings = ['G', 'NC-17', 'PG', 'PG-13', 'R']
genres = ['Action', 'Adventure', 'Black Comedy', 'Comedy',
       'Concert/Performance', 'Documentary', 'Drama', 'Horror', 'Musical',
       'Romantic Comedy', 'Thriller/Suspense', 'Western']

base = alt.Chart(movies, width=200, height=200).mark_point(filled=True).transform_calculate(
    Rounded_IMDB_Rating = "floor(datum.IMDB_Rating)",
    Hundred_Million_Production =  "datum.Production_Budget > 100000000.0 ? 100 : 10",
    Release_Year = "year(datum.Release_Date)"
).transform_filter(
    alt.datum.IMDB_Rating > 0
).transform_filter(
    alt.FieldOneOfPredicate(field='MPAA_Rating', oneOf=ratings)
).encode(
    x=alt.X('Worldwide_Gross:Q', scale=alt.Scale(domain=(100000,10**9), clamp=True)),
    y='IMDB_Rating:Q',
    tooltip="Title:N"
)

# A slider filter
year_slider = alt.binding_range(min=1969, max=2018, step=1)
slider_selection = alt.selection_single(bind=year_slider, fields=['Release_Year'], name="Release Year_")


filter_year = base.add_selection(
    slider_selection
).transform_filter(
    slider_selection
).properties(title="Slider Filtering")

# A dropdown filter
genre_dropdown = alt.binding_select(options=genres)
genre_select = alt.selection_single(fields=['Major_Genre'], bind=genre_dropdown, name="Genre")

filter_genres = base.add_selection(
    genre_select
).transform_filter(
    genre_select
).properties(title="Dropdown Filtering")

#color changing marks
rating_radio = alt.binding_radio(options=ratings)

rating_select = alt.selection_single(fields=['MPAA_Rating'], bind=rating_radio, name="Rating")
rating_color_condition = alt.condition(rating_select,
                      alt.Color('MPAA_Rating:N', legend=None),
                      alt.value('lightgray'))

highlight_ratings = base.add_selection(
    rating_select
).encode(
    color=rating_color_condition
).properties(title="Radio Button Highlighting")

# Boolean selection for format changes
input_checkbox = alt.binding_checkbox()
checkbox_selection = alt.selection_single(bind=input_checkbox, name="Big Budget Films")

size_checkbox_condition = alt.condition(checkbox_selection,
                                        alt.SizeValue(25),
                                        alt.Size('Hundred_Million_Production:Q')
                                       )

budget_sizing = base.add_selection(
    checkbox_selection
).encode(
    size=size_checkbox_condition
).properties(title="Checkbox Formatting")

( filter_year | filter_genres) &  (highlight_ratings | budget_sizing  )


# For CMS Project

In [28]:
import altair as alt
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['date'], empty='none')

# The basic line
line = alt.Chart().mark_line(interpolate='basis').encode(
    alt.X('date:T', axis=alt.Axis(title='')),
    alt.Y('price:Q', axis=alt.Axis(title='',format='$f')),
    color='symbol:N'
)

# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart().mark_point().encode(
    x='date:T',
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Draw points on the line, and highlight based on selection
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'price:Q', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart().mark_rule(color='gray').encode(
    x='date:T',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
stockChart = alt.layer(line, selectors, points, rules, text,
                       data='https://raw.githubusercontent.com/altair-viz/vega_datasets/master/vega_datasets/_data/stocks.csv', 
                       width=600, height=300,title='Stock History')
#stockChart.save('stocks.html')


In [29]:
filename='https://raw.githubusercontent.com/altair-viz/vega_datasets/master/vega_datasets/_data/stocks.csv' 
df = pd.read_csv(filename)
df.head()

Unnamed: 0,symbol,date,price
0,MSFT,Jan 1 2000,39.81
1,MSFT,Feb 1 2000,36.35
2,MSFT,Mar 1 2000,43.22
3,MSFT,Apr 1 2000,28.37
4,MSFT,May 1 2000,25.45


In [30]:
filename = 'Provider Sites - DRG Descriptions Cleanedx.csv'
df = pd.read_csv(filename)
df = df[df.columns[1:]]
df.head()

Unnamed: 0,id_num,drg,name,street,city,state,zipcode,region,discharges,avg_charges,total_payment,medicare_payment,year,drg3,disc_times_pay,discharge_rank,payment_rank
0,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3990,63444,20097,14168,2015,470,56530320,1,1
1,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3855,59648,20574,14221,2014,470,54821955,1,1
2,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3680,56357,19266,14127,2013,470,51987360,1,1
3,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3657,55217,18814,14885,2012,470,54434445,1,1
4,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3383,53113,19023,14880,2011,470,50339040,1,1


# Cleaned up File

In [31]:
df.to_csv('Practice File CMS data.csv',index=False)



In [32]:
filename = 'Practice File CMS data.csv'

df = pd.read_csv(filename)
df.head()



Unnamed: 0,id_num,drg,name,street,city,state,zipcode,region,discharges,avg_charges,total_payment,medicare_payment,year,drg3,disc_times_pay,discharge_rank,payment_rank
0,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3990,63444,20097,14168,2015,470,56530320,1,1
1,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3855,59648,20574,14221,2014,470,54821955,1,1
2,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3680,56357,19266,14127,2013,470,51987360,1,1
3,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3657,55217,18814,14885,2012,470,54434445,1,1
4,330270,470 - MAJOR JOINT REPLACEMENT OR REATTACHMENT ...,HOSPITAL FOR SPECIAL SURGERY,535 EAST 70TH STREET,NEW YORK,NY,10021,NY - Manhattan,3383,53113,19023,14880,2011,470,50339040,1,1


In [33]:
df.columns

Index(['id_num', 'drg', 'name', 'street', 'city', 'state', 'zipcode', 'region',
       'discharges', 'avg_charges', 'total_payment', 'medicare_payment',
       'year', 'drg3', 'disc_times_pay', 'discharge_rank', 'payment_rank'],
      dtype='object')

In [34]:
df_small = df[['id_num','year','discharges','drg3','disc_times_pay']]

In [35]:
# Keep only the rows whose drg3 code is 870, 871 or 872 and renumber them
# from 0.
# A boolean mask is used instead of feeding np.where(...) positions to the
# label-based .loc indexer: positions and labels only coincide while the
# index is the default 0..n-1 range, so the mask is the safe form.
df1 = df_small[df_small['drg3'].isin([870, 871, 872])].reset_index(drop=True)
#df_small.loc[np.where(df_small['id_num'].isin([50030,100007]))]
df1.shape

(38592, 5)

In [213]:
df1.tail()

Unnamed: 0,id_num,year,discharges,drg3,disc_times_pay
38587,260176,2016,11,872,49874
38588,150181,2016,11,872,46552
38589,10146,2016,11,872,45529
38590,150057,2016,11,872,43780
38591,250069,2016,11,872,43395


In [214]:
# Rank the rows of the final year by discharge count, largest first, and
# keep the ids of the ten leading rows.
# `final_year` must already be defined when this cell runs.
top = list(
    df1.loc[df1['year'] == final_year]
    .sort_values('discharges', ascending=False)['id_num']
    .values
)[:10]

top

[100007, 50625, 50030, 70022, 440049, 370091, 220074, 450184, 330106, 170122]

In [215]:

# Restrict df1 to the providers whose id_num appears in `top` and renumber
# the rows from 0.
# A boolean mask replaces the earlier np.where(...)-into-.loc form, which
# passed positional indices to a label-based indexer and was therefore only
# correct on a default 0..n-1 index.
df1 = df1[df1['id_num'].isin(top)].reset_index(drop=True)
df1.tail()

Unnamed: 0,id_num,year,discharges,drg3,disc_times_pay
174,220074,2016,271,872,1907298
175,450184,2016,278,872,1655768
176,440049,2016,272,872,1639888
177,170122,2016,198,872,1087218
178,370091,2016,191,872,1012300


In [216]:
# Store the provider id as text (presumably so the chart in the next cell
# treats it as a category rather than a number — confirm).
df1['id_num'] = df1['id_num'].astype(str)

# Give the plotted columns display-friendly names.
df1 = df1.rename(columns={'disc_times_pay': 'Payments', 'discharges': 'Discharges'})

# Express payments in millions; vectorised division replaces the per-row
# apply(lambda ...) and yields the same float values.
df1['Payments'] = df1['Payments'] / 1000000
df1.head()





Unnamed: 0,id_num,year,Discharges,drg3,Payments
0,100007,2015,1424,871,15.097248
1,50030,2015,1385,871,18.21552
2,50030,2014,1297,871,16.898613
3,220074,2015,1241,871,15.514982
4,70022,2015,1233,871,20.741526


In [217]:
alt.Chart(df1).mark_point().encode(
    x=alt.X('Discharges', title='Discharges'),
    y= 'Payments' ,
    color=alt.Color('id_num', legend=alt.Legend(title='ID Number')),
).properties(title="Total Payments vs Discharges",
    width=600,
    height=400
).interactive()

In [204]:
    color='id_num',


    
    #Chart.save('mychart.html', embed_options={'renderer':'svg'})
xxx title='Weather type'),

    color=alt.Color('weather', legend=alt.Legend(title='Weather type'), scale=scale),

alt.Chart(df).mark_bar().encode(
    x=alt.X('month(date):N', title='Month of the year'),
    y='count()',
    color=alt.Color('weather', legend=alt.Legend(title='Weather type'), scale=scale),
)

In [148]:
cars.head()

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA


In [147]:
alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
).interactive()

In [136]:
df1 = df_small.loc[  df_small['drg3'].isin([870,871,872])  &  (df_small['id_num'].isin([50030,100007]))   ]

In [43]:
df_small[df_small['year'] == final_year]

Unnamed: 0,id_num,year,discharges,drg3
878371,50625,2016,47,1
878372,50441,2016,29,1
878373,330101,2016,44,1
878374,220116,2016,45,1
878375,260032,2016,59,1
...,...,...,...,...
1075649,340114,2016,11,988
1075650,220071,2016,11,989
1075651,170104,2016,18,989
1075652,180088,2016,12,989


In [86]:
# Year used by other cells to pick the most recent rows.
final_year = 2016

# Every row with drg3 == 871 (all years), ordered by discharge count from
# largest to smallest and renumbered from 0.
# The frame is built in one step: the original first assigned a version
# that was also filtered to `final_year` and then overwrote it immediately,
# so that first assignment had no effect.
df1 = (
    df_small[df_small['drg3'].isin([871])]
    .sort_values(['discharges'], ascending=[False])
    .reset_index(drop=True)
)
df1.head()

Unnamed: 0,id_num,year,discharges,drg3
0,100007,2016,1867,871
1,100007,2015,1424,871
2,50030,2015,1385,871
3,50030,2014,1297,871
4,220074,2015,1241,871


In [143]:
top = list(df1[ df1['year'] == final_year].sort_values('discharges',ascending=False).id_num.values)
top[:10]

[100007, 50625, 50030, 70022, 440049, 370091, 220074, 450184, 330106, 170122]

In [93]:
#df_small[df_small['drg3'].isin([870,871,872])].groupby( ['id_num','year','drg3'])["discharges"].count()

# For the providers in `top`, total the discharges per (year, id_num, drg3)
# combination and list the largest totals first.
(
    df_small[df_small['id_num'].isin(top)]
    .groupby(['year', 'id_num', 'drg3'])["discharges"]
    .sum()
    .sort_values(ascending=False)
)

year  id_num  drg3
2016  100007  871     1867
2014  100007  392     1730
2013  100007  392     1566
2014  100007  470     1557
2012  100007  392     1543
                      ... 
2015  50030   384       11
              372       11
2016  70022   37        11
2015  50030   204       11
2011  50030   57        11
Name: discharges, Length: 5391, dtype: int64

In [105]:
#top = [50030,100007]
df1 = df_small[df_small['id_num'].isin(top) & ( df_small['drg3'].isin([871])    )]
df1 = df1.sort_values(['discharges'], ascending=[False])
df1 = df1.reset_index(drop=True)
df1.head(10)



Unnamed: 0,id_num,year,discharges,drg3
0,100007,2016,1867,871
1,100007,2015,1424,871
2,50030,2015,1385,871
3,50030,2014,1297,871
4,70022,2015,1233,871
5,50625,2016,1150,871
6,50030,2016,1145,871
7,70022,2016,1102,871
8,50625,2015,1068,871
9,370091,2016,1060,871


In [106]:
type(df1.loc[0,'year'])

numpy.int64

In [120]:
df1['year'] = df1['year'].apply (lambda x: str(x))

df1['id_num'] = df1['id_num'].apply (lambda x: str(x))

type(df1.loc[0,'year'])


str

In [121]:
df1.head()

Unnamed: 0,id_num,year,discharges,drg3
0,100007,2016-01-01 00:00:00,1867,871
1,100007,2015-01-01 00:00:00,1424,871
2,50030,2015-01-01 00:00:00,1385,871
3,50030,2014-01-01 00:00:00,1297,871
4,70022,2015-01-01 00:00:00,1233,871


In [122]:
df1['year'] = pd.to_datetime(df1['year'])
df1.head()

Unnamed: 0,id_num,year,discharges,drg3
0,100007,2016-01-01,1867,871
1,100007,2015-01-01,1424,871
2,50030,2015-01-01,1385,871
3,50030,2014-01-01,1297,871
4,70022,2015-01-01,1233,871


In [124]:

alt.Chart(df1).mark_line().encode(
    x='year',
    y='discharges',
    color='id_num'
).interactive()

In [119]:
type(source.loc[0,'price']), type(df1.loc[0,'discharges'])

type(source.loc[0,'symbol']), type(df1.loc[0,'id_num'])

(str, numpy.int64)

In [111]:
import altair as alt
from vega_datasets import data

source = data.stocks()

alt.Chart(source).mark_line().encode(
    x='date',
    y='price',
    color='symbol'
).interactive()

In [69]:
source.head()
type(source.loc[0,'date'])

pandas._libs.tslibs.timestamps.Timestamp

In [73]:
# NOTE(review): the output recorded below shows every row at 1970-01-01 with
# the year value (2014/2015/2016) in the nanosecond digits, i.e. the integer
# years appear to have been read as nanosecond offsets from the epoch rather
# than as calendar years. Converting the column to strings first, or passing
# format='%Y', should give the intended dates — confirm.
df1['year'] = pd.to_datetime(df1['year'])
df1

Unnamed: 0,id_num,year,discharges,drg3
1060100,100007,1970-01-01 00:00:00.000002016,1867,871
30,100007,1970-01-01 00:00:00.000002015,1424,871
33,50030,1970-01-01 00:00:00.000002015,1385,871
44,50030,1970-01-01 00:00:00.000002014,1297,871
49,70022,1970-01-01 00:00:00.000002015,1233,871
1060101,50625,1970-01-01 00:00:00.000002016,1150,871
1060108,50030,1970-01-01 00:00:00.000002016,1145,871


In [71]:
import altair as alt
from vega_datasets import data

source = data.stocks()

alt.Chart(source).mark_line().encode(
    x='date',
    y='price',
    color='symbol'
).interactive()

In [65]:


# NOTE: 'discharge' does not match any column of df1 (the column is named
# 'discharges'), which is what the ValueError recorded below reports.
alt.Chart(df1).mark_line().encode(
    x='year',
    y='discharge',
    color='id_num'
).interactive()

ValueError: discharge encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.

alt.Chart(...)

In [21]:
import altair as alt
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['date'], empty='none')

# The basic line
line = alt.Chart().mark_line(interpolate='basis').encode(
    alt.X('date:T', axis=alt.Axis(title='')),
    alt.Y('price:Q', axis=alt.Axis(title='',format='$f')),
    color='symbol:N'
)

# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart().mark_point().encode(
    x='date:T',
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Draw points on the line, and highlight based on selection
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'price:Q', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart().mark_rule(color='gray').encode(
    x='date:T',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
stockChart = alt.layer(line, selectors, points, rules, text,
                       data='https://raw.githubusercontent.com/altair-viz/vega_datasets/master/vega_datasets/_data/stocks.csv', 
                       width=600, height=300,title='Stock History')
#stockChart.save('stocks.html')


In [23]:
source.head()

Unnamed: 0,symbol,date,price
0,MSFT,2000-01-01,39.81
1,MSFT,2000-02-01,36.35
2,MSFT,2000-03-01,43.22
3,MSFT,2000-04-01,28.37
4,MSFT,2000-05-01,25.45


In [62]:
import altair as alt
from vega_datasets import data

source = data.stocks()

alt.Chart(source).mark_line().encode(
    x='date',
    y='price',
    color='symbol'
).interactive()

In [128]:
import altair as alt

# load a simple dataset as a pandas DataFrame
from vega_datasets import data
cars = data.cars()
cars.head()

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA


In [125]:
alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
).interactive()

In [127]:
# NOTE(review): this cell fails with the KeyError: 'Year' recorded below.
# `source` presumably held the stocks data (symbol/date/price) rather than
# the cars data when it was run, so there was no 'Year' column — confirm and
# reload the cars data into `source` before re-running.
type(source.loc[0,'Year'].year)
source['Year_int'] = source['Year'].apply(lambda x : x.year)
source.rename(columns = {'Miles_per_Gallon':'MPG'}, inplace = True) 

source.head()

from tabulate import _table_formats, tabulate
print(tabulate(source[['Name', 'MPG', 'Cylinders', 'Horsepower',
       'Weight_in_lbs', 'Acceleration', 'Year_int']].head(5),headers='keys'))   
cars.head()

KeyError: 'Year'

In [38]:
import altair as alt
from vega_datasets import data

source = data.barley()

alt.Chart(source).mark_bar().encode(
    x='year:O',
    y='sum(yield):Q',
    color='year:N',
    column='site:N'
).properties(
    width=100,
    height=400
).interactive()




In [39]:
import altair as alt
from vega_datasets import data

source = data.barley()

alt.Chart(source).mark_bar().encode(
    x='sum(yield):Q',
    y=alt.Y('site:N', sort='-x')
)

In [43]:
source.head()

Unnamed: 0,yield,variety,year,site
0,27.0,Manchuria,1931,University Farm
1,48.86667,Manchuria,1931,Waseca
2,27.43334,Manchuria,1931,Morris
3,39.93333,Manchuria,1931,Crookston
4,32.96667,Manchuria,1931,Grand Rapids


In [46]:
source[source['variety'] == 'Trebi']

Unnamed: 0,yield,variety,year,site
24,36.56666,Trebi,1931,University Farm
25,63.8333,Trebi,1931,Waseca
26,43.76667,Trebi,1931,Morris
27,46.93333,Trebi,1931,Crookston
28,29.76667,Trebi,1931,Grand Rapids
29,33.93333,Trebi,1931,Duluth
84,29.06667,Trebi,1932,University Farm
85,49.2333,Trebi,1932,Waseca
86,46.63333,Trebi,1932,Morris
87,41.83333,Trebi,1932,Crookston


In [42]:
import altair as alt
from vega_datasets import data

source = data.barley()

alt.Chart(source).mark_bar().encode(
    x='variety',
    y='sum(yield)',
    color='site'
).properties(
    width=600,
    height=400
) 


In [7]:
# Altair. 

#https://altair-viz.github.io/gallery/index.html  
#https://vegawidget.github.io/altair/articles/example-gallery-08-interactive-charts.html

#https://matthewkudija.com/blog/2018/06/22/altair-interactive/#building-interactive-altair-charts

# https://altair-viz.github.io/user_guide/marks.html 
# for box plots


import altair as alt
import numpy as np
import pandas as pd



# Weather

from vega_datasets import data
df = data.seattle_weather()
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [10]:
chart = alt.Chart(df).mark_tick().encode(
    x='precipitation',
).properties(
    title='Precipitation'
)

chart.configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='black'
)

In [11]:

source = data.cars()

alt.Chart(source).mark_tick().encode(
    x='Horsepower:Q',
    y='Cylinders:O'
)

In [12]:
chart = alt.Chart(df).mark_bar().encode(
    alt.X('precipitation', bin=True),
    y='count()'
).properties(
    title='Precipitation'
)

chart.configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    color='black'
)

In [13]:
alt.Chart(df).mark_line().encode(
    x='month(date):T',
    y='average(precipitation)'
)

In [14]:
alt.Chart(df).mark_line().encode(
    x='yearmonth(date):T',
    y='max(temp_max)',
)

In [9]:
alt.Chart(df).mark_line().encode(
    x='year(date):T',
    y='mean(temp_max)',
)

alt.Chart(df).mark_bar().encode(
    x='mean(temp_max)',
    y='year(date):O'
)

alt.Chart(df).mark_bar().encode(
    x='mean(temp_range):Q',
    y='year(date):O'
).transform_calculate(
    temp_range="datum.temp_max - datum.temp_min"
)


alt.Chart(df).mark_bar().encode(
    x='month(date):N',
    y='count()',
    color='weather',
)

scale = alt.Scale(domain=['sun', 'fog', 'drizzle', 'rain', 'snow'],
                  range=['#e7ba52', '#c7c7c7', '#aec7e8', '#1f77b4', '#9467bd'])


df.head()

alt.Chart(df).mark_bar().encode(
    x=alt.X('month(date):N', title='Month of the year'),
    y='count()',
    color=alt.Color('weather', legend=alt.Legend(title='Weather type'), scale=scale),
)


alt.Chart(df).mark_point().encode(
    alt.X('temp_max', title='Maximum Daily Temperature (C)'),
    alt.Y('temp_range:Q', title='Daily Temperature Range (C)'),
    alt.Color('weather', scale=scale),
    alt.Size('precipitation', scale=alt.Scale(range=[1, 200]))
).transform_calculate(
    "temp_range", "datum.temp_max - datum.temp_min"
).properties(
    width=600,
    height=400
).interactive()


alt.Chart(df).mark_bar().encode(
    x='count()',
    y='weather:N',
    color=alt.Color('weather:N', scale=scale),
)


#And now we can vertically concatenate this histogram to the points plot above, and add a brush selection tool such that the histogram reflects the content of the selection (for more information on selections, see Bindings, Selections, Conditions: Making Charts Interactive):  

#https://altair-viz.github.io/user_guide/interactions.html#user-guide-interactions



brush = alt.selection(type='interval')

points = alt.Chart().mark_point().encode(
    alt.X('temp_max:Q', title='Maximum Daily Temperature (C)'),
    alt.Y('temp_range:Q', title='Daily Temperature Range (C)'),
    color=alt.condition(brush, 'weather:N', alt.value('lightgray'), scale=scale),
    size=alt.Size('precipitation:Q', scale=alt.Scale(range=[1, 200]))
).transform_calculate(
    "temp_range", "datum.temp_max - datum.temp_min"
).properties(
    width=600,
    height=400
).add_selection(
    brush
)

bars = alt.Chart().mark_bar().encode(
    x='count()',
    y='weather:N',
    color=alt.Color('weather:N', scale=scale),
).transform_calculate(
    "temp_range", "datum.temp_max - datum.temp_min"
).transform_filter(
    brush
).properties(
    width=600
)

alt.vconcat(points, bars, data=df)




from vega_datasets import data

source = data.population.url

alt.Chart(source).mark_boxplot().encode(
    y='people:Q'
).properties(
    width=200,
    height=300
)