# Interactive visualisation in Python 

## Getting the notebook ready

### Step 1: Install the required packages

In [1]:
pip install scipy




In [2]:
pip install ipywidgets 


Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install chart-studio 

Collecting chart-studio
  Obtaining dependency information for chart-studio from https://files.pythonhosted.org/packages/ca/ce/330794a6b6ca4b9182c38fc69dd2a9cbff60fd49421cb8648ee5fee352dc/chart_studio-1.1.0-py3-none-any.whl.metadata
  Downloading chart_studio-1.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting retrying>=1.3.3 (from chart-studio)
  Obtaining dependency information for retrying>=1.3.3 from https://files.pythonhosted.org/packages/8f/04/9e36f28be4c0532c0e9207ff9dc01fb13a2b0eb036476a213b0000837d0e/retrying-1.3.4-py3-none-any.whl.metadata
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
   ---------------------------------------- 0.0/64.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/64.4 kB ? eta -:--:--
   ------ --------------------------------- 10.2/64.4 kB ? eta -:--:--
   ------------ --------------------------- 20.5/64.4 kB 217.9 kB/s eta 0:00:01
   ------------------- ----------

In [4]:
pip install pyarrow 

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install cufflinks

Collecting cufflinks
  Downloading cufflinks-0.17.3.tar.gz (81 kB)
     ---------------------------------------- 0.0/81.7 kB ? eta -:--:--
     ----- ---------------------------------- 10.2/81.7 kB ? eta -:--:--
     --------- ---------------------------- 20.5/81.7 kB 162.5 kB/s eta 0:00:01
     --------- ---------------------------- 20.5/81.7 kB 162.5 kB/s eta 0:00:01
     -------------- ----------------------- 30.7/81.7 kB 163.8 kB/s eta 0:00:01
     -------------- ----------------------- 30.7/81.7 kB 163.8 kB/s eta 0:00:01
     ------------------- ------------------ 41.0/81.7 kB 130.7 kB/s eta 0:00:01
     ------------------- ------------------ 41.0/81.7 kB 130.7 kB/s eta 0:00:01
     ---------------------------- --------- 61.4/81.7 kB 172.4 kB/s eta 0:00:01
     -------------------------------------- 81.7/81.7 kB 198.6 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting colorlover>=0.2.1 (from cufflinks)


### Step 2: Enable interactive visualisations in Jupyter

In [6]:
# Enable the ipywidgets notebook extension (widgetsnbextension) so that the
# interactive widgets used below are rendered in the notebook.
# NOTE(review): `jupyter nbextension` is a classic-Notebook command; confirm
# it is still required (or available) for the front end you are using.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


### Step 3: Import the required packages

In [7]:
# Standard data science helpers
import numpy as np
import pandas as pd
import scipy

# Plotly charting libraries: Chart Studio client, graph objects and Express.
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
# plotly.offline renders the interactive figures directly in the notebook
# instead of uploading them to Chart Studio, which makes the notebook
# easier to distribute to others.
# NOTE: with connected=True the plotly.js library is loaded from an online
# CDN, so an internet connection is still needed to render the charts.
# Pass connected=False to embed plotly.js in the notebook for fully
# offline use (at the cost of a larger notebook file).
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# The Cufflinks library allows us to directly bind 
# Pandas DataFrames to Plotly charts (it provides DataFrame.iplot). 
import cufflinks as cf
# Once again, we use the Cufflinks library in offline mode; the same
# connected=True caveat as above applies.
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)

# Rich-display helpers, imported from the public IPython.display module.
# `display` is imported explicitly because the widget callbacks below call it.
from IPython.display import HTML, display
# Cap how many rows/columns pandas prints so wide or long frames stay readable.
pd.options.display.max_rows = 30
pd.options.display.max_columns = 25

# Show the output of every expression in a code cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### Step 4: Read in the data

In [8]:
# Parquet snapshot of the Medium article statistics, fetched straight from
# GitHub (`?raw=true` makes GitHub serve the file itself, not its HTML page).
DATA_URL = 'https://github.com/WillKoehrsen/Data-Analysis/blob/master/medium/data/medium_data_2019_01_26?raw=true'

df = pd.read_parquet(DATA_URL)
# Preview the first rows to check that the data loaded as expected.
df.head()

Unnamed: 0,claps,days_since_publication,fans,link,num_responses,publication,published_date,read_ratio,read_time,reads,started_date,tags,text,title,title_word_count,type,views,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python
129,2,597.301123,2,https://medium.com/p/screw-the-environment-but...,0,,2017-06-10 14:25:00,42.17,7,70,2017-06-10 14:24:00,"[Climate Change, Economics]","Screw the Environment, but Consider Your Walle...","Screw the Environment, but Consider Your Wallet",8,published,166,1859,0.001076,0,0,0,0,0,0
125,18,589.983168,3,https://medium.com/p/the-vanquishing-of-war-pl...,0,,2017-06-17 22:02:00,30.34,14,54,2017-06-17 22:02:00,"[Climate Change, Humanity, Optimism, History]","The Vanquishing of War, Plague and Famine Part...","The Vanquishing of War, Plague and Famine",8,published,178,3891,0.004626,0,0,0,0,0,0
132,51,577.363292,20,https://medium.com/p/capstone-project-mercedes...,0,,2017-06-30 12:55:00,20.02,42,222,2017-06-30 12:00:00,"[Machine Learning, Python, Udacity, Kaggle]",Capstone Project: Mercedes-Benz Greener Manufa...,Capstone Project: Mercedes-Benz Greener Manufa...,7,published,1109,12025,0.004241,0,0,0,0,1,1
126,0,576.520688,0,https://medium.com/p/home-of-the-scared-5af0fe...,0,,2017-07-01 09:08:00,35.85,9,19,2017-06-30 18:21:00,"[Politics, Books, News, Media Criticism]",Home of the Scared A review of A Culture of Fe...,Home of the Scared,4,published,53,2533,0.0,0,0,0,0,0,0
121,0,572.533035,0,https://medium.com/p/the-triumph-of-peace-f485...,0,,2017-07-05 08:51:00,8.47,14,5,2017-07-03 20:18:00,"[Books, Psychology, History, Humanism]",The Triumph of Peace A review of The Better An...,The Triumph of Peace,4,published,59,3892,0.0,1,0,0,0,0,0


In [9]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# numeric and datetime columns of the dataset.
df.describe()

Unnamed: 0,claps,days_since_publication,fans,num_responses,published_date,read_ratio,read_time,reads,started_date,title_word_count,views,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python
count,133.0,133.0,133.0,133.0,133,133.0,133.0,133.0,133,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,1815.263158,248.407273,352.052632,7.045113,2018-05-25 11:51:41.954887168,29.074662,12.917293,6336.300752,2018-05-04 16:41:25.714285568,7.12782,23404.030075,3029.120301,0.957638,20.330827,0.729323,0.609023,0.43609,0.383459,0.315789
min,0.0,1.218629,0.0,0.0,2017-06-10 14:25:00,8.11,1.0,1.0,2017-06-10 14:24:00,2.0,3.0,163.0,0.0,-13.0,0.0,0.0,0.0,0.0,0.0
25%,121.0,74.543822,23.0,0.0,2018-01-17 19:42:00,20.02,8.0,363.0,2018-01-10 08:20:00,5.0,1375.0,1653.0,0.052115,0.0,0.0,0.0,0.0,0.0,0.0
50%,815.0,245.41613,136.0,4.0,2018-05-28 11:39:00,27.06,10.0,2049.0,2018-04-28 13:02:00,7.0,7608.0,2456.0,0.421525,1.0,1.0,1.0,0.0,0.0,0.0
75%,2700.0,376.080598,528.0,12.0,2018-11-15 08:35:00,34.91,14.0,7815.0,2018-10-14 08:24:00,8.0,30141.0,3553.0,1.099366,5.0,1.0,1.0,1.0,1.0,1.0
max,13600.0,597.301123,2588.0,59.0,2019-01-27 16:23:00,74.37,54.0,41978.0,2019-01-27 06:26:00,16.0,173714.0,15063.0,17.891817,349.0,1.0,1.0,1.0,1.0,1.0
std,2449.074661,179.370879,479.060117,9.056108,,12.41767,9.510795,9007.284726,,3.158475,33995.636496,2393.414456,1.846756,74.111579,0.445989,0.489814,0.497774,0.488067,0.466587


## Interactive visualisations

### Example 1

In [11]:
from ipywidgets import interact, interact_manual, widgets

In [13]:
@interact
def show_articles_more_than(column='claps', x=5000):
    """Display the articles whose value in `column` is greater than `x`.

    `interact` renders `column` as a free-text box and `x` as an integer
    slider, and calls this function again every time either widget changes.

    Parameters
    ----------
    column : str
        Name of a numeric column of `df` to filter on.
    x : int
        Threshold; only rows where `df[column] > x` are shown.
    """
    from html import escape  # local import keeps this cell self-contained

    # The column name is typed by hand, so partially typed or misspelt names
    # reach this function; report them instead of raising a KeyError.
    if column not in df.columns:
        display(HTML(f'<h4>No column named "{escape(column)}" in the data</h4>'))
        return
    # Comparing a text or date column with a number raises a TypeError.
    if not pd.api.types.is_numeric_dtype(df[column]):
        display(HTML(f'<h4>Column "{escape(column)}" is not numeric</h4>'))
        return

    # Escape the name: columns such as "<tag>Python" would otherwise be
    # interpreted as markup. The heading is now properly closed with </h4>.
    display(HTML(f'<h4>Showing articles with more than {x} {escape(column)}</h4>'))
    display(df.loc[df[column] > x, ['title', 'claps', 'published_date', 'read_time', 'tags', 'views', 'reads']])

interactive(children=(Text(value='claps', description='column'), IntSlider(value=5000, description='x', max=15…

### Example 2

In [14]:
@interact
def show_titles_more_than(x=(10, 50000, 10),
                          column=['read_time', 'views', 'reads']):
    """Display the articles whose value in `column` is greater than `x`.

    `interact` turns the `(min, max, step)` tuple into an integer slider and
    the list of column names into a drop-down, then calls this function with
    the selected values whenever a widget changes.

    Parameters
    ----------
    x : int
        Threshold chosen on the slider (10 to 50000 in steps of 10).
    column : str
        One of 'read_time', 'views' or 'reads'.
    """
    # The heading must be closed with </h4>; the original opened a second
    # <h4> instead, leaving the element unterminated.
    display(HTML(f'<h4>Showing articles with more than {x} {column}</h4>'))
    display(df.loc[df[column] > x, ['title', 'published_date', 'read_time', 'tags', 'views', 'reads']])

interactive(children=(IntSlider(value=25000, description='x', max=50000, min=10, step=10), Dropdown(descriptio…

### Example 3

In [15]:
@interact
def correlations(column1=list(df.select_dtypes('number').columns), 
                 column2=list(df.select_dtypes('number').columns)):
    """Print the correlation between two numeric columns of `df`.

    Both arguments are chosen from drop-downs listing the numeric columns.
    """
    first_series = df[column1]
    second_series = df[column2]
    coefficient = first_series.corr(second_series)
    print(f"Correlation: {coefficient}")

interactive(children=(Dropdown(description='column1', options=('claps', 'days_since_publication', 'fans', 'num…

### Example 4

In [16]:
@interact
def scatter_plot(x=list(df.select_dtypes('number').columns), 
                 y=list(df.select_dtypes('number').columns)[1:]):
    """Draw an interactive scatter plot of numeric column `y` against `x`.

    Both columns are picked from drop-downs; choosing the same column for
    both axes prints a hint instead of plotting.
    """
    # Guard clause: plotting a column against itself is not informative.
    if x == y:
        print("Please select separate variables for X and Y")
        return

    # Title-cased column names are used for the axis labels and chart title.
    x_label = x.title()
    y_label = y.title()
    chart_title = f'{y_label} vs {x_label}'
    df.iplot(kind='scatter', x=x, y=y, mode='markers',
             xTitle=x_label, yTitle=y_label, title=chart_title)
    ## If you are using Google Colab, comment out the df.iplot call above and uncomment the lines below
    #fig = px.scatter(df, x=x, y=y, title=f'{y.title()} vs {x.title()}')
    #fig.show(renderer="colab")

interactive(children=(Dropdown(description='x', options=('claps', 'days_since_publication', 'fans', 'num_respo…

### Example 5

In [17]:
# Names of the colour scales offered in the heatmap's drop-down below.
# Kept as a list: `interact` turns a list into a drop-down widget.
cscales = (
    'Greys YlGnBu Greens YlOrRd Bluered RdBu '
    'Reds Blues Picnic Rainbow Portland Jet '
    'Hot Blackbody Earth Electric Viridis Cividis'
).split()

# We use the Figure Factory module of Plotly, which
# defines many unique and powerful plots to be used
# in Python. 
# For more info, see: https://plot.ly/python/figure-factory-subplots/
import plotly.figure_factory as ff

# Pairwise correlations between the numeric columns of the dataset. Computed
# once here so the widget callback below only has to redraw the heatmap.
corrs = df.corr(numeric_only=True)

@interact
def plot_corrs(colorscale=cscales):
    """Draw the annotated correlation heatmap using the selected colour scale.

    `colorscale` is picked from a drop-down listing the names in `cscales`.
    """
    # The rounded coefficients serve both as cell values and as cell labels.
    rounded = corrs.round(2).values
    heatmap = ff.create_annotated_heatmap(
        z=rounded,
        x=corrs.columns.tolist(),
        y=corrs.index.tolist(),
        colorscale=colorscale,
        annotation_text=rounded,
    )
    iplot(heatmap)
    ## If you are using Google Colab, comment out the iplot call above and uncomment the line below
    #heatmap.show(renderer="colab")

interactive(children=(Dropdown(description='colorscale', options=('Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Blue…

### Example 6

In [18]:
@interact_manual
def scatter_plot(x=list(df.select_dtypes('number').columns), 
                 y=list(df.select_dtypes('number').columns)[1:],
                 theme=list(cf.themes.THEMES.keys()), 
                 colorscale=list(cf.colors._scales_names.keys())):
    """Draw a themed scatter plot of numeric column `y` against `x`.

    Because this uses `interact_manual`, the plot is only redrawn when the
    button under the widgets is pressed. Each point carries the article
    title as its text. Choosing the same column for both axes prints a hint
    instead of plotting.
    """
    # Guard clause: plotting a column against itself is not informative.
    if x == y:
        print("Please select separate variables for X and Y")
        return

    # Title-cased column names are used for the axis labels and chart title.
    x_label, y_label = x.title(), y.title()
    plot_options = dict(
        kind='scatter',
        x=x,
        y=y,
        mode='markers',
        xTitle=x_label,
        yTitle=y_label,
        text='title',
        title=f'{y_label} vs {x_label}',
        theme=theme,
        colorscale=colorscale,
    )
    df.iplot(**plot_options)
    ## If you are using Google Colab, comment out the df.iplot call above and uncomment the lines below
    #fig = px.scatter(df, x=x, y=y, title=f'{y.title()} vs {x.title()}')
    #fig.show(renderer="colab")

interactive(children=(Dropdown(description='x', options=('claps', 'days_since_publication', 'fans', 'num_respo…

### Example 7

In [19]:
# Index the articles by publication date for the date-range examples below.
# drop=False keeps 'published_date' available as a regular column as well, so
# (a) this cell can be re-run without a KeyError and (b) the widgets from the
# earlier examples, which still select that column, keep working.
df.set_index('published_date', drop=False, inplace=True)

In [20]:
def print_articles_published(start_date, end_date):
    """Print how many articles were published in a date range, with totals.

    Parameters
    ----------
    start_date, end_date : date-like or None
        Bounds of the range (both inclusive), as delivered by the date
        pickers. A plain date is compared as midnight at the start of that
        day. None (a cleared date picker) prints a hint and returns early.

    The function relies on `df` being indexed by publication date.
    """
    # A cleared DatePicker passes None; pd.Timestamp(None) is NaT, whose
    # .date() raises, so ask for both dates instead of failing.
    if start_date is None or end_date is None:
        print('Please select both a start date and an end date.')
        return

    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)

    # Boolean mask over the publication-date index.
    in_range = (df.index >= start_date) & (df.index <= end_date)
    stat_df = df.loc[in_range]

    total_words = stat_df['word_count'].sum()
    total_read_time = stat_df['read_time'].sum()
    num_articles = len(stat_df)
    print(f'According to our dataset, published by Medium.com, there are {num_articles} articles between {start_date.date()} and {end_date.date()}.')
    # read_time is divided by 60 to report hours.
    print(f'These articles totalled {total_words:,} words and {total_read_time/60:.2f} hours to read.')
    
# Date pickers for the two bounds of the publication range.
start_picker = widgets.DatePicker(value=pd.to_datetime('2018-01-01'))
end_picker = widgets.DatePicker(value=pd.to_datetime('2019-01-01'))

# Assign the result to `_` so the returned function is not echoed as output.
_ = interact(print_articles_published, start_date=start_picker, end_date=end_picker)

interactive(children=(DatePicker(value=Timestamp('2018-01-01 00:00:00'), description='start_date', step=1), Da…

### Example 8

In [21]:
def plot_up_to(column, date):
    """Plot the running total of `column` for articles published up to `date`.

    Parameters
    ----------
    column : str
        Name of the numeric column of `df` to accumulate.
    date : date-like or None
        Last publication date to include (inclusive). A plain date is
        compared as midnight at the start of that day. None (a cleared date
        picker) prints a hint and returns early.

    The function relies on `df` being indexed by publication date.
    """
    # A cleared DatePicker passes None; pd.Timestamp(None) is NaT, whose
    # .date() raises, so ask for a date instead of failing.
    if date is None:
        print('Please select a date.')
        return

    date = pd.Timestamp(date)
    plot_df = df.loc[df.index <= date]
    # Cumulative sum in the frame's existing row order.
    plot_df[column].cumsum().iplot(mode='markers+lines',
                                   xTitle='published date',
                                   yTitle=column,
                                   title=f'Cumulative {column.title()} Until {date.date()}')
    
# Widgets: a drop-down of the numeric columns and a date picker for the cut-off.
numeric_columns = df.select_dtypes('number').columns.tolist()
column_dropdown = widgets.Dropdown(options=numeric_columns)
date_picker = widgets.DatePicker(value=pd.to_datetime('2019-01-01'))

# Assign the result to `_` so the returned function is not echoed as output.
_ = interact(plot_up_to, column=column_dropdown, date=date_picker)

interactive(children=(Dropdown(description='column', options=('claps', 'days_since_publication', 'fans', 'num_…