# The Next Level of Data Visualization in Python

Sources:
  - https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e
  - https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb

## `plotly`

`plotly` is an open-source Python visualization library built on top of `plotly.js`.

In [65]:
import pandas as pd
import numpy as np

In [66]:
# std plotly imports
# https://stackoverflow.com/questions/62094165/how-do-i-fix-a-deprecated-module-for-plotly-plotly
import chart_studio.plotly as ply 
import plotly.graph_objs as go 
from plotly.offline import iplot, init_notebook_mode

# using plotly + cufflinks in offline mode
import cufflinks

# Switch cufflinks to offline mode.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding it in the notebook — confirm for the installed version.
cufflinks.go_offline(connected=True)
# Initialise plotly's notebook mode with the same `connected` setting.
init_notebook_mode(connected=True)

In [67]:
# Parquet snapshot of the Medium article statistics, read directly from GitHub.
MEDIUM_STATS_URL = 'https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/data/medium_data_2019_01_06?raw=true'

df = pd.read_parquet(MEDIUM_STATS_URL)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,claps,days_since_publication,fans,link,num_responses,publication,published_date,read_ratio,read_time,reads,...,type,views,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python
119,2,574.858594,2,https://medium.com/p/screw-the-environment-but...,0,,2017-06-10 14:25:00,41.98,7,68,...,published,162,1859,0.001076,0,0,0,0,0,0
118,18,567.540639,3,https://medium.com/p/the-vanquishing-of-war-pl...,0,,2017-06-17 22:02:00,32.93,14,54,...,published,164,3891,0.004626,0,0,0,0,0,0
121,50,554.920762,19,https://medium.com/p/capstone-project-mercedes...,0,,2017-06-30 12:55:00,20.19,42,215,...,published,1065,12025,0.004158,0,0,0,0,1,1
122,0,554.07816,0,https://medium.com/p/home-of-the-scared-5af0fe...,0,,2017-07-01 09:08:00,35.85,9,19,...,published,53,2533,0.0,0,0,0,0,0,0
114,0,550.090507,0,https://medium.com/p/the-triumph-of-peace-f485...,0,,2017-07-05 08:51:00,8.77,14,5,...,published,57,3892,0.0,1,0,0,0,0,0


In [68]:
# List every column label available in the dataset.
df.columns

Index(['claps', 'days_since_publication', 'fans', 'link', 'num_responses',
       'publication', 'published_date', 'read_ratio', 'read_time', 'reads',
       'started_date', 'tags', 'text', 'title', 'title_word_count', 'type',
       'views', 'word_count', 'claps_per_word', 'editing_days',
       '<tag>Education', '<tag>Data Science', '<tag>Towards Data Science',
       '<tag>Machine Learning', '<tag>Python'],
      dtype='object')

## Single Variable Distributions: Histogram and Boxplots

### Histograms

In [69]:
# Histogram of claps per article, split into 30 bins.
claps = df['claps']
claps.iplot(
    kind='hist',
    title='Claps Distribution',
    xTitle='claps',
    yTitle='count',
    bins=30,
    linecolor='black',
)

#### Percentage Histogram

In [70]:
# Histogram of reads, normalised (histnorm='percent') so the y-axis is a percentage.
reads_hist_options = dict(
    kind='hist',
    histnorm='percent',
    bins=30,
    linecolor='black',
    title='Reads Distribution in Percent',
    xTitle='reads',
    yTitle='percentage (%)',
)
df['reads'].iplot(**reads_hist_options)

#### Grouped and Overlaid Histogram

When we want to display two different distributions on the same plot,
we can group the data together to show them side by side.
Set `barmode` to `group` (bars next to each other) or `overlay` (bars drawn on top of each other) when plotting two distributions.

In [71]:
def to_time(dt):
    """Convert a timestamp to its time of day expressed in fractional hours.

    Only the hour and minute components are used; seconds and smaller
    units are ignored.

    Args:
        dt: Object exposing ``hour`` and ``minute`` attributes.

    Returns:
        Hours since midnight, e.g. 14:30 -> 14.5.
    """
    whole_hours = dt.hour
    fraction_of_hour = dt.minute / 60
    return whole_hours + fraction_of_hour

In [72]:
# Derive a fractional-hour "time of day" column from each datetime column.
time_of_day_columns = (
    ('started_date', 'time_started'),
    ('published_date', 'time_published'),
)
for source_column, target_column in time_of_day_columns:
    df[target_column] = df[source_column].apply(to_time)

# Show the result for the start times.
df['time_started']

119    14.400000
118    22.033333
121    12.000000
122    18.350000
114    20.300000
         ...    
17     11.683333
18     16.933333
0      20.150000
1       9.950000
2      21.600000
Name: time_started, Length: 126, dtype: float64

In [73]:
# Side-by-side (barmode="group") percentage histograms of start and publish time,
# using 24 bins.
hour_columns = ["time_started", "time_published"]
df[hour_columns].iplot(
    kind="hist",
    barmode="group",
    histnorm="percent",
    bins=24,
    bargap=0.1,
    opacity=0.8,
    linecolor="black",
    title="Time Started and Time Published",
    xTitle="Time of Day",
    yTitle="(%) of Articles",
)

In [74]:
# Same two distributions, drawn on top of each other (barmode="overlay");
# the opacity below 1 keeps both visible where they overlap.
overlay_columns = ["time_published", "time_started"]
df[overlay_columns].iplot(
    kind="hist",
    barmode="overlay",
    histnorm="percent",
    bins=24,
    opacity=0.8,
    linecolor="black",
    title="Time Started and Time Published Overlaid",
    xTitle="Time of day",
    yTitle="(%) of articles",
)

### Bar Plot

For a bar plot, we first need to apply some sort of aggregation function to the data and then plot the aggregated result.

In [75]:
# Number of articles in each publication.
# `count()` tallies the non-null `fans` entries per group, i.e. it counts rows
# (articles) — it does not add up the fans — so the chart is labelled accordingly.
# Selecting the column before counting avoids counting every other column too.
articles_per_publication = df.groupby('publication')['fans'].count()

articles_per_publication.iplot(
    kind='bar',
    yTitle='Number of Articles',
    linecolor='black',
    title='Articles by Publication'
)

In [76]:
# Column labels that carry the "<tag>" marker.
[column for column in df.columns if "<tag>" in column]

['<tag>Education',
 '<tag>Data Science',
 '<tag>Towards Data Science',
 '<tag>Machine Learning',
 '<tag>Python']

In [77]:
# Sum each "<tag>" column to get a per-tag total, then plot the totals as sorted bars.
tag_columns = [column for column in df.columns if "<tag>" in column]
tag_totals = df[tag_columns].sum()

tag_totals.iplot(
    kind='bar',
    sortbars=True,
    linecolor='black',
    title='Frequency of Tags',
    xTitle='Tag',
    yTitle='Number of Articles with Tag',
)

#### Bar Plot with 2 categories

In [78]:
# Monthly mean of views and reads, keyed by publication date.
views_and_reads = df[['views', 'reads', 'published_date']].set_index('published_date')

# 'M' buckets rows by calendar month (labelled with the month-end date);
# months without any article come out as NaN.
df2 = views_and_reads.resample('M').mean()

df2.head()

Unnamed: 0_level_0,views,reads
published_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-30,463.666667,112.333333
2017-07-31,5521.333333,1207.166667
2017-08-31,6242.8,993.7
2017-09-30,2113.0,279.0
2017-10-31,,


In [79]:
# One bar per column (views, reads) for every month.
monthly_bar_options = dict(
    kind='bar',
    title='Monthly Average Views and Reads',
    xTitle='Date',
    yTitle='Average',
)
df2.iplot(**monthly_bar_options)

#### Bar Plot with Second Y-Axis

In [80]:
# Monthly mean of views and read_time, keyed by publication date.
views_and_read_time = df[['views', 'read_time', 'published_date']].set_index('published_date')

# Resample to month frequency and average within each month.
df2 = views_and_read_time.resample('M').mean()

df2.head()

Unnamed: 0_level_0,views,read_time
published_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-30,463.666667,21.0
2017-07-31,5521.333333,17.166667
2017-08-31,6242.8,24.0
2017-09-30,2113.0,24.0
2017-10-31,,


In [81]:
# Views on the primary y-axis; read_time is assigned to a secondary y-axis.
df2.iplot(
    kind='bar',
    title='Monthly Averages',
    xTitle='Date',
    yTitle='Average Views',
    secondary_y='read_time',
    secondary_y_title='Average Read Time',
)

### Boxplot

In [82]:
# Box plots of claps (primary axis) and fans (secondary axis).
claps_and_fans = df[['claps', 'fans']]
claps_and_fans.iplot(
    kind='box',
    title='Box Plot of Claps and Fans',
    yTitle='Claps',
    secondary_y='fans',
    secondary_y_title='Fans',
)

#### Boxplot with Different Categories

If we want a separate box for each category, we first need to `pivot` the data.
Pivoting transforms each unique value in a column into its own separate column.

In [83]:
# Preview the frame again; it now also carries the time_started / time_published columns.
df.head()

Unnamed: 0,claps,days_since_publication,fans,link,num_responses,publication,published_date,read_ratio,read_time,reads,...,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python,time_started,time_published
119,2,574.858594,2,https://medium.com/p/screw-the-environment-but...,0,,2017-06-10 14:25:00,41.98,7,68,...,1859,0.001076,0,0,0,0,0,0,14.4,14.416667
118,18,567.540639,3,https://medium.com/p/the-vanquishing-of-war-pl...,0,,2017-06-17 22:02:00,32.93,14,54,...,3891,0.004626,0,0,0,0,0,0,22.033333,22.033333
121,50,554.920762,19,https://medium.com/p/capstone-project-mercedes...,0,,2017-06-30 12:55:00,20.19,42,215,...,12025,0.004158,0,0,0,0,1,1,12.0,12.916667
122,0,554.07816,0,https://medium.com/p/home-of-the-scared-5af0fe...,0,,2017-07-01 09:08:00,35.85,9,19,...,2533,0.0,0,0,0,0,0,0,18.35,9.133333
114,0,550.090507,0,https://medium.com/p/the-triumph-of-peace-f485...,0,,2017-07-05 08:51:00,8.77,14,5,...,3892,0.0,1,0,0,0,0,0,20.3,8.85


In [84]:
# (number of rows, number of columns) of the frame.
df.shape

(126, 27)

In [86]:
# Pivot: every distinct `publication` value becomes its own column and the cells
# are filled from `fans`. A row only has a value in the column of its own
# publication; every other column is NaN for that row.
df2 = df.pivot(values='fans', columns='publication')
df2

publication,Engineering @ Feature Labs,None,Noteworthy - The Journal Blog,The Reality Project,Towards Data Science
0,,,,34.0,
1,,,,29.0,
2,,,,13.0,
3,,34.0,,,
4,,47.0,,,
...,...,...,...,...,...
121,,19.0,,,
122,,0.0,,,
123,,,,,43.0
124,,,,,861.0


In [89]:
# One box per publication column. A custom layout is passed to set the figure
# height, the axis/figure titles and the bottom margin.
box_layout = {
    'title': 'Fans by Publication',
    'yaxis': {'title': 'fans'},
    'height': 600,
    'margin': {'b': 140},
}
df2.iplot(kind='box', layout=box_layout)

In [90]:
# Keep articles with a read time of at most 10, then pivot so each distinct
# read_time value becomes a column of `reads` — one box per read time.
short_articles = df[df['read_time'] <= 10]
reads_by_read_time = short_articles.pivot(columns='read_time', values='reads')

reads_by_read_time.iplot(
    kind='box',
    colorscale='set2',
    title='Box Plot of Reads by Reading Time',
    xTitle='Read Time',
    yTitle='Number of Reads',
)

### Scatter Plots

Scatter plots visualize the relationship between two variables.

#### Time-Series

Set the datetime column as the index, then call `iplot` on the DataFrame: its columns are plotted as the y values and the datetime index is used as the x-axis.

In [93]:
# Articles published in "Towards Data Science", indexed by publication date.
in_tds = df['publication'] == 'Towards Data Science'
tds = df[in_tds].set_index('published_date')

tds.head()

Unnamed: 0_level_0,claps,days_since_publication,fans,link,num_responses,publication,read_ratio,read_time,reads,started_date,...,word_count,claps_per_word,editing_days,<tag>Education,<tag>Data Science,<tag>Towards Data Science,<tag>Machine Learning,<tag>Python,time_started,time_published
published_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-27 11:20:00,4800,374.986885,861,https://towardsdatascience.com/random-forest-i...,27,Towards Data Science,17.68,21,28566,2017-12-26 15:11:00,...,4494,1.068091,0,0,1,0,1,1,15.183333,11.333333
2018-01-06 20:15:00,857,364.615092,112,https://towardsdatascience.com/improving-rando...,6,Towards Data Science,22.76,17,7207,2018-01-03 21:38:00,...,3504,0.244578,2,0,1,0,1,1,21.633333,20.25
2018-01-07 20:37:00,186,363.599979,45,https://towardsdatascience.com/data-science-a-...,1,Towards Data Science,28.64,15,775,2018-01-07 13:18:00,...,3569,0.052115,0,0,1,0,0,0,13.3,20.616667
2018-01-08 16:58:00,119,362.752029,43,https://towardsdatascience.com/a-theory-of-pre...,2,Towards Data Science,31.53,11,740,2018-01-02 17:23:00,...,2817,0.042244,5,0,1,0,0,0,17.383333,16.966667
2018-01-09 21:49:00,2000,361.550093,392,https://towardsdatascience.com/hyperparameter-...,12,Towards Data Science,23.99,12,25505,2018-01-09 12:26:00,...,2456,0.814332,0,0,1,0,1,1,12.433333,21.816667


In [95]:
# Read time of each Towards Data Science article over its publication date.
read_time_by_date = tds['read_time']
read_time_by_date.iplot(
    mode='lines+markers',
    symbol=1,
    size=8,
    opacity=0.8,
    title='Read time trends',
    xTitle='Date',
    yTitle='Read Time (min)',
)

#### Two Variables Time-Series

In [96]:
# Claps and fans plotted as two series against the publication-date index.
claps_and_fans_by_date = tds[["claps", "fans"]]
claps_and_fans_by_date.iplot(
    mode="lines+markers",
    symbol=1,
    size=8,
    opacity=0.8,
    title="Fans and Claps over Time",
    xTitle="Date",
    yTitle="Fans and Claps",
)

In [97]:
# Fans (primary y-axis) and word count (secondary y-axis) over publication date.
# The `title` column is selected only so it can be passed as `text`
# (presumably shown as per-point hover text — confirm against cufflinks docs).
tds[["fans", "word_count", "title"]].iplot(
    y="fans",
    mode="lines+markers",
    secondary_y="word_count",
    secondary_y_title="Word Count",
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Date",
    # The primary series is `fans`, so the axis is labelled "Fans" (was "Claps").
    yTitle="Fans",
    text="title",
    title="Fans and Word Count over Time",
)

### Text Annotations on Plot