# Python Data Visualization landscape.

by  Jingru Ma


Before going into data visualization using pandas or other libraries, let's go over the basics of matplotlib.

[matplotlib](https://matplotlib.org/) is a Python library for plotting. It was originally developed by [John D. Hunter](https://en.wikipedia.org/wiki/John_D._Hunter).

Other important libraries are [seaborn](https://seaborn.pydata.org/) and [bokeh](https://bokeh.pydata.org/en/latest/).


Look at this [presentation](https://www.youtube.com/watch?v=FytuB8nFHPQ) by Jake VanderPlas, "The Python Visualization Landscape" (PyCon 2017), to see the whole landscape.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.random import randn
%matplotlib inline

In [None]:
#A Brief matplotlib API Primer

In [None]:
# Simplest possible figure: one line through the integers 0..9.
data = np.arange(0, 10)
# Create the figure and its single axes explicitly, then draw on that axes.
fig, ax = plt.subplots()
ax.plot(data)


In [None]:
# Create a new figure and three subplots placed on a 2x2 grid
# (the fourth grid slot is left empty).
fig = plt.figure()
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
# Black dashed line on the third subplot. Calling the method on ax3 itself,
# rather than plt.plot(), avoids depending on which axes is "current".
ax3.plot(randn(50).cumsum(), 'k--')
# Histogram on the first subplot; `_ =` discards hist()'s return value.
_ = ax1.hist(randn(100), bins=20, color='k', alpha=0.3)
# Scatter plot on the second subplot: a noisy straight line.
ax2.scatter(np.arange(30), np.arange(30) + 3 * randn(30))


In [None]:
# Adjusting the spacing around subplots
# 2x2 grid of histograms whose panels share both the x- and the y-axis.
fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
# axes.flat walks the grid row by row, the same order as nested i/j loops.
for panel in axes.flat:
    panel.hist(np.random.randn(500), bins=50, color='k', alpha=0.5)
# Zero horizontal and vertical padding between the panels.
fig.subplots_adjust(wspace=0, hspace=0)

In [None]:
# Colors, markers, and line styles
# The same look can be requested in two ways. First, a compact format string:
# 'k' = black, 'o' = circle markers, '--' = dashed line.
walk_a = np.cumsum(randn(30))
plt.plot(walk_a, 'ko--')
# Second, the equivalent explicit keyword arguments.
walk_b = np.cumsum(randn(30))
plt.plot(walk_b, marker='o', linestyle='--', color='k')

In [None]:
# Change the draw style
# One random walk drawn twice: once with the default draw style and once
# with drawstyle='steps-post'.
data = np.cumsum(np.random.randn(30))
plt.plot(data, linestyle='--', color='k', label='Default')
plt.plot(data, linestyle='-', color='k', drawstyle='steps-post', label='steps-post')
# loc='best' lets matplotlib choose where to place the legend.
plt.legend(loc='best')

In [None]:
# Ticks, labels, and legends
fig, ax = plt.subplots()
ax.plot(np.cumsum(np.random.randn(1000)))
# Customise the x-axis: tick positions, the text shown at each tick,
# the plot title and the axis label.
ticks = ax.set_xticks(np.arange(0, 1001, 250))
labels = ax.set_xticklabels(
    ['one', 'two', 'three', 'four', 'five'],
    fontsize='small',
    rotation=30,
)
ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')

In [None]:
# Adding legends
fig, ax = plt.subplots()
# Three random walks, each with its own format string and legend label.
for fmt, name in (('k', 'one'), ('k--', 'two'), ('k.', 'three')):
    ax.plot(randn(1000).cumsum(), fmt, label=name)
ax.legend(loc='best')
# Annotation: place a text label at x=200, y=20.
ax.text(x=200, y=20, s='Hello World!', family='monospace', fontsize=10)

In [None]:
# Drawing on a subplot
fig, ax = plt.subplots()

# Build the three shapes with named arguments so each number is self-explanatory.
pgon = plt.Polygon(xy=[[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)
circ = plt.Circle(xy=(0.7, 0.2), radius=0.15, color='b', alpha=0.3)
rect = plt.Rectangle(xy=(0.2, 0.75), width=0.4, height=0.15, color='k', alpha=0.3)

# A shape only appears once it has been attached to an axes.
ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)

In [None]:
#Saving Plots to File
# savefig() requires a destination path; calling it with no arguments raises
# TypeError. Draw something first so the saved file is not an empty canvas.
fig, ax = plt.subplots()
ax.plot(np.arange(10))
# The output format is inferred from the file extension. dpi sets the
# resolution and bbox_inches='tight' trims whitespace around the figure.
fig.savefig('figpath.png', dpi=400, bbox_inches='tight')
#Configuration
# rc() needs the name of a settings group plus the options to change.
# The new values apply to figures created from here on.
plt.rc('figure', figsize=(10, 6))


In [None]:
#Plotting with pandas and seaborn

In [None]:
# Line plots
# A Series is drawn as a single line with its index on the x-axis.
s = pd.Series(np.cumsum(np.random.randn(10)), index=np.arange(0, 100, 10))
s.plot()
# A DataFrame is drawn with one line per column.
df = pd.DataFrame(
    np.random.randn(10, 4).cumsum(axis=0),
    index=np.arange(0, 100, 10),
    columns=['A', 'B', 'C', 'D'],
)
df.plot()

In [None]:
# Bar plots
# Two stacked panels: vertical bars on top, horizontal bars below.
fig, axes = plt.subplots(nrows=2, ncols=1)
data = pd.Series(np.random.rand(16), index=list('abcdefghijklmnop'))
data.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)
data.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)

# For a DataFrame each row becomes a group of bars, one bar per column.
# The name of the columns index ('Genus') is used as the legend title.
df = pd.DataFrame(
    np.random.rand(6, 4),
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'),
)
df.plot(kind='bar')
# stacked=True piles each row's values into a single bar.
df.plot(kind='barh', stacked=True, alpha=0.5)



In [None]:
# Example: share of each party size per day in the tipping data set
tips = pd.read_csv('tips.csv')
# Count rows for every (day, party size) pair, then keep only the size
# columns labelled 2 through 5 (label-based slicing includes both ends).
party_counts = pd.crosstab(index=tips['day'], columns=tips['size']).loc[:, 2:5]
# Divide every row by its own total to turn counts into per-day fractions.
row_totals = party_counts.sum(axis=1)
party_pcts = party_counts.div(row_totals, axis=0)
party_pcts.plot(kind='bar')
# Party sizes increase in this chart



In [None]:
#seaborn to aggregate or summarize before making a plot
import seaborn as sns
# Select the plot style BEFORE drawing: set() only affects figures created
# after the call, so placing it after barplot() leaves that plot unstyled.
sns.set(style="whitegrid")
# Tip as a fraction of the bill excluding the tip itself.
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])
# Horizontal bars of tip_pct per day, split by the 'time' column.
sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')

In [None]:
# Histograms and density plots
# Histogram of the tip percentages using 50 bins.
tips['tip_pct'].plot(kind='hist', bins=50)


In [None]:
# Smoothed density estimate of the same tip percentages.
tips['tip_pct'].plot(kind='density')

In [None]:
# Plot both a histogram and a continuous density estimate
# Bimodal sample: 200 draws with mean 0 / std 1, then 200 draws with mean 10 / std 2.
comp1 = np.random.normal(loc=0, scale=1, size=200)
comp2 = np.random.normal(loc=10, scale=2, size=200)
values = pd.Series(np.concatenate((comp1, comp2)))
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True) once the minimum seaborn version allows it.
sns.distplot(values, bins=100, color='k')

In [None]:
# Scatter or point plots
macro = pd.read_csv('macrodata.csv')
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
# Take logs, difference consecutive rows, and drop rows with missing values
# (diff() leaves the first row empty).
trans_data = data.pipe(np.log).diff().dropna()
# Show the last five rows.
trans_data.tail(5)

In [None]:
# Scatter plot of the two transformed series with a fitted regression line.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so regplot('m1', 'unemp', ...) fails there. The keyword form
# works on old and new versions alike.
sns.regplot(x='m1', y='unemp', data=trans_data)
plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))

In [None]:
# Scatter-plot matrix of every pair of columns, with a KDE on the diagonal;
# plot_kws is forwarded to the off-diagonal plots, making their points translucent.
sns.pairplot(data=trans_data, diag_kind='kde', plot_kws=dict(alpha=0.2))

In [None]:
#Facet Grids and Categorical Data
# seaborn 0.9 renamed factorplot() to catplot(), and newer releases dropped the
# old name. Use whichever one this installation provides. `kind` is given
# explicitly in both calls, so any difference in the two functions' defaults
# does not matter here.
facet_plot = getattr(sns, 'catplot', None) or sns.factorplot
# Bar charts of tip_pct by day, one panel per (time, smoker) combination.
# Only rows with tip_pct < 1 are used.
facet_plot(x='day', y='tip_pct', row='time', col='smoker', kind='bar',
           data=tips[tips.tip_pct < 1])
# Box plots of tip_pct for each day, using only rows with tip_pct < 0.5.
facet_plot(x='tip_pct', y='day', kind='box', data=tips[tips.tip_pct < 0.5])