## Visualization in Python with matplotlib and numpy

### labels, title, and texts

### Scatterplots

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Scatter N random points: each coordinate is a uniform sample from
# np.random.rand scaled by 0.9.
N = 30
x = np.random.rand(N) * 0.9
print(x)
y = np.random.rand(N) * 0.9
print(y)

# One marker per (x[i], y[i]) pair.
plt.scatter(x, y)
plt.show()

### Histograms

In [None]:
# Histogram of a normal sample (mean 2, standard deviation 0.5) in 50 bins.
# The sample must be much larger than the number of bins: the previous size
# of 10 left at least 40 of the 50 bins empty, so no distribution shape was
# visible.
# NOTE(review): 1000 is a guess at the intended sample size -- adjust freely.
x = np.random.normal(2, 0.5, 1000)
plt.hist(x, bins=50)
plt.show()

In [None]:
# x counts down from 20 to 0 while y holds the squares 0, 1, 4, ..., 400 in
# ascending order, so the largest x is paired with the smallest y.
x = list(range(20, -1, -1))
y = [value ** 2 for value in range(21)]
print(x)
print(y)

# Pin the tick positions: every 10 units on the x axis and every 100 units
# on the y axis, each starting at the smallest data value on that axis.
x_ticks = np.arange(min(x), max(x) + 10, 10.0)
y_ticks = np.arange(min(y), max(y) + 10, 100.0)
plt.xticks(x_ticks)
plt.yticks(y_ticks)

plt.scatter(x, y)
plt.show()

In [None]:
# more graphs: a 3D wireframe and a 3D surface of matplotlib's test data
from mpl_toolkits.mplot3d import axes3d

# Sample surface that ships with matplotlib's 3D toolkit.
x, y, z = axes3d.get_test_data(0.1)

# wire graph
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_wireframe(x, y, z)

# surface graph, drawn in a second figure (fig and ax are rebound to it)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(x, y, z, cmap='viridis')

# Step the azimuth of the surface view through a full turn (-90 .. 269
# degrees) at a fixed elevation of 90 degrees, redrawing after each step.
for azimuth in range(-90, 270):
    ax.view_init(elev=90, azim=azimuth)
    plt.draw()

plt.show()

In [None]:
# Two small series to plot: the integers 1..8 and their squares.
linear_data = np.arange(1, 9)
exponential_data = np.square(linear_data)

plt.figure()

# 1 row x 2 columns; make the first (left) subplot current and plot the
# linear series on it.
plt.subplot(1, 2, 1)
plt.plot(linear_data, '-o')

# Make the second (right) subplot current and plot the squared series.
plt.subplot(1, 2, 2)
plt.plot(exponential_data, '-o')

# Select the first subplot again and add the squared series to it with
# 'x' markers.
plt.subplot(1, 2, 1)
plt.plot(exponential_data, '-x')

# create a 3x3 grid of subplots that share their x axis; the axes are
# unpacked row by row into ax1 .. ax9
fig, axes_grid = plt.subplots(3, 3, sharex=True)
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9 = axes_grid.flat

# Only the centre and the bottom-right panels receive data.
ax5.plot(linear_data, '-')
ax9.plot(exponential_data, '-o')

# repeat with number of bins set to 100

## Pandas

In [None]:
# Load the Olympics CSV into a pandas DataFrame using read_csv's default
# settings, then preview the first rows and the column labels.
olympics_csv = './data/olympics.csv'
df = pd.read_csv(olympics_csv)
print(df.head())
df.columns

In [None]:
# Read the same file again, this time skipping its first line and using the
# first column as the row index.
df = pd.read_csv(
    './data/olympics.csv',
    skiprows=1,
    index_col=0,
)
print(df.head())
df.columns

In [None]:
# renaming columns
def _medal_column_name(col):
    """Return the readable replacement for one raw column label.

    Labels starting with '01', '02' or '03' become 'Gold', 'Silver' or
    'Bronze' followed by everything after the label's first four characters.
    Labels starting with a number marker get '#' in place of the marker.
    Any other label is returned unchanged.
    """
    medal_by_code = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
    code = col[:2]
    if code in medal_by_code:
        return medal_by_code[code] + col[4:]
    # The previous test `col[:1] == 'No'` compared a one-character slice with
    # a two-character string, so it could never match and the number columns
    # were silently left unrenamed.
    # NOTE(review): presumably the header spells the marker either as the
    # numero sign (U+2116) or as the letters 'No' -- confirm against
    # olympics.csv and drop whichever branch is not needed.
    if col[:1] == '\u2116':
        return '#' + col[1:]
    if code == 'No':
        return '#' + col[2:]
    return col


df.rename(columns=_medal_column_name, inplace=True)

print(df.head())


In [None]:
# accessing the data with constraints
# where() keeps the frame's shape: rows whose 'Gold' value is not greater
# than 0 stay in place but are filled with NaN.
has_gold = df['Gold'] > 0
only_gold = df.where(has_gold)
print(only_gold.head())

In [None]:

# Discard every row that contains at least one missing value.
only_gold = only_gold.dropna(axis=0, how='any')
print(only_gold.head())

In [None]:
# read data from a csv file into a DataFrame and preview the first rows
log_csv = './data/log.csv'
df = pd.read_csv(log_csv)
df.head()

In [None]:
# Use the 'time' column as the index and order the rows by it.
df = df.set_index('time').sort_index()
df.head()

In [None]:
# reset index: turn the current index back into an ordinary column, then
# build a two-level index from the 'time' and 'user' columns.
df = df.reset_index().set_index(['time', 'user'])
df.head()

In [None]:
# Missing Values Handling
# Forward fill: each missing entry takes the last valid value above it in
# the same column.  DataFrame.ffill() replaces fillna(method='ffill'),
# whose `method` argument is deprecated in current pandas releases.
df = df.ffill()
df.head()

## Pandas data visualization

In [None]:
# Draw 10000 values from each of three distributions: standard normal,
# uniform (np.random.random) and gamma with shape parameter 2.
sample_size = 10000
normal_sample = np.random.normal(0.0, 1.0, sample_size)
random_sample = np.random.random(sample_size)
gamma_sample = np.random.gamma(shape=2, size=sample_size)

# Collect the samples as the columns 'normal', 'random' and 'gamma' of one
# DataFrame and print its summary statistics.
df = pd.DataFrame(dict(normal=normal_sample,
                       random=random_sample,
                       gamma=gamma_sample))

print(df.describe())

In [None]:
# Box plot of the 'normal' column with whiskers spanning the full data range.
# whis=(0, 100) puts the whiskers at the 0th and 100th percentiles; it
# replaces the string form whis='range', which matplotlib deprecated and
# then removed.
plt.figure()
plt.boxplot(df['normal'], whis=(0, 100))

plt.show()

In [None]:
# Histogram of the 'normal' column split into 100 bins, in a fresh figure.
plt.figure()
normal_values = df['normal']
plt.hist(normal_values, bins=100)
plt.show()

In [None]:
# Side-by-side box plots of the three columns, whiskers spanning the full
# data range.  whis=(0, 100) (0th to 100th percentile) replaces the string
# form whis='range', which matplotlib deprecated and then removed.
plt.figure()
plt.boxplot([df['normal'], df['random'], df['gamma']], whis=(0, 100))
plt.show()

In [None]:
# read data from a csv file into DataFrame
iris = pd.read_csv('./data/iris.csv')

# draw one scatter series per distinct 'Name' value: PetalLength on the
# x axis, PetalWidth on the y axis, each series labelled with its name
plt.figure()
for name, group in iris.groupby('Name'):
    plt.scatter(
        x=group['PetalLength'],
        y=group['PetalWidth'],
        alpha=0.8,
        label=name,
    )

plt.xlabel('PetalLength')
plt.ylabel('PetalWidth')
plt.legend()

plt.show()

In [None]:
# read data from CSV file to dataframe and preview it
iris = pd.read_csv('./data/iris.csv')
print(iris.head())

# scatter matrix plot of the frame
pd.plotting.scatter_matrix(iris)

# parallel-coordinates plot in a new figure, with 'Name' as the class column
plt.figure()
pd.plotting.parallel_coordinates(iris, class_column='Name')


In [None]:
import seaborn as sns

# Fixed seed so the two samples below are the same on every run.
np.random.seed(1234)

# v1: 1000 normal draws (mean 0, sd 10); v2: 2*v1 plus normal noise
# (mean 60, sd 15), so the two series are correlated.
v1 = pd.Series(np.random.normal(0, 10, 1000), name='v1')
v2 = pd.Series(2 * v1 + np.random.normal(60, 15, 1000), name='v2')

# Overlaid, semi-transparent histograms on a shared set of bin edges.
bin_edges = np.arange(-50, 150, 5)
plt.figure()
plt.hist(v1, alpha=0.7, bins=bin_edges, label='v1')
plt.hist(v2, alpha=0.7, bins=bin_edges, label='v2')
plt.legend()

# plot a kernel density estimation over a stacked barchart
# `density=True` replaces the `normed=True` argument that matplotlib removed.
plt.figure()
plt.hist([v1, v2], histtype='barstacked', density=True)
v3 = np.concatenate((v1, v2))
sns.kdeplot(v3)

# Current seaborn accepts x and y for jointplot by keyword only.
sns.jointplot(x=v1, y=v2, alpha=0.4)


# set the seaborn style for all the following plots
sns.set_style('white')
sns.jointplot(x=v1, y=v2, kind='kde', space=0)


# read data from CSV file to dataframe
iris = pd.read_csv('./data/iris.csv')

# pair-wise kernel density estimation plot
# `height` is the current name of pairplot's former `size` argument.  The
# trailing semicolon keeps the notebook from echoing the returned object.
sns.pairplot(iris, hue='Name', diag_kind='kde', height=2);