# PyData tour



- `pandas` and `numpy` are the primary libraries used for data analysis in Python
- `pd` and `np` are the conventional import aliases for these libraries; it is strongly recommended that you use the same aliases


In [None]:
import pandas as pd
import numpy as np


- `matplotlib` and `seaborn` are the most popular static data viz libraries in Python
- Again, it is strongly recommended that you import them under the conventional `plt` and `sns` aliases!


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

## Load an example data set

In [None]:
# Alternative source (kept for reference, not executed): an iris CSV hosted as a
# gist that can be read directly with pandas — presumably the same data.
# url="https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/d546eaee765268bf2f487608c537c05e22e4b221/iris.csv"

# df = pd.read_csv(url)
# df.head()

# Load the iris example data set through seaborn's dataset loader.
df = sns.load_dataset("iris")

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Selecting values

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Indexing with a list of names (double brackets) returns a DataFrame, here with
# a single column; .iloc[:2] then keeps the first two rows by position.
df[['sepal_length']].iloc[:2]

In [None]:
# Select two columns by label (all rows), then keep the first two rows by position.
df.loc[:, ['sepal_length', 'sepal_width']].iloc[:2]

## Selecting values continued

In [None]:
# Boolean mask: keep only the rows whose sepal length exceeds 5, then preview them.
df.loc[df["sepal_length"] > 5].head()

In [None]:
# Boolean mask on the species label: keep only the setosa rows, then preview them.
df.loc[df["species"] == "setosa"].head()

## NumPy Arrays

In [None]:
# Pull two columns out of the DataFrame as plain NumPy arrays.
sepal_len = df["sepal_length"].to_numpy()
petal_len = df["petal_length"].to_numpy()

In [None]:
# Shape of the extracted array.
sepal_len.shape

In [None]:
# Mean and median of the sepal lengths, displayed together as a tuple.
sepal_len.mean(), np.median(sepal_len)

In [None]:
# Peek at the first five values.
sepal_len[:5]

## Broadcasting

In [None]:
# Pairwise absolute differences via broadcasting: indexing with None inserts a
# length-1 axis, so a column view minus a row view expands to a square matrix.
diffs = np.abs(sepal_len[:, None] - sepal_len[None, :])
diffs.shape

In [None]:
# Draw the pairwise-difference matrix as a heatmap; the trailing semicolon
# suppresses the text repr of the returned object.
sns.heatmap(diffs);

In [None]:
%%timeit
# Time the broadcasting version of the pairwise absolute differences.
diffs = np.abs(sepal_len[:,np.newaxis] - sepal_len[np.newaxis,:])

In [None]:
%%timeit
diffs = np.zeros((150,150))

for i in range(150):
    for j in range(150):
        diffs[i,j] = np.abs(sepal_len[i] - sepal_len[j])

## Matplotlib : MATLAB-style API

In [None]:
# Sample points from 0 up to (but not including) 50 in steps of 0.1.
x = np.arange(0,50, .1)
# Trailing semicolon hides the list-of-Line2D repr that plt.plot returns,
# matching the other plotting cells in this notebook.
plt.plot(x, np.sin(x));

## Matplotlib : MATLAB-style API continued

In [None]:
# Two stacked panels built with the pyplot state machine: each plt.subplot call
# selects the panel that the following plt.plot call draws into.
plt.figure()  # create a plot figure


# create the first of two panels and set current axis
plt.subplot(2, 1, 1) # (rows, columns, panel number)
plt.plot(x, np.sin(x))

# create the second panel and set current axis
plt.subplot(2, 1, 2)
# NOTE(review): this panel is plotted against -x while the first one uses x —
# confirm the mirrored x-axis is intentional.
plt.plot(-x, np.cos(x));

## Matplotlib: Object Oriented API

In [None]:
# Build the figure and its grid of panels up front: two rows, one column.
# `ax` is an array holding one Axes object per panel.
fig, ax = plt.subplots(nrows=2, figsize=(8, 4))

# Draw on each panel by calling plot() on its own Axes object.
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x));

## Seaborn overview


In [None]:

# Load the tips example data set and add the tip as a percentage of the bill.
tips = sns.load_dataset('tips').assign(
    tip_pct=lambda bills: 100 * bills['tip'] / bills['total_bill']
)


In [None]:

# Apply the 'ticks' axes style only while this figure is being built.
with sns.axes_style('ticks'):
    # One panel per (sex, time) combination, coloured by smoker status.
    grid = sns.FacetGrid(
        tips,
        row="sex",
        col="time",
        hue="smoker",
        margin_titles=True,
    )
    # Histogram of tip percentage in every panel, with 15 bin edges spanning 0 to 40.
    grid.map(plt.hist, "tip_pct", bins=np.linspace(0, 40, 15));

In [None]:

# Box plot of the total bill for each day, split by sex.
# x, y and hue are passed by keyword: recent seaborn releases no longer accept
# them positionally (the first positional slot is `data`), so the positional
# form fails there.
g = sns.catplot(data=tips, x="day", y="total_bill", hue="sex", kind="box")


## Feature engineering example

In [None]:
from sklearn.datasets import make_circles

# Seed NumPy's global random number generator so the sampled points are the
# same on every run.
np.random.seed(0)

# Generate 400 noisy points; X holds the coordinates and y the labels.
X, y = make_circles(
    n_samples=400,
    factor=0.1,
    noise=0.05,
)

In [None]:
# Wrap the two coordinates in a DataFrame and derive a third feature:
# the sum of their squares (f1^2 + f2^2).
df_X = pd.DataFrame(X, columns=["f1", "f2"]).assign(
    f3=lambda pts: pts["f1"] ** 2 + pts["f2"] ** 2
)

In [None]:
# Scatter the two raw features, coloured by the engineered feature f3.
sns.relplot(data=df_X, x="f1", y="f2", hue="f3");