# Data Exploration with Pandas

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('../data/titanic-train.csv')

In [None]:
type(df)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

### Indexing

In [None]:
df.iloc[3]

In [None]:
df.loc[0:4,'Ticket']

In [None]:
df['Ticket'].head()

In [None]:
df[['Embarked', 'Ticket']].head()

### Selections

In [None]:
df[df['Age'] > 70]

In [None]:
df['Age'] > 70

In [None]:
df.query("Age > 70")

In [None]:
df[(df['Age'] == 11) & (df['SibSp'] == 5)]

In [None]:
df[(df.Age == 11) | (df.SibSp == 5)]

In [None]:
df.query('(Age == 11) | (SibSp == 5)')

### Unique Values

In [None]:
df['Embarked'].unique()

### Sorting

In [None]:
df.sort_values('Age', ascending = False).head()

### Aggregations

In [None]:
df['Survived'].value_counts()

In [None]:
df['Pclass'].value_counts()

In [None]:
df.groupby(['Pclass', 'Survived'])['PassengerId'].count()

In [None]:
df['Age'].min()

In [None]:
df['Age'].max()

In [None]:
df['Age'].mean()

In [None]:
df['Age'].median()

In [None]:
mean_age_by_survived = df.groupby('Survived')['Age'].mean()
mean_age_by_survived

In [None]:
std_age_by_survived = df.groupby('Survived')['Age'].std()
std_age_by_survived

### Merge

In [None]:
df1 = mean_age_by_survived.round(0).reset_index()
df2 = std_age_by_survived.round(0).reset_index()

In [None]:
df1

In [None]:
df2

In [None]:
df3 = pd.merge(df1, df2, on='Survived')

In [None]:
df3

In [None]:
df3.columns = ['Survived', 'Average Age', 'Age Standard Deviation']

In [None]:
df3

### Pivot Tables

In [None]:
# Cross-tabulate the passengers: one row per 'Pclass' value, one column per
# 'Survived' value, each cell holding the count of 'PassengerId' entries.
df.pivot_table(index='Pclass',
               columns='Survived',
               values='PassengerId',
               aggfunc='count')

### Correlations

In [None]:
df['IsFemale'] = df['Sex'] == 'female'

In [None]:
# Correlation of every numeric/boolean column with 'Survived', sorted
# ascending. Text columns (e.g. 'Sex', 'Ticket') are filtered out explicitly:
# older pandas skipped them silently inside corr(), but recent versions raise
# an error when they are present.
correlated_with_survived = (
    df.select_dtypes(include=['number', 'bool'])
      .corr()['Survived']
      .sort_values()
)
correlated_with_survived

In [None]:
%matplotlib inline

In [None]:
# Bar chart of the correlations, leaving out 'Survived' itself (its
# self-correlation carries no information). Dropping by label instead of by
# position keeps this correct even when NaN correlations are sorted to the end.
correlated_with_survived.drop('Survived').plot(kind='bar',
                                                title='Titanic Passengers: correlation with survival')

# Visual Data Exploration with Matplotlib

In [None]:
# Four synthetic series of 1000 samples each, used by the plotting examples.
# The random draws happen in the order data1, data2, data3, data4.

# Gaussian noise centred on 0 with a small spread.
data1 = np.random.normal(loc=0, scale=0.1, size=1000)

# Linear ramp from 0 to 1 plus wider Gaussian noise centred on 1.
data2 = np.linspace(0, 1, 1000) + np.random.normal(loc=1, scale=0.4, size=1000)

# Uniform noise whose amplitude grows from 1 to 5, shifted up by 2.
data3 = np.linspace(1, 5, 1000) * np.random.random(1000) + 2

# Sine wave of amplitude 0.3 plus Gaussian noise centred on 3.
data4 = np.sin(np.linspace(0, 20, 1000)) * 0.3 + np.random.normal(loc=3, scale=0.2, size=1000)

In [None]:
data = np.vstack([data1, data2, data3, data4]).transpose()

In [None]:
df = pd.DataFrame(data, columns=['data1', 'data2', 'data3', 'data4'])
df.head()

### Line Plot

In [None]:
df.plot(title='Line plot')

In [None]:
plt.plot(df)
plt.title('Line plot')
plt.legend(['data1', 'data2', 'data3', 'data4'])

### Scatter Plot

In [None]:
df.plot(style='.')

In [None]:
_ = df.plot(kind='scatter', x='data1', y='data2',
            xlim=(-1.5, 1.5), ylim=(0, 3))

### Histograms

In [None]:
df.plot(kind='hist',
        bins=50,
        title='Histogram',
        alpha=0.6)

### Cumulative distribution

In [None]:
# Normalized cumulative histogram of every column (an empirical CDF).
# `density=True` is the replacement for the old `normed=True` keyword, which
# matplotlib no longer accepts.
df.plot(kind='hist',
        bins=100,
        title='Cumulative distributions',
        density=True,
        cumulative=True,
        alpha=0.4)

### Box Plot

In [None]:
df.plot(kind='box',
        title='Boxplot')

### Subplots

In [None]:
# 2x2 grid of axes showing the same DataFrame four different ways.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(5, 5))

# Top row: line plot, then the same data drawn with markers only.
df.plot(title='Line plot', ax=ax[0, 0])
df.plot(title='Scatter plot', style='o', ax=ax[0, 1])

# Bottom row: histogram, then box plot.
df.plot(kind='hist', bins=50, title='Histogram', ax=ax[1, 0])
df.plot(kind='box', title='Boxplot', ax=ax[1, 1])

# Adjust spacing so the subplots do not overlap.
plt.tight_layout()

### Pie charts

In [None]:
# Boolean mask: which samples of 'data1' exceed 0.1?
gt01 = df['data1'] > 0.1
# value_counts() orders its result by frequency; sorting by index pins the
# order to [False, True] so it always lines up with the fixed
# ['<= 0.1', '> 0.1'] labels used when the counts are drawn as a pie chart.
piecounts = gt01.value_counts().sort_index()
piecounts

In [None]:
# Pie chart of the two counts in `piecounts`. The second wedge is offset via
# `explode`, and `autopct` prints each wedge's share as a percentage.
# NOTE(review): the hard-coded labels assume `piecounts` is ordered
# [False, True] — confirm against how `piecounts` is built.
piecounts.plot(kind='pie',
               figsize=(5, 5),
               explode=[0, 0.15],
               labels=['<= 0.1', '> 0.1'],
               autopct='%1.1f%%',
               shadow=True,
               startangle=90,
               fontsize=16)

### Hexbin plot

In [None]:
# Two 2-D Gaussian clouds stacked row-wise: 1000 points around (0, 0) with
# spread 2, followed by 2000 points around (9, 9) with spread 3.
data = np.concatenate(
    (
        np.random.normal(loc=(0, 0), scale=2, size=(1000, 2)),
        np.random.normal(loc=(9, 9), scale=3, size=(2000, 2)),
    ),
    axis=0,
)
# Wrap the points in a DataFrame with one column per coordinate.
df = pd.DataFrame(data=data, columns=['x', 'y'])

In [None]:
df.head()

In [None]:
df.plot()

In [None]:
df.plot(kind='kde')

In [None]:
# Hexagonal-bin plot of 'y' against 'x', colored with the 'rainbow' colormap.
# NOTE(review): for hexbin, `bins` appears to be forwarded to matplotlib, where
# it controls how counts are discretized for coloring rather than the number of
# hexagons (that would be `gridsize`) — confirm this is the intended argument.
df.plot(kind='hexbin', x='x', y='y', bins=100, cmap='rainbow')

# Unstructured data

### Images

In [None]:
from PIL import Image

In [None]:
img = Image.open('../data/iss.jpg')
img

In [None]:
type(img)

In [None]:
imgarray = np.asarray(img)

In [None]:
type(imgarray)

In [None]:
imgarray.shape

In [None]:
imgarray.ravel().shape

In [None]:
435 * 640 * 3

### Sound

In [None]:
from scipy.io import wavfile

In [None]:
rate, snd = wavfile.read(filename='../data/sms.wav')

In [None]:
from IPython.display import Audio

In [None]:
Audio(data=snd, rate=rate)

In [None]:
len(snd)

In [None]:
snd

In [None]:
plt.plot(snd)

In [None]:
# Spectrogram of the clip. The sampling frequency comes from `rate`, the value
# returned by wavfile.read, instead of a hard-coded 44100, so the frequency and
# time axes are scaled correctly whatever the file's actual sample rate is.
_ = plt.specgram(snd, NFFT=1024, Fs=rate)
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')

# Data Exploration Exercises

## Exercise 1
- load the dataset: `../data/international-airline-passengers.csv`
- inspect it using the `.info()` and `.head()` commands
- use the function [`pd.to_datetime()`](http://pandas.pydata.org/pandas-docs/version/0.20/generated/pandas.to_datetime.html) to change the column type of 'Month' to a datetime type
- set the index of df to be a datetime index using the column 'Month' and the `df.set_index()` method
- choose the appropriate plot and display the data
- choose appropriate scale
- label the axes

## Exercise 2
- load the dataset: `../data/weight-height.csv`
- inspect it
- plot it using a scatter plot with Weight as a function of Height
- plot the male and female populations with 2 different colors on a new scatter plot
- remember to label the axes

## Exercise 3
- plot the histogram of the heights for males and for females on the same plot
- use alpha to control transparency in the plot command
- plot a vertical line at the mean of each population using `plt.axvline()`

## Exercise 4
- plot the weights of the males and females using a box plot
- which one is easier to read?
- (remember to put in titles, axes and legends)

## Exercise 5
- load the dataset: `../data/titanic-train.csv`
- learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html
- display the data using a scattermatrix