In [1]:
%matplotlib inline 

import numpy as np 
import scipy as sp 
import matplotlib as mpl # the top-level matplotlib package, aliased for quick access
import matplotlib.cm as cm #allows us easy access to colormaps
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes

# pandas presets: allow wide output (500 characters, up to 100 columns)
# and render dataframes as HTML tables in the notebook
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns #sets up styles and gives us more plotting options

### Load the data 
Load the data from the `iris.csv` dataset. 
First, we read the data into a pandas DataFrame and preview its first rows.

In [5]:
# read the iris CSV into a DataFrame (the path is relative to the notebook's working directory)
df = pd.read_csv('../datasets/iris.csv')
df.head()  # preview the first five rows

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,name
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Messing with selection methods

Selecting per row

In [13]:
df[0:3]  # a [start:stop] slice on a DataFrame selects rows; here the first three

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,name
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


Selecting per column

In [15]:
df['sepal_width'].head()  # a single column label returns a Series

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
Name: sepal_width, dtype: float64

Multiple columns

In [17]:
df[['sepal_width', 'name']].head()  # a list of column labels returns a DataFrame

Unnamed: 0,sepal_width,name
0,3.5,setosa
1,3.0,setosa
2,3.2,setosa
3,3.1,setosa
4,3.6,setosa


Selecting rows by their label (in **this** case it does the same thing as selecting by integer position, because the row labels coincide with the row positions)

In [21]:
# .loc selects by index label and .iloc by integer position; both return the
# first row here. print() is called as a function (the Python 2 print
# statement is a SyntaxError on Python 3).
print(df.loc[0], '\n\n', df.iloc[0])

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
name            setosa
Name: 0, dtype: object 

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
name            setosa
Name: 0, dtype: object


Selecting portions. Works pretty much like `numpy` indexing 

In [23]:
# .loc[rows, columns]: ':' keeps every row, the list picks the columns in the given order
df.loc[:, ['name', 'sepal_width']].head()

Unnamed: 0,name,sepal_width
0,setosa,3.5
1,setosa,3.0
2,setosa,3.2
3,setosa,3.1
4,setosa,3.6


In [38]:
# or like this: a boolean mask picks the rows (those whose name is 'setosa')
# and the list picks the columns, all in a single .loc call
filter_df = df.loc[df.name == 'setosa', ['sepal_length', 'name']]
filter_df.head()

Unnamed: 0,sepal_length,name
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa


### Assignment

In [44]:
df.at[1, 'sepal_width'] = 2  # .at addresses a single cell by (row label, column label)
df.at[1, 'sepal_width']  # scalar setting works; the value reads back as 2.0

2.0

In [50]:
# a scalar is assigned to every selected row; .loc label slices include both endpoints
df.loc[1:2, 'name'] = 'aaaa'
df.loc[1:2, 'name'] 

1    aaaa
2    aaaa
Name: name, dtype: object

In [52]:
# Size mismatch: the slice 1:3 selects three rows (.loc includes both end
# labels) but only two values are supplied, so pandas raises a ValueError.
# The error is the point of this cell, so it is caught and displayed rather
# than left uncaught, which would stop a "Run All" at this point.
try:
    df.loc[1:3, 'name'] = ['aaaa', 'ss']
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: Must have equal len keys and value when setting with an iterable

In [61]:
# a length mismatch is, however, handled by Series objects:
# the values are aligned on the index labels (1 -> 'ba', 2 -> 'ab'),
# and selected rows with no matching label are set to NaN
df.loc[1:3, 'name'] = pd.Series(data=['ab', 'ba'], index=[2, 1])
df.loc[1:3, 'name']

1     ba
2     ab
3    NaN
Name: name, dtype: object

In [72]:
# chained [] indexing also works for reading, but each step has to work out
# what kind of selection it is being asked for, which adds a bit of overhead
# NOTE(review): a single .loc call is presumably the preferred form here - confirm
df[3:5][['sepal_width', 'name']]['name']

3       NaN
4    setosa
Name: name, dtype: object

## Grouping

In [74]:
df = pd.read_csv('../datasets/iris.csv')  # reload the data to discard the assignments made above

In [93]:
# Group the rows by species name, then compute the per-group mean of the two
# sepal columns. The string alias 'mean' is passed instead of np.mean: recent
# pandas versions emit a FutureWarning when a NumPy reduction is passed to
# .agg, and the alias dispatches directly to the built-in groupby mean.
grouped = df.groupby('name')
grouped[['sepal_width', 'sepal_length']].agg('mean')

Unnamed: 0_level_0,sepal_width,sepal_length
name,Unnamed: 1_level_1,Unnamed: 2_level_1
setosa,3.418,5.006
versicolor,2.77,5.936
virginica,2.974,6.588
