# Pandas

## Summary
- DataFrame
- Selection

    | Functions | Description | Example |
    | :-------- | :---------- | :------ |
    | loc   | Access a group of rows and columns by label(s) or a boolean array.    | `df.loc['index1', 'col1']` |
    |       | `[]` returns a Series             |       |
    |       | `[[]]` returns a DataFrame        |       |
    |       | Can assign a value to the selection | `df.loc['index1', 'col1'] = 10`    |
    | iloc  | Purely integer-location based indexing for selection by position.     | `df.iloc[[0, 2], [1, 3]]`|
    |       |                                   | `df.iloc[:, [True, False, True, False]]` |

# Setup

In [34]:
import pandas as pd
import numpy as np

# Create DataFrame

In [3]:
# Column-oriented input, indexed as data[col][row]: each key is a column
# name and each list holds that column's values from the first row down.
data = dict(
    col1=['c1r1', 'c1r2'],
    col2=['c2r1', 'c2r2'],
    col3=['c3r1', 'c3r2'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,col1,col2,col3
0,c1r1,c2r1,c3r1
1,c1r2,c2r2,c3r2


In [4]:
# Note the difference: here the input is indexed as data[row][col]. Each
# inner list is one row, so the column names are supplied separately.
cols = ['col1', 'col2', 'col3']
data = [
    ['c1r1', 'c2r1', 'c3r1'],
    ['c1r2', 'c2r2', 'c3r2'],
]

df = pd.DataFrame(data=data, columns=cols)
df

Unnamed: 0,col1,col2,col3
0,c1r1,c2r1,c3r1
1,c1r2,c2r2,c3r2


# Selection

In [8]:
from sklearn import datasets

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are the dataset's own feature names.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Uncomment to attach the labels to df as a 'target' column:
# df['target'] = iris.target.tolist()
df

# The labels are kept in a separate single-column DataFrame instead.
df_target = pd.DataFrame(data=iris.target, columns=['target'])

In [None]:
# Select by condition: dataframe[conditions]
# Each comparison yields a boolean Series; `&` combines them element-wise
# and the combined mask picks the rows to keep.
long_sepal = df['sepal length (cm)'] > 6
wide_sepal = df['sepal width (cm)'] > 3.5
df[long_sepal & wide_sepal]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
109,7.2,3.6,6.1,2.5,2
117,7.7,3.8,6.7,2.2,2
131,7.9,3.8,6.4,2.0,2


In [14]:
# The conditions may also come from a different DataFrame: here one mask is
# built from df and the other from df_target.
longer_than_7 = df['sepal length (cm)'] > 7
target_is_2 = df_target['target'] == 2
df[longer_than_7 & target_is_2]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
102,7.1,3.0,5.9,2.1
105,7.6,3.0,6.6,2.1
107,7.3,2.9,6.3,1.8
109,7.2,3.6,6.1,2.5
117,7.7,3.8,6.7,2.2
118,7.7,2.6,6.9,2.3
122,7.7,2.8,6.7,2.0
125,7.2,3.2,6.0,1.8
129,7.2,3.0,5.8,1.6
130,7.4,2.8,6.1,1.9


In [22]:
# Select by integer position with iloc: rows at positions 0 and 4,
# columns at positions 1 and 3
df.iloc[[0, 4], [1, 3]]

Unnamed: 0,sepal width (cm),petal width (cm)
0,3.5,0.2
4,3.6,0.2


In [32]:
# Select by label with loc: index labels 135 through 140 (the end label is
# included, as the output shows) and the column named 'sepal width (cm)'
df.loc[135:140, ['sepal width (cm)']]

Unnamed: 0,sepal width (cm)
135,3.0
136,3.4
137,3.1
138,3.0
139,3.1
140,3.1


## isin

In [59]:
# Element-wise membership test: True where a cell's value is 0 or 2
df.isin([0, 2])
# Is not in: `~` inverts the boolean result
~df.isin([0, 2])

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
145,False,False,False,False
146,False,False,False,False
147,False,False,False,True
148,False,False,False,False


## any and all

In [96]:
# For each row, check whether 'sepal length (cm)' equals one of the whole
# numbers 6 or 7 (range(6, 8) stops before 8).
num_range = list(range(6, 8))

# Double brackets keep a one-column DataFrame, so isin() returns a boolean
# DataFrame and any(axis=1) collapses each row to a single True/False.
df[['sepal length (cm)']].isin(num_range).any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Length: 150, dtype: bool

In [93]:
# Use the per-row boolean result as a mask to select the matching rows.
row_mask = df[['sepal length (cm)']].isin(num_range).any(axis=1)

df.loc[row_mask]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
50,7.0,3.2,4.7,1.4
62,6.0,2.2,4.0,1.0
78,6.0,2.9,4.5,1.5
83,6.0,2.7,5.1,1.6
85,6.0,3.4,4.5,1.6
119,6.0,2.2,5.0,1.5
138,6.0,3.0,4.8,1.8


## where

## replace

## apply
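Apply a function along an axis of the DataFrame. By default (`axis=0`) the function is applied to each column; pass `axis=1` to apply it to each row.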

In [39]:
# Apply np.sqrt to the DataFrame
df.apply(np.sqrt)
# Apply np.sum along axis=0, giving one total per column
df.apply(np.sum, axis=0)

sepal length (cm)    876.5
sepal width (cm)     458.6
petal length (cm)    563.7
petal width (cm)     179.9
dtype: float64