# DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn


In [6]:
df = pd.DataFrame(randn(5,4),index='1 2 3 4 5'.split(),columns='7 8 9 10'.split())

In [7]:
df

Unnamed: 0,7,8,9,10
1,-1.715947,-0.278171,-0.169605,-0.233486
2,-0.644372,-0.421756,0.755781,1.458239
3,0.197638,0.341062,1.332541,0.406564
4,1.4711,-1.396421,1.55461,-0.923326
5,0.771683,-1.621382,0.417423,0.110812


## Selection and Indexing


In [9]:
df['7']

1   -1.715947
2   -0.644372
3    0.197638
4    1.471100
5    0.771683
Name: 7, dtype: float64

In [10]:
# Pass a list of column names
df[['8','10']]

Unnamed: 0,8,10
1,-0.278171,-0.233486
2,-0.421756,1.458239
3,0.341062,0.406564
4,-1.396421,-0.923326
5,-1.621382,0.110812


DataFrame Columns are just Series

In [11]:
type(df['7'])

pandas.core.series.Series

**Creating a new column:**

In [12]:
df['15'] = df['7'] + df['8']

In [13]:
df

Unnamed: 0,7,8,9,10,15
1,-1.715947,-0.278171,-0.169605,-0.233486,-1.994118
2,-0.644372,-0.421756,0.755781,1.458239,-1.066128
3,0.197638,0.341062,1.332541,0.406564,0.538699
4,1.4711,-1.396421,1.55461,-0.923326,0.074679
5,0.771683,-1.621382,0.417423,0.110812,-0.849699


** Removing Columns**

In [17]:
df.drop('15',axis=1)

Unnamed: 0,7,8,9,10
1,-1.715947,-0.278171,-0.169605,-0.233486
2,-0.644372,-0.421756,0.755781,1.458239
3,0.197638,0.341062,1.332541,0.406564
4,1.4711,-1.396421,1.55461,-0.923326
5,0.771683,-1.621382,0.417423,0.110812


In [18]:
# Not inplace unless specified!
df

Unnamed: 0,7,8,9,10,15
1,-1.715947,-0.278171,-0.169605,-0.233486,-1.994118
2,-0.644372,-0.421756,0.755781,1.458239,-1.066128
3,0.197638,0.341062,1.332541,0.406564,0.538699
4,1.4711,-1.396421,1.55461,-0.923326,0.074679
5,0.771683,-1.621382,0.417423,0.110812,-0.849699


In [22]:
df.drop('9',axis=1,inplace=True)

In [23]:
df

Unnamed: 0,7,8,10
1,-1.715947,-0.278171,-0.233486
2,-0.644372,-0.421756,1.458239
3,0.197638,0.341062,0.406564
4,1.4711,-1.396421,-0.923326
5,0.771683,-1.621382,0.110812


Can also drop rows this way:

In [24]:
df.drop('2',axis=0)

Unnamed: 0,7,8,10
1,-1.715947,-0.278171,-0.233486
3,0.197638,0.341062,0.406564
4,1.4711,-1.396421,-0.923326
5,0.771683,-1.621382,0.110812


** Selecting Rows**

In [25]:
df.loc['3']

7     0.197638
8     0.341062
10    0.406564
Name: 3, dtype: float64

Or select based off of position instead of label 

In [26]:
df.iloc[2]

7     0.197638
8     0.341062
10    0.406564
Name: 3, dtype: float64

** Selecting subset of rows and columns **

In [28]:
df.loc['2','8']
df

Unnamed: 0,7,8,10
1,-1.715947,-0.278171,-0.233486
2,-0.644372,-0.421756,1.458239
3,0.197638,0.341062,0.406564
4,1.4711,-1.396421,-0.923326
5,0.771683,-1.621382,0.110812


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [30]:
df

Unnamed: 0,7,8,10
1,-1.715947,-0.278171,-0.233486
2,-0.644372,-0.421756,1.458239
3,0.197638,0.341062,0.406564
4,1.4711,-1.396421,-0.923326
5,0.771683,-1.621382,0.110812


In [31]:
df>0

Unnamed: 0,7,8,10
1,False,False,False
2,False,False,True
3,True,True,True
4,True,False,False
5,True,False,True


In [32]:
df[df>0]

Unnamed: 0,7,8,10
1,,,
2,,,1.458239
3,0.197638,0.341062,0.406564
4,1.4711,,
5,0.771683,,0.110812


In [33]:
df[df['7']>0]

Unnamed: 0,7,8,10
3,0.197638,0.341062,0.406564
4,1.4711,-1.396421,-0.923326
5,0.771683,-1.621382,0.110812


In [35]:
df[df['7']>0]['7']

3    0.197638
4    1.471100
5    0.771683
Name: 7, dtype: float64

In [36]:
df[df['7']>0][['7','8']]

Unnamed: 0,7,8
3,0.197638,0.341062
4,1.4711,-1.396421
5,0.771683,-1.621382


For two conditions you can use | and & with parenthesis:

In [37]:
df[(df['7']>0) & (df['8'] > 1)]

Unnamed: 0,7,8,10


## More Index Details


In [38]:
df

Unnamed: 0,7,8,10
1,-1.715947,-0.278171,-0.233486
2,-0.644372,-0.421756,1.458239
3,0.197638,0.341062,0.406564
4,1.4711,-1.396421,-0.923326
5,0.771683,-1.621382,0.110812
