# DataFrame - Basic Slicing

In [1]:
import pandas as pd
# Show the installed pandas version so readers know what the outputs below were produced with
pd.__version__

u'0.18.1'

In [2]:
# Features are the column names of the example frame
feat = ['A', 'B', 'C', 'D']
# drop_feat lists the columns we want to remove at some point
drop_feat = "A D".split()

In [3]:
# Display the list of column names
feat

['A', 'B', 'C', 'D']

In [4]:
# Display the list of columns that will be dropped later
drop_feat


['A', 'D']

In [5]:
# Preview the raw data: four identical rows [0, 1, 2, 3].
# list(...) materialises each row so the values are displayed on Python 3 too,
# where range() is lazy and would otherwise print as `range(0, 4)`.
[list(range(4)) for _ in range(4)]

[[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]

In [6]:
# Build the example frame: four rows of 0..3, one column per name in `feat`
df = pd.DataFrame(data=[range(4)] * 4, columns=feat)

In [7]:
# Display the whole frame (small enough to show in full)
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,0,1,2,3
2,0,1,2,3
3,0,1,2,3


## Selecting

#### Selecting columns with a **list of strings**, selects columns by name

In [8]:
# A list of column names selects those columns, in the order given
cols = "B C D".split()
df[cols]

Unnamed: 0,B,C,D
0,1,2,3
1,1,2,3
2,1,2,3
3,1,2,3



#### Selecting columns with a **list of integers** selects columns by position

In [9]:
# Select the 3rd and 4th columns by integer position.
# `.iloc` is explicit about positional access; plain `df[[2, 3]]` looks the
# integers up as column *labels* and raises KeyError on current pandas,
# because the columns are labelled 'A'..'D'.
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
0,2,3
1,2,3
2,2,3
3,2,3


#### More Complex Selection : Slicing rows by index, columns by name

In [10]:
# Rows by label slice (the end label 2 is included), column 'B' by name.
# `.loc` is the label-based indexer; `.ix` was deprecated in pandas 0.20
# and removed in 1.0.
df.loc[:2, ["B"]]

Unnamed: 0,B
0,1
1,1
2,1


#### Repeating a row label in the selection list returns that row once per occurrence

In [11]:
# A list of row labels may contain duplicates and be in any order; each
# occurrence yields one row in the result.
# `.loc` replaces `.ix`, which is no longer available in current pandas.
df.loc[[2, 1, 2, 2, 2, 2]]

Unnamed: 0,A,B,C,D
2,0,1,2,3
1,0,1,2,3
2,0,1,2,3
2,0,1,2,3
2,0,1,2,3
2,0,1,2,3


### Returns : DataFrame vs Series

#### With a list for every selector you get a DataFrame

In [12]:
# A one-element list of column names still returns a DataFrame (one column wide)
df[['B']]

Unnamed: 0,B
0,1
1,1
2,1
3,1


In [13]:
# Lists for both rows and columns: the result is a 1x1 DataFrame.
# `.loc` replaces the removed `.ix` indexer.
df.loc[[2], ["B"]]

Unnamed: 0,B
2,1


#### With a scalar (non-list) selector you get a Series

In [14]:
# A bare column name (not wrapped in a list) returns a Series
df['B']

0    1
1    1
2    1
3    1
Name: B, dtype: int64

In [15]:
# Scalar row label + list of columns: the row is returned as a Series whose
# name is the row label (2) and whose index holds the selected column names.
# `.loc` replaces the removed `.ix` indexer.
df.loc[2, ["B"]]

B    1
Name: 2, dtype: int64

In [16]:
# List of row labels + scalar column name: the column is returned as a Series
# named 'B', indexed by the selected row labels.
# `.loc` replaces the removed `.ix` indexer.
df.loc[[2], "B"]

2    1
Name: B, dtype: int64

## Advanced

In [17]:
# Display the frame again as the starting point for this section
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,0,1,2,3
2,0,1,2,3
3,0,1,2,3


In [18]:
# Experiment on a copy so that changes made to dfx do not alter df
dfx = df.copy()

#### Does order matter?

In [19]:
# Relabel the columns positionally: the data stays where it is, only the
# names change (the second column is now called 'D', the fourth 'C').
# Assign a flat list of names; wrapping it in another list (`[[...]]`) makes
# pandas build a one-level MultiIndex instead of a plain Index.
dfx.columns = 'A D B C'.split()
dfx

Unnamed: 0,A,D,B,C
0,0,1,2,3
1,0,1,2,3
2,0,1,2,3
3,0,1,2,3


In [20]:
# Ask for 'B' before 'D' although 'D' comes first in dfx: the result follows
# the order of the requested list, not the order of the frame.
# `.loc` replaces the removed `.ix` indexer.
cols = ['B', 'D']
dfx.loc[:, cols]

Unnamed: 0,B,D
0,2,1
1,2,1
2,2,1
3,2,1


In [21]:
# Plain [] with a list of names also returns the columns in the requested order
dfx[cols]

Unnamed: 0,B,D
0,2,1
1,2,1
2,2,1
3,2,1


In [22]:
# Inspect the row labels cast to integers (the result is only displayed, not stored)
dfx.index.astype(int)

Int64Index([0, 1, 2, 3], dtype='int64')

In [23]:
# Relabel the rows 1 through 4 (the upper bound 5 of range is exclusive)
dfx.index = range(1,5)

In [24]:
# Cast the row labels to booleans. Every label (1-4) is non-zero, so the
# resulting mask is True everywhere and would select all rows.
dfx.index.astype(bool)

Index([True, True, True, True], dtype='object')

In [29]:
# Remainder of each row label modulo 2: 1 for odd labels, 0 for even labels
dfx.index % 2

Int64Index([1, 0, 1, 0], dtype='int64')

In [30]:
# Create a boolean mask that is True for the rows with an odd label
(dfx.index % 2).astype(bool)

Index([True, False, True, False], dtype='object')

In [31]:
# Build the odd-label mask with a comparison and use it to filter the rows
odd_rows = dfx.index % 2 == 1
dfx[odd_rows]

Unnamed: 0,A,D,B,C
1,0,1,2,3
3,0,1,2,3


#### Using boolean masks intuitively

In [34]:
# Add an 'age' column with one value per row: 38, 39, 40, 41
dfx['age'] = list(range(38, 42))

In [35]:
# A comparison on a column yields a boolean Series; indexing with it keeps
# only the rows where the condition holds (age strictly greater than 40)
over_40 = dfx.age > 40
dfx[over_40]

Unnamed: 0,A,D,B,C,age
4,0,1,2,3,41


#### What happens when you slice columns in an order that does not exist in the frame? (In `dfx`, `B` comes after `D`, so `'B':'D'` selects no columns.)

In [37]:
# Label slice from 'B' to 'D' on dfx, where 'B' is positioned after 'D':
# the slice is empty, so the result keeps all rows but has no columns.
# `.loc` replaces the removed `.ix` indexer.
dfx.loc[:, 'B':'D']

1
2
3
4


In [38]:
# Rows up to and including label 2, columns from 'B' through the last column.
# `.loc` replaces the removed `.ix` indexer.
df.loc[:2, 'B':]

Unnamed: 0,B,C,D
0,1,2,3
1,1,2,3
2,1,2,3


### Cell Assignment

In [40]:
# Assign a single cell in place: row label 3, column 'C'.
# Using one `.loc[row, col]` call (rather than chained indexing) writes to df
# itself; `.loc` replaces the removed `.ix` indexer.
df.loc[3, 'C'] = 9

In [41]:
# Display the frame to confirm the assignment: row 3, column C should now hold 9
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,0,1,2,3
2,0,1,2,3
3,0,1,9,3


## Dropping columns

In [43]:
# Column labels of df that are not listed in drop_feat, returned as an Index.
# NOTE(review): difference() may return the labels sorted rather than in
# frame order; confirm if column order matters.
df.columns.difference(drop_feat)

Index([u'B', u'C'], dtype='object')

In [44]:
# Keep only the columns that are not listed in drop_feat
kept_cols = df.columns.difference(drop_feat)
df = df[kept_cols]

In [45]:
# Display the frame after removing the drop_feat columns
df

Unnamed: 0,B,C
0,1,2
1,1,2
2,1,2
3,1,9


#### OR

In [46]:
# Rebuild the original 4x4 frame (rows 0..3, columns A-D) for the second approach
df = pd.DataFrame([list(range(4)) for _ in range(4)], columns="A B C D".split())

In [47]:
# Remove the drop_feat columns (axis=1 means columns).
# Reassign the result instead of using inplace=True: inplace brings no
# performance benefit and prevents method chaining.
df = df.drop(drop_feat, axis=1)

In [48]:
# Display the frame after dropping the drop_feat columns
df

Unnamed: 0,B,C
0,1,2
1,1,2
2,1,2
3,1,2


### Reading From a ClipBoard

In [49]:
# Parse whatever text is currently on the system clipboard into a DataFrame.
# The result depends entirely on the clipboard contents at run time, so this
# cell is not reproducible: copy some tabular text first, then run it.
pd.read_clipboard()

Unnamed: 0,http://localhost:8888/notebooks/notebooks/demos/groupby-example.ipynb
