# Index Columns

* setting a new index
* filtering using an index
* adding information using an index (mini-join)

# Advanced Filtering

* pandas negation
* finding missing values

## Convenient Column Subsetting

* str accessor methods
* drop columns

In [1]:
import pandas as pd
import numpy as np

# Toy inventory table used throughout the notebook. Missing entries are written
# as pd.NA in the nullable string columns and as np.nan in the float column, so
# several columns contain gaps that later cells filter on.
stock = pd.DataFrame(
    {
        "item_no": pd.Series([1, 2, 2, 4, 5, 6, 7, 8, 9, 10], dtype="Int64"),
        "cost_class": pd.Series(
            ["1st", "2nd", "3rd", "4th", "4th", "3rd", "2nd", pd.NA, "1st", "3rd"],
            dtype="string",
        ),
        "cost": pd.Series(
            [10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, np.nan],
            dtype="float64",
        ),
        "stock_code": pd.Series(
            ["a", "a", "c", "b", "a", "b", pd.NA, pd.NA, "a", "c"],
            dtype="string",
        ),
        "priority_code": pd.Series(
            [pd.NA, pd.NA, "a", "b", pd.NA, "a", "e", pd.NA, "a", "d"],
            dtype="string",
        ),
        # no dtype given: pandas infers a plain integer column
        "tax_rate": pd.Series([0, 0, 20, 20, 20, 0, 20, 20, 5, 20]),
    }
)

stock

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0
6,7,2nd,5.99,,e,20
7,8,,5.99,,,20
8,9,1st,3.0,a,a,5
9,10,3rd,,c,d,20


In [2]:
# per-column dtype and non-null count, plus the index range and memory usage
stock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   item_no        10 non-null     Int64  
 1   cost_class     9 non-null      string 
 2   cost           7 non-null      float64
 3   stock_code     8 non-null      string 
 4   priority_code  6 non-null      string 
 5   tax_rate       10 non-null     int64  
dtypes: Int64(1), float64(1), int64(1), string(3)
memory usage: 618.0 bytes


In [5]:
# include="all" also summarises the string columns (count/unique/top/freq),
# not only the numeric ones
stock.describe(include="all")

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
count,10.0,9,7.0,8,6,10.0
unique,,4,,3,4,
top,,3rd,,a,a,
freq,,3,,4,3,
mean,5.4,,4.914286,,,12.5
std,3.134042,,3.065169,,,9.78945
min,1.0,,2.45,,,0.0
25%,2.5,,2.99,,,1.25
50%,5.5,,3.0,,,20.0
75%,7.75,,5.99,,,20.0


In [4]:
# (number of rows, number of columns)
stock.shape

(10, 6)

`.axes` returns a list holding both axes of the DataFrame:
<br>
`[row index, column index]`

In [6]:
# a two-element list: the row index first, then the column index
stock.axes

[RangeIndex(start=0, stop=10, step=1),
 Index(['item_no', 'cost_class', 'cost', 'stock_code', 'priority_code',
        'tax_rate'],
       dtype='object')]

In [7]:
# .loc slices by label and keeps both end points: labels 0 through 5 -> six rows
stock.loc[0:5]

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0


## Setting a new index

```python
df.index = pd.RangeIndex(start, stop, name="index_name")
```

Alternatively, use an existing column as the index:

```python
df.set_index("column_name", inplace=True)
```

In [8]:
# relabel the rows 13..22 (stop is exclusive); the index itself is named "index"
stock.index = pd.RangeIndex(start=13, stop=23, name="index")

In [9]:
# display the frame to check the row labels
stock

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,1,1st,10.99,a,,0
14,2,2nd,,a,,0
15,2,3rd,2.99,c,a,20
16,4,4th,,b,b,20
17,5,4th,2.99,a,,20
18,6,3rd,2.45,b,a,0
19,7,2nd,5.99,,e,20
20,8,,5.99,,,20
21,9,1st,3.0,a,a,5
22,10,3rd,,c,d,20


In [10]:
# the slice refers to row labels (13, 14, 15), not positions; the end label is included
stock.loc[13:15]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,1,1st,10.99,a,,0
14,2,2nd,,a,,0
15,2,3rd,2.99,c,a,20


In [11]:
# add a column of unique single-letter codes, one per row;
# a later cell turns this column into the index
stock["item_code"] = list("abcdefghij")

In [12]:
# display the frame: item_code appears as the last column
stock

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13,1,1st,10.99,a,,0,a
14,2,2nd,,a,,0,b
15,2,3rd,2.99,c,a,20,c
16,4,4th,,b,b,20,d
17,5,4th,2.99,a,,20,e
18,6,3rd,2.45,b,a,0,f
19,7,2nd,5.99,,e,20,g
20,8,,5.99,,,20,h
21,9,1st,3.0,a,a,5,i
22,10,3rd,,c,d,20,j


In [14]:
# Promote the item_code column to the row index. Reassigning the returned frame
# has the same effect on `stock` as passing `inplace=True`, but keeps the call
# chainable and makes the state change explicit.
# Note: the previous index (named "index") is discarded, and re-running this
# cell raises KeyError because item_code is no longer a column.
stock = stock.set_index("item_code")

In [15]:
# with item_code as the index, rows are looked up by their letter labels
stock.loc[["f", "i"]]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
f,6,3rd,2.45,b,a,0
i,9,1st,3.0,a,a,5


In [17]:
# .iloc selects by zero-based position instead of by label:
# here row positions 5 and 8, column positions 1 and 4
row_positions, col_positions = [5, 8], [1, 4]
stock.iloc[row_positions, col_positions]

Unnamed: 0_level_0,cost_class,priority_code
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1
f,3rd,a
i,1st,a


In [19]:
# item types keyed by index label; the dict keys become the index of the series
new_series = pd.Series(
    {
        "a": "pen",
        "c": "pencil",
        "e": "calculator",
        "i": "ruler",
    }
)

In [20]:
# display the series: its index holds the labels a, c, e and i
new_series

a           pen
c        pencil
e    calculator
i         ruler
dtype: object

In [21]:
# "mini-join": the series is aligned on index labels, so only rows a, c, e and i
# receive a value; every other row gets a missing item_type
stock = stock.assign(item_type=new_series)

stock

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,1,1st,10.99,a,,0,pen
b,2,2nd,,a,,0,
c,2,3rd,2.99,c,a,20,pencil
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
i,9,1st,3.0,a,a,5,ruler
j,10,3rd,,c,d,20,


## More on Filtering

#### negating conditions

In [24]:
# keep every row whose item_no is not 2
stock.loc[stock["item_no"] != 2, :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,1,1st,10.99,a,,0,pen
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
i,9,1st,3.0,a,a,5,ruler
j,10,3rd,,c,d,20,


In [26]:
# ~ flips a boolean mask element-wise, so this selects the same rows as `!= 2`
not_item_two = ~(stock["item_no"] == 2)
stock.loc[not_item_two, :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,1,1st,10.99,a,,0,pen
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
i,9,1st,3.0,a,a,5,ruler
j,10,3rd,,c,d,20,


In [27]:
# find all rows whose item_type is neither "pen" nor "pencil"
# (rows with a missing item_type are kept too: isin() is False for them,
# so the negated mask is True)
is_pen_or_pencil = stock["item_type"].isin(["pen", "pencil"])
stock.loc[~is_pen_or_pencil, :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b,2,2nd,,a,,0,
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
i,9,1st,3.0,a,a,5,ruler
j,10,3rd,,c,d,20,


In [28]:
# missing values in an individual column: rows where cost is NA
stock.loc[stock["cost"].isna(), :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b,2,2nd,,a,,0,
d,4,4th,,b,b,20,
j,10,3rd,,c,d,20,


In [29]:
# across all columns: element-wise check, True wherever a value is missing
stock.isna()

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,False,False,False,False,True,False,False
b,False,False,True,False,True,False,True
c,False,False,False,False,False,False,False
d,False,False,True,False,False,False,True
e,False,False,False,False,True,False,False
f,False,False,False,False,False,False,True
g,False,False,False,True,False,False,True
h,False,True,False,True,True,False,True
i,False,False,False,False,False,False,False
j,False,False,True,False,False,False,True


In [30]:
# collapse the element-wise mask to one boolean per row:
# True if at least one column in that row is NA, otherwise False
stock.isna().any(axis=1)

item_code
a     True
b     True
c    False
d     True
e     True
f     True
g     True
h     True
i    False
j     True
dtype: bool

In [31]:
# rows that contain at least one NA
has_missing = stock.isna().any(axis="columns")
stock.loc[has_missing]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,1,1st,10.99,a,,0,pen
b,2,2nd,,a,,0,
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
j,10,3rd,,c,d,20,


In [32]:
# rows with zero NAs: "no value is missing" is the same as "every value is present"
stock.loc[stock.notna().all(axis="columns")]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c,2,3rd,2.99,c,a,20,pencil
i,9,1st,3.0,a,a,5,ruler


## Bonus Selection Techniques

* helpers for subsetting columns

Select all columns that begin with a 'c'

In [33]:
# one boolean per column name: True where the name starts with "c"
stock.columns.str.startswith("c")

array([False,  True,  True, False, False, False, False])

In [34]:
# use the boolean array as a column selector: all rows, only the "c..." columns
starts_with_c = stock.columns.str.startswith("c")
stock.loc[:, starts_with_c]

Unnamed: 0_level_0,cost_class,cost
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1st,10.99
b,2nd,
c,3rd,2.99
d,4th,
e,4th,2.99
f,3rd,2.45
g,2nd,5.99
h,,5.99
i,1st,3.0
j,3rd,


In [35]:
# regex "^.o": any one character followed by "o" at the start of the name,
# i.e. columns whose second letter is "o"
second_letter_is_o = stock.columns.str.contains(r"^.o")
stock.loc[:, second_letter_is_o]

Unnamed: 0_level_0,cost_class,cost
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1st,10.99
b,2nd,
c,3rd,2.99
d,4th,
e,4th,2.99
f,3rd,2.45
g,2nd,5.99
h,,5.99
i,1st,3.0
j,3rd,


## Dropping Columns

In [36]:
# drop() returns a new frame; the result is only displayed here,
# so stock itself keeps both columns
stock.drop(["cost", "cost_class"], axis="columns")

Unnamed: 0_level_0,item_no,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1,a,,0,pen
b,2,a,,0,
c,2,c,a,20,pencil
d,4,b,b,20,
e,5,a,,20,calculator
f,6,b,a,0,
g,7,,e,20,
h,8,,,20,
i,9,a,a,5,ruler
j,10,c,d,20,


In [37]:
# collect the column labels that start with "i" by indexing the columns
# with a boolean mask
starts_with_i = stock.columns.str.startswith("i")
item_cols = stock.columns[starts_with_i]

In [38]:
# drop every column collected in item_cols (returns a new frame, stock is unchanged)
stock.drop(item_cols, axis="columns")

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1st,10.99,a,,0
b,2nd,,a,,0
c,3rd,2.99,c,a,20
d,4th,,b,b,20
e,4th,2.99,a,,20
f,3rd,2.45,b,a,0
g,2nd,5.99,,e,20
h,,5.99,,,20
i,1st,3.0,a,a,5
j,3rd,,c,d,20


## Dropping rows

* use index labels

In [39]:
# rows are dropped by index label; the result is a new frame without rows a and i
stock.drop(index=["a", "i"])

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b,2,2nd,,a,,0,
c,2,3rd,2.99,c,a,20,pencil
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
j,10,3rd,,c,d,20,
