In [1]:
# adds janitor to use just clean_names
from janitor import clean_names

In [8]:
import pandas as pd
import numpy as np

# Build the example inventory table column by column. Each entry is
# (column name, raw values, dtype); a dtype of None lets pandas infer it.
# Note that item_no contains the value 2 twice.
stock = pd.DataFrame({
    column: pd.Series(values, dtype=dtype)
    for column, values, dtype in [
        ('item_no', [1, 2, 2, 4, 5, 6, 7, 8, 9, 10], 'Int64'),
        ('cost_class', ['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', np.nan, '1st', '3rd'], 'string'),
        ('cost', [10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, None], 'float64'),
        ('stock_code', ['a', 'a', 'c', 'b', 'a', 'b', np.nan, np.nan, 'a', 'c'], 'string'),
        ('priority_code', [np.nan, None, 'a', 'b', None, 'a', 'e', None, 'a', 'd'], 'string'),
        ('tax_rate', [0, 0, 20, 20, 20, 0, 20, 20, 5, 20], None),
    ]
})

stock

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0
6,7,2nd,5.99,,e,20
7,8,,5.99,,,20
8,9,1st,3.0,a,a,5
9,10,3rd,,c,d,20


# 1 Indexing

In [9]:
# Promote the item_no column to the row index; inplace=True mutates stock directly.
stock.set_index('item_no', inplace = True)

In [10]:
# Show the frame now that item_no is the index.
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
2,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20
6,3rd,2.45,b,a,0
7,2nd,5.99,,e,20
8,,5.99,,,20
9,1st,3.0,a,a,5
10,3rd,,c,d,20


In [12]:
# Check whether every index label is unique (True) or some label repeats (False).
stock.index.is_unique

False

In [14]:
# Show the rows whose index label has already appeared earlier in the frame
# (keep='first' is the default: the first occurrence is not flagged).
stock.loc[stock.index.duplicated(keep='first')]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,3rd,2.99,c,a,20


In [15]:
# Label-based lookup: .loc[2] returns every row whose index label is 2.
stock.loc[2]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,2nd,,a,,0
2,3rd,2.99,c,a,20



We can change the labels of an `Index` in two ways:

- Provide some sort of collection of values of the correct size (either the number of rows or number of columns)
- Use the `.rename()` method, specifying the `index=` argument


In [17]:
# Replace the duplicated item_no labels with fresh 1-based labels.
# The range is sized from the frame itself so it always has one label per
# row, rather than relying on a hard-coded upper bound.
stock.index = range(1, len(stock) + 1)
# Assigning a new index drops the old name, so restore it.
stock.index.name = "item_no"
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20
6,3rd,2.45,b,a,0
7,2nd,5.99,,e,20
8,,5.99,,,20
9,1st,3.0,a,a,5
10,3rd,,c,d,20




**Task - 2 mins**

- Reset the `Index` using the `reset_index()` method (passing in arguments `inplace=True` and `drop=False`)
- What does the `drop` argument do in the line above? Investigate this.
- Now set the `Index` back once again to the `item_no` column using the `set_index()` method. Make sure the change is persisted in the `DataFrame`.



In [18]:
# drop=False tells pandas to insert the current index as a column
# in the DataFrame before it creates the default RangeIndex object,
# starting at 0.
stock.reset_index(inplace= True, drop= False)

In [19]:
# Show the frame after the reset: item_no is an ordinary column again.
stock

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,3,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0
6,7,2nd,5.99,,e,20
7,8,,5.99,,,20
8,9,1st,3.0,a,a,5
9,10,3rd,,c,d,20


In [20]:
# Set the index back to item_no again; inplace=True persists the change on stock.
stock.set_index("item_no", inplace=True)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20
6,3rd,2.45,b,a,0
7,2nd,5.99,,e,20
8,,5.99,,,20
9,1st,3.0,a,a,5
10,3rd,,c,d,20


# 2 Indexing with Loc

In [21]:
# .loc selects by index label: the rows labelled 1, 2 and 3, with all columns.
stock.loc[[1,2,3], :]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20


In [22]:
# .iloc always selects by integer position, which starts at 0 as in Python,
# so positions 1, 2 and 3 are the second, third and fourth rows.
stock.iloc[[1,2,3], :]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,2nd,,a,,0
3,3rd,2.99,c,a,20
4,4th,,b,b,20


# 3 Chained Indexing

In [23]:
# Chained indexing: first select the column, then index the resulting Series.
# Try not to use this form.
stock['cost'][[1,2]]

item_no
1    10.99
2      NaN
Name: cost, dtype: float64

In [24]:
# Better: a single .loc call that selects the row labels and the column together.
stock.loc[[1,2], 'cost']

item_no
1    10.99
2      NaN
Name: cost, dtype: float64

# 4 Indexing for alignment

In [25]:
# A short Series keyed by the labels 2, 3, 4 and 6 (label -> value).
new_series = pd.Series({2: 'a', 3: 'b', 4: 'c', 6: 'd'})
new_series

2    a
3    b
4    c
6    d
dtype: object

In [26]:
# Show the frame before the new column is added.
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20
6,3rd,2.45,b,a,0
7,2nd,5.99,,e,20
8,,5.99,,,20
9,1st,3.0,a,a,5
10,3rd,,c,d,20


In [27]:
# Assignment aligns on index labels: each value lands on the row with the
# matching item_no, and rows without a matching label are left missing.
stock.loc[:, 'new'] = new_series

In [28]:
# Show the frame with the newly aligned 'new' column.
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,
2,2nd,,a,,0,a
3,3rd,2.99,c,a,20,b
4,4th,,b,b,20,c
5,4th,2.99,a,,20,
6,3rd,2.45,b,a,0,d
7,2nd,5.99,,e,20,
8,,5.99,,,20,
9,1st,3.0,a,a,5,
10,3rd,,c,d,20,


# 5 Indexing : Missing values

In [30]:
# Keep only the rows whose cost value is missing.
stock.loc[stock['cost'].isna()]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,2nd,,a,,0,a
4,4th,,b,b,20,c
10,3rd,,c,d,20,


In [31]:
# Find rows with at least one missing value: any(axis='columns') reduces
# across the columns, giving one boolean per row.
stock.loc[stock.isna().any(axis = 'columns')]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,
2,2nd,,a,,0,a
4,4th,,b,b,20,c
5,4th,2.99,a,,20,
7,2nd,5.99,,e,20,
8,,5.99,,,20,
9,1st,3.0,a,a,5,
10,3rd,,c,d,20,


In [32]:
# Find complete rows: keep a row only when every one of its columns is present.
stock.loc[stock.notna().all(axis='columns')]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,3rd,2.99,c,a,20,b
6,3rd,2.45,b,a,0,d


**Task - 2 mins**

Consider the output of the following code:  

In [33]:  
stock.notna().all(axis='rows')  
Out[33]:  
cost_class       False  
cost             False  
stock_code       False  
priority_code    False  
tax_rate          True  
dtype: bool  
- How could we use this to output all complete columns (i.e. columns without missing values)?
- How can we count how many missing values there are in each column?

In [33]:
# One boolean per column: True only when the column has no missing values.
stock.notna().all(axis='rows')

cost_class       False
cost             False
stock_code       False
priority_code    False
tax_rate          True
new              False
dtype: bool

In [34]:
# Output all complete columns: the per-column boolean Series is used as a
# column mask in .loc.
stock.loc[:, stock.notna().all(axis='rows')]

Unnamed: 0_level_0,tax_rate
item_no,Unnamed: 1_level_1
1,0
2,0
3,20
4,20
5,20
6,0
7,20
8,20
9,5
10,20


In [36]:
# Count the missing values in each column: each True from isna() counts as 1
# when summed down the rows.
stock.isna().sum(axis = 'rows')

cost_class       1
cost             3
stock_code       2
priority_code    4
tax_rate         0
new              6
dtype: int64

In [37]:
# Attribute access and bracket access both select the cost column;
# only the last expression in the cell is displayed.
stock.cost

stock['cost']

item_no
1     10.99
2       NaN
3      2.99
4       NaN
5      2.99
6      2.45
7      5.99
8      5.99
9      3.00
10      NaN
Name: cost, dtype: float64