In [1]:
import pandas as pd
import numpy as np

# Column values paired with the dtype each Series is built with
# (None lets pandas infer the dtype, as for tax_rate).
stock_columns = {
    'item_no': ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'Int64'),
    'cost_class': (['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', np.nan, '1st', '3rd'], 'string'),
    'cost': ([10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, None], 'float64'),
    'stock_code': (['a', 'a', 'c', 'b', 'a', 'b', np.nan, np.nan, 'a', 'c'], 'string'),
    'priority_code': ([np.nan, None, 'a', 'b', None, 'a', 'e', None, 'a', 'd'], 'string'),
    'tax_rate': ([0, 0, 20, 20, 20, 0, 20, 20, 5, 20], None),
}

# Build one Series per column, then index the frame by item number.
stock = pd.DataFrame({
    name: pd.Series(values, dtype=dtype)
    for name, (values, dtype) in stock_columns.items()
}).set_index('item_no')

In [2]:
# Assigning a scalar to a new column label via .loc creates the column
# and broadcasts the value to every row.
stock.loc[:, "year"] = 2020

In [3]:
# Confirm the new "year" column is present
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,2020
2,2nd,,a,,0,2020
3,3rd,2.99,c,a,20,2020
4,4th,,b,b,20,2020
5,4th,2.99,a,,20,2020
6,3rd,2.45,b,a,0,2020
7,2nd,5.99,,e,20,2020
8,,5.99,,,20,2020
9,1st,3.0,a,a,5,2020
10,3rd,,c,d,20,2020


In [4]:
# assign() adds several columns at once and returns a new DataFrame;
# the result is only displayed here, not stored back into `stock`.
stock.assign(new_year=2021, checked=True)

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,new_year,checked
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1st,10.99,a,,0,2020,2021,True
2,2nd,,a,,0,2020,2021,True
3,3rd,2.99,c,a,20,2020,2021,True
4,4th,,b,b,20,2020,2021,True
5,4th,2.99,a,,20,2020,2021,True
6,3rd,2.45,b,a,0,2020,2021,True
7,2nd,5.99,,e,20,2020,2021,True
8,,5.99,,,20,2020,2021,True
9,1st,3.0,a,a,5,2020,2021,True
10,3rd,,c,d,20,2020,2021,True


## Mutate column using a list comprehension

In [5]:
# Lookup table: cost class -> cost adjustment.
# The pd.NA key gives rows with a missing cost class an explicit NaN adjustment.
adjust_lookup = dict([
    ('1st', 12.5),
    ('2nd', 5),
    ('3rd', 0),
    ('4th', -5),
    (pd.NA, np.nan),
])

adjust_lookup

{'1st': 12.5, '2nd': 5, '3rd': 0, '4th': -5, <NA>: nan}

In [6]:
# dict.get returns the value stored under an existing key
adjust_lookup.get("4th")

-5

In [7]:
# "6th" is not a key, so .get falls back to the default passed as its second argument
adjust_lookup.get("6th", np.nan)

nan

In [10]:
# `stock` still has no new_year/checked columns: assign() did not modify it
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,2020
2,2nd,,a,,0,2020
3,3rd,2.99,c,a,20,2020
4,4th,,b,b,20,2020
5,4th,2.99,a,,20,2020
6,3rd,2.45,b,a,0,2020
7,2nd,5.99,,e,20,2020
8,,5.99,,,20,2020
9,1st,3.0,a,a,5,2020
10,3rd,,c,d,20,2020


In [12]:
# Iterating over a Series yields its values one by one, missing entries included
[cc
 for cc in stock.cost_class
]

['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', <NA>, '1st', '3rd']

In [13]:
# Translate each cost_class into its adjustment; a class that is not a key
# of the dictionary would fall back to NaN
[adjust_lookup.get(cc, np.nan)
 for cc in stock.cost_class
]

[12.5, 5, 0, -5, -5, 0, 5, nan, 12.5, 0]

In [14]:
# Store the looked-up adjustments as a new column. The list has one entry
# per row of `stock`, in row order.
stock.loc[:, "cost_adjustment"] = [adjust_lookup.get(cc, np.nan)
                                   for cc in stock.cost_class
                                  ]

In [15]:
# Check the new cost_adjustment column (empty where cost_class is missing)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,2.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
8,,5.99,,,20,2020,
9,1st,3.0,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


In [20]:
# Cost plus tax: tax_rate is divided by 100, i.e. treated as a percentage.
# Rows with a missing cost come out as NaN.
stock.loc[:, "stock_including_tax"] = stock.cost + stock.tax_rate * stock.cost / 100
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment,stock_including_tax
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1st,10.99,a,,0,2020,12.5,10.99
2,2nd,,a,,0,2020,5.0,
3,3rd,2.99,c,a,20,2020,0.0,3.588
4,4th,,b,b,20,2020,-5.0,
5,4th,2.99,a,,20,2020,-5.0,3.588
6,3rd,2.45,b,a,0,2020,0.0,2.45
7,2nd,5.99,,e,20,2020,5.0,7.188
8,,5.99,,,20,2020,,7.188
9,1st,3.0,a,a,5,2020,12.5,3.15
10,3rd,,c,d,20,2020,0.0,


In [19]:
# Round the tax-inclusive value to 2 decimal places, overwriting the column
stock.loc[:, "stock_including_tax"] = stock.stock_including_tax.round(2)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment,stock_including_tax
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1st,10.99,a,,0,2020,12.5,10.99
2,2nd,,a,,0,2020,5.0,
3,3rd,2.99,c,a,20,2020,0.0,3.59
4,4th,,b,b,20,2020,-5.0,
5,4th,2.99,a,,20,2020,-5.0,3.59
6,3rd,2.45,b,a,0,2020,0.0,2.45
7,2nd,5.99,,e,20,2020,5.0,7.19
8,,5.99,,,20,2020,,7.19
9,1st,3.0,a,a,5,2020,12.5,3.15
10,3rd,,c,d,20,2020,0.0,


In [24]:
# drop() returns a new frame without the column; reassign to keep the change
stock = stock.drop("stock_including_tax", axis="columns")

In [25]:
# The stock_including_tax column is gone
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,2.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
8,,5.99,,,20,2020,
9,1st,3.0,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


In [30]:
# drop any row with cost_class NA

# Collect the index labels of the rows whose cost_class is missing once,
# then pass them to drop(). The result is only displayed, not stored.
missing_class_labels = stock.index[stock.cost_class.isna()]
stock.drop(missing_class_labels, axis="rows")

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,2.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
9,1st,3.0,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


In [32]:
# With no arguments, dropna() removes every row that has a missing value in
# any column. The result is only displayed; `stock` itself is unchanged.
stock.dropna()

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3rd,2.99,c,a,20,2020,0.0
6,3rd,2.45,b,a,0,2020,0.0
9,1st,3.0,a,a,5,2020,12.5


In [34]:
# Only drop rows whose cost_class is missing, and keep the result
stock = stock.dropna(axis="rows", subset = ["cost_class"])

In [35]:
# Row 8 (missing cost_class) has been removed
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,2.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
9,1st,3.0,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


Fill any missing values in `cost` with the median cost from the whole dataset

* rows 2, 4 & 10

In [37]:
# Replace missing costs with the median of the known costs, rounded to 2 d.p.
median_cost = np.round(stock.cost.median(), 2)
stock = stock.fillna({"cost": median_cost})

stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
2,2nd,3.0,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,3.0,b,b,20,2020,-5.0
5,4th,2.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
9,1st,3.0,a,a,5,2020,12.5
10,3rd,3.0,c,d,20,2020,0.0


## Chained indexing

`SettingWithCopyWarning` - a common nuisance!

> The costs of items in the 1st `cost_class` with `stock_code` "a" are wrong: they need to be reduced by 10%

In [38]:
# Boolean mask: True for rows that are in cost class "1st" AND have stock code "a"
mask = (stock.cost_class == "1st") & (stock.stock_code == "a")

mask

item_no
1      True
2     False
3     False
4     False
5     False
6     False
7     False
9      True
10    False
dtype: boolean

In [39]:
# Select only the rows where the mask is True
stock.loc[mask]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
9,1st,3.0,a,a,5,2020,12.5


In [40]:
# Deliberately wrong, to trigger SettingWithCopyWarning: stock[mask] produces a
# new object, so the ['cost'] assignment is applied to that temporary object.
stock[mask]['cost'] = (stock[mask]['cost'] * 0.9).round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock[mask]['cost'] = (stock[mask]['cost'] * 0.9).round(2)


In [42]:
# Also chained indexing, with the two lookups swapped; it raises the same warning.
# NOTE(review): whether `stock` is actually modified here presumably depends on
# the pandas version / copy-on-write setting -- do not rely on it.
stock['cost'][mask] = (stock[mask]['cost'] * 0.9).round(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock['cost'][mask] = (stock[mask]['cost'] * 0.9).round(2)


### Chained indexing - DON'T USE

### This is different from method chaining

In [55]:
# reset dataframe
# Restore the original cost values. The Series is labelled 1..10 so the
# assignment can line up with the item_no index of `stock`.

original_stock_costs = pd.Series([10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, None],
                                 index = range(1, 11))
stock.loc[:, 'cost'] = original_stock_costs

In [56]:
# The two masked rows show their original costs again
stock[mask]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2020,12.5
9,1st,3.0,a,a,5,2020,12.5


In [57]:
# Correct approach: select rows and column in a single .loc call on both sides,
# so the assignment is applied to `stock` itself rather than a temporary copy.
stock.loc[mask, 'cost'] = (stock.loc[mask, 'cost'] * 0.9).round(2)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,9.89,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,2.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
9,1st,2.7,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


Task

Add 5.00 to the cost of all the items with `stock_code` 'a'.

Make sure you persist this change to the original `stock` DataFrame and not to a copy.

In [58]:
# Rows with stock code 'a': read and write them through one .loc call each,
# so the change is applied to `stock` itself.
has_code_a = stock.stock_code == 'a'
stock.loc[has_code_a, 'cost'] = stock.loc[has_code_a, 'cost'] + 5
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,14.89,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,2.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,7.99,a,,20,2020,-5.0
6,3rd,2.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
9,1st,7.7,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


In [60]:
# Work on an independent copy so `stock` is left untouched
stock_copy = stock.copy()

# Add 2 to every cost below 3 (the mask is taken from `stock`, which shares the copy's index)
below_three = stock.cost < 3
stock_copy.loc[below_three, 'cost'] = stock_copy.cost[below_three] + 2
stock_copy

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,year,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,14.89,a,,0,2020,12.5
2,2nd,,a,,0,2020,5.0
3,3rd,4.99,c,a,20,2020,0.0
4,4th,,b,b,20,2020,-5.0
5,4th,7.99,a,,20,2020,-5.0
6,3rd,4.45,b,a,0,2020,0.0
7,2nd,5.99,,e,20,2020,5.0
9,1st,7.7,a,a,5,2020,12.5
10,3rd,,c,d,20,2020,0.0


In [61]:
def z_score(series):
    """Standardise ``series`` to z-scores.

    Each value has the series mean subtracted and is then divided by the
    series standard deviation, as returned by ``series.std()`` (for a pandas
    Series this is the sample standard deviation by default).

    Returns the result of that arithmetic; missing values stay missing.
    """
    centred = series - series.mean()
    return centred / series.std()

In [62]:
# Independent two-column copy to experiment on
cost_copy = stock[["cost", "cost_class"]].copy()
cost_copy

Unnamed: 0_level_0,cost,cost_class
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1,14.89,1st
2,,2nd
3,2.99,3rd
4,,4th
5,7.99,4th
6,2.45,3rd
7,5.99,2nd
9,7.7,1st
10,,3rd


In [63]:
# Add the standardised cost as a new column; rows with a missing cost stay NaN
cost_copy['cost_zscore'] = z_score(cost_copy.cost)

cost_copy

Unnamed: 0_level_0,cost,cost_class,cost_zscore
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,14.89,1st,1.750588
2,,2nd,
3,2.99,3rd,-0.890274
4,,4th,
5,7.99,4th,0.219332
6,2.45,3rd,-1.010111
7,5.99,2nd,-0.22451
9,7.7,1st,0.154975
10,,3rd,
