# How To Filter Pandas DataFrame

In [3]:
import pandas as pd

In [4]:
# Source CSV: the Online_Retail transactions file hosted on GitHub.
url = "https://raw.githubusercontent.com/nyangweso-rodgers/Data_Analytics/main/Datasets/Online_Retail.csv"

# Decode as Latin-1 rather than 'unicode_escape'. The unicode_escape codec also
# reads the bytes as Latin-1, but it additionally interprets backslash sequences
# (e.g. "\n", "\x41") that appear in the data, which can corrupt or reject text
# fields. parse_dates converts InvoiceDate to datetime values while loading.
raw_data = pd.read_csv(url, encoding='latin-1', parse_dates=['InvoiceDate'])

# Preview the first three rows to confirm the columns loaded as expected.
raw_data.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom


## Examples Of Filtering

### Example 1: Get Transactions from Brazil
* You can use the __isin()__ method, which keeps the rows whose value appears in the given list.

In [6]:
# isin() builds a Boolean mask that is True where Country appears in the list
# (a single entry here); .loc keeps only the rows where the mask is True.
raw_data.loc[raw_data['Country'].isin(['Brazil'])]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
157299,550201,22423,REGENCY CAKESTAND 3 TIER,16,2011-04-15 10:25:00,10.95,12769.0,Brazil
157300,550201,22699,ROSES REGENCY TEACUP AND SAUCER,24,2011-04-15 10:25:00,2.55,12769.0,Brazil
157301,550201,22697,GREEN REGENCY TEACUP AND SAUCER,24,2011-04-15 10:25:00,2.55,12769.0,Brazil
157302,550201,22698,PINK REGENCY TEACUP AND SAUCER,24,2011-04-15 10:25:00,2.55,12769.0,Brazil
157303,550201,22366,DOORMAT AIRMAIL,10,2011-04-15 10:25:00,6.75,12769.0,Brazil
157304,550201,21430,SET/3 RED GINGHAM ROSE STORAGE BOX,24,2011-04-15 10:25:00,3.39,12769.0,Brazil
157305,550201,22630,DOLLY GIRL LUNCH BOX,24,2011-04-15 10:25:00,1.95,12769.0,Brazil
157306,550201,22662,LUNCH BAG DOLLY GIRL DESIGN,10,2011-04-15 10:25:00,1.65,12769.0,Brazil
157307,550201,22629,SPACEBOY LUNCH BOX,12,2011-04-15 10:25:00,1.95,12769.0,Brazil
157308,550201,22382,LUNCH BAG SPACEBOY DESIGN,10,2011-04-15 10:25:00,1.65,12769.0,Brazil


### Example 2: Get Transactions from Malta and Canada

In [8]:
# isin() accepts any number of values, so one call covers both countries.
raw_data.loc[lambda df: df['Country'].isin(['Malta', 'Canada'])]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
119191,546533,20886,BOX OF 9 PEBBLE CANDLES,12,2011-03-14 13:53:00,1.95,15388.0,Canada
119192,546533,79030D,"TUMBLER, BAROQUE",6,2011-03-14 13:53:00,1.65,15388.0,Canada
119193,546533,21132,SILVER STANDING GNOME,4,2011-03-14 13:53:00,4.25,15388.0,Canada
119194,546533,84879,ASSORTED COLOUR BIRD ORNAMENT,8,2011-03-14 13:53:00,1.69,15388.0,Canada
119195,546533,84755,COLOUR GLASS T-LIGHT HOLDER HANGING,16,2011-03-14 13:53:00,0.65,15388.0,Canada
...,...,...,...,...,...,...,...,...
516550,C579929,22768,FAMILY PHOTO FRAME CORNICE,-2,2011-12-01 09:34:00,9.95,17828.0,Malta
516551,C579929,85034C,3 ROSE MORRIS BOXED CANDLES,-1,2011-12-01 09:34:00,1.25,17828.0,Malta
516552,C579929,72807C,SET/3 VANILLA SCENTED CANDLE IN BOX,-1,2011-12-01 09:34:00,4.25,17828.0,Malta
516553,C579929,22307,GOLD MUG BONE CHINA TREE OF LIFE,-1,2011-12-01 09:34:00,1.06,17828.0,Malta


### Example 3: Get Items whose UnitPrice is greater than $20,000
* Here we are using a __comparison operator__ (`>`) to build a Boolean mask that selects the matching rows.

In [10]:
# The comparison yields one True/False per row; rows marked True are kept.
raw_data.loc[raw_data['UnitPrice'] > 20000]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
222681,C556445,M,Manual,-1,2011-06-10 15:31:00,38970.0,15098.0,United Kingdom


### Example 4: Get Items whose UnitPrice is greater than $20,000 and from United Kingdom
* Occasions may arise where we have to filter our data for multiple conditions.
* Again, we use __Logical Operators__ (here `&`) to combine the individual conditions.

In [14]:
# Each condition is wrapped in parentheses because & binds more tightly than
# the comparison operators; & then combines the two masks element-wise.
raw_data.loc[
    (raw_data['Country'] == 'United Kingdom')
    & (raw_data['UnitPrice'] > 20000)
]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
222681,C556445,M,Manual,-1,2011-06-10 15:31:00,38970.0,15098.0,United Kingdom


__REMARK__: _You can also use other comparison operators such as less than (<), greater than (>), equal to (==), not equal to (!=), etc._

### Example 5: Get Transactions from Belgium
* Using a __Query Function__
* The query function takes in an expression as an argument which evaluates to a Boolean that is used to filter the dataframe.

In [16]:
# query() evaluates the expression against the frame's columns and returns
# the rows for which it is True; column names are written without quotes.
raw_data.query(expr="Country == 'Belgium'")

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
7279,537026,84375,SET OF 20 KIDS COOKIE CUTTERS,12,2010-12-03 16:35:00,2.10,12395.0,Belgium
7280,537026,21217,RED RETROSPOT ROUND CAKE TINS,2,2010-12-03 16:35:00,9.95,12395.0,Belgium
7281,537026,21212,PACK OF 72 RETROSPOT CAKE CASES,120,2010-12-03 16:35:00,0.42,12395.0,Belgium
7282,537026,21977,PACK OF 60 PINK PAISLEY CAKE CASES,120,2010-12-03 16:35:00,0.42,12395.0,Belgium
7283,537026,22417,PACK OF 60 SPACEBOY CAKE CASES,120,2010-12-03 16:35:00,0.42,12395.0,Belgium
...,...,...,...,...,...,...,...,...
541194,581493,23204,CHARLOTTE BAG APPLES DESIGN,10,2011-12-09 10:10:00,0.85,12423.0,Belgium
541195,581493,21108,FAIRY CAKE FLANNEL ASSORTED COLOUR,18,2011-12-09 10:10:00,0.79,12423.0,Belgium
541196,581493,22252,BIRDCAGE DECORATION TEALIGHT HOLDER,12,2011-12-09 10:10:00,1.25,12423.0,Belgium
541197,581493,22807,SET OF 6 T-LIGHTS TOADSTOOLS,6,2011-12-09 10:10:00,2.95,12423.0,Belgium


* We can also query based on multiple conditions
* Get Transactions from Belgium whose __UnitPrice__ > 10

In [18]:
# Conditions inside a query expression can be joined with `and` / `or`.
raw_data.query(expr="Country == 'Belgium' and UnitPrice > 10")

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
7290,537026,POST,POSTAGE,2,2010-12-03 16:35:00,18.00,12395.0,Belgium
34464,539391,22423,REGENCY CAKESTAND 3 TIER,4,2010-12-17 11:51:00,12.75,12417.0,Belgium
34469,539391,22770,MIRROR CORNICE,2,2010-12-17 11:51:00,14.95,12417.0,Belgium
34471,539391,POST,POSTAGE,2,2010-12-17 11:51:00,15.00,12417.0,Belgium
36285,539447,22504,CABIN BAG VINTAGE RETROSPOT,3,2010-12-17 16:00:00,12.75,12395.0,Belgium
...,...,...,...,...,...,...,...,...
531760,580965,22423,REGENCY CAKESTAND 3 TIER,4,2011-12-06 14:52:00,12.75,12417.0,Belgium
531761,580965,23007,SPACEBOY BABY GIFT SET,1,2011-12-06 14:52:00,16.95,12417.0,Belgium
531777,580965,POST,POSTAGE,3,2011-12-06 14:52:00,15.00,12417.0,Belgium
531924,580979,POST,POSTAGE,3,2011-12-06 15:40:00,18.00,12362.0,Belgium


## Str Accessor
* Pandas makes it easy to work with string values. Using the `str` accessor, we can filter records based on the contents of a string column (for example, whether a value contains or starts with a given piece of text).

In [19]:
# Case-sensitive match: keeps countries with a lowercase 'b' anywhere in the
# name. By default str.contains treats the pattern as a regular expression.
raw_data.loc[raw_data['Country'].str.contains('b')]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
72985,542276,82551,LAUNDRY 15C METAL SIGN,12,2011-01-27 10:19:00,1.45,12764.0,Lebanon
72986,542276,21165,BEWARE OF THE CAT METAL SIGN,12,2011-01-27 10:19:00,1.69,12764.0,Lebanon
72987,542276,82600,NO SINGING METAL SIGN,12,2011-01-27 10:19:00,2.10,12764.0,Lebanon
72988,542276,21754,HOME BUILDING BLOCK WORD,6,2011-01-27 10:19:00,5.95,12764.0,Lebanon
72989,542276,21756,BATH BUILDING BLOCK WORD,6,2011-01-27 10:19:00,5.95,12764.0,Lebanon
...,...,...,...,...,...,...,...,...
383504,570026,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,24,2011-10-07 10:37:00,2.55,12781.0,Czech Republic
383505,570026,POST,POSTAGE,1,2011-10-07 10:37:00,40.00,12781.0,Czech Republic
479276,C577151,84459A,PINK METAL CHICKEN HEART,-24,2011-11-18 09:50:00,1.49,12781.0,Czech Republic
479277,C577151,22231,JIGSAW TREE WITH BIRDHOUSE,-15,2011-11-18 09:50:00,1.45,12781.0,Czech Republic


In [21]:
# Keep the rows whose Country begins with an uppercase 'F' (case-sensitive).
raw_data.loc[lambda df: df['Country'].str.startswith('F')]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France
27,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France
28,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
29,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,2010-12-01 08:45:00,0.85,12583.0,France
30,536370,21883,STARS GIFT TAPE,24,2010-12-01 08:45:00,0.65,12583.0,France
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


## nlargest and nsmallest
* Often we only need the records with the highest or lowest values in a column. The `nlargest` and `nsmallest` methods return exactly those rows.

In [23]:
# The three transactions with the largest UnitPrice, highest first
raw_data.nlargest(n=3, columns='UnitPrice')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
222681,C556445,M,Manual,-1,2011-06-10 15:31:00,38970.0,15098.0,United Kingdom
524602,C580605,AMAZONFEE,AMAZON FEE,-1,2011-12-05 11:36:00,17836.46,,United Kingdom
43702,C540117,AMAZONFEE,AMAZON FEE,-1,2011-01-05 09:55:00,16888.02,,United Kingdom


* Get bottom 3 transactions with lowest __UnitPrice__

In [24]:
# The three transactions with the smallest UnitPrice, lowest first
raw_data.nsmallest(n=3, columns='UnitPrice')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
299983,A563186,B,Adjust bad debt,1,2011-08-12 14:51:00,-11062.06,,United Kingdom
299984,A563187,B,Adjust bad debt,1,2011-08-12 14:52:00,-11062.06,,United Kingdom
622,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom


## Tilde sign (~)
* The tilde (`~`) inverts a Boolean mask, so it returns the rows that do __not__ satisfy the filter condition.

In [25]:
# ~ flips every True/False in the mask, so this keeps the countries whose
# name does NOT contain a lowercase 'b'.
raw_data.loc[~raw_data['Country'].str.contains('b')]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [27]:
# Negating isin() excludes the listed countries and keeps everything else.
raw_data.loc[lambda df: ~df['Country'].isin(['United Kingdom', 'Brazil'])]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France
27,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France
28,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
29,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,2010-12-01 08:45:00,0.85,12583.0,France
30,536370,21883,STARS GIFT TAPE,24,2010-12-01 08:45:00,0.65,12583.0,France
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


## isnull | notnull
* Using the __isnull__ method, we can return records that have __NaN__ values and mark them for deletion. 
* Using the __notnull__ method, we can filter for records that do not contain __NaN__ values.

In [30]:
# Transactions that have no CustomerID recorded (the value is NaN)
raw_data.loc[raw_data['CustomerID'].isnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,2010-12-01 11:52:00,0.00,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
541536,581498,85099B,JUMBO BAG RED RETROSPOT,5,2011-12-09 10:26:00,4.13,,United Kingdom
541537,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,2011-12-09 10:26:00,4.13,,United Kingdom
541538,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,2011-12-09 10:26:00,4.96,,United Kingdom
541539,581498,85174,S/4 CACTI CANDLES,1,2011-12-09 10:26:00,10.79,,United Kingdom


In [32]:
# Transactions where a CustomerID is present (the value is not NaN)
raw_data.loc[raw_data['CustomerID'].notnull()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


## Filter
* The `filter` method selects a subset of rows or columns by their __labels__ (index or column names), not by the values stored in them.

In [35]:
# items= lists the exact column labels to keep; all rows are returned.
raw_data.filter(items=['CustomerID', 'Country'], axis='columns')

Unnamed: 0,CustomerID,Country
0,17850.0,United Kingdom
1,17850.0,United Kingdom
2,17850.0,United Kingdom
3,17850.0,United Kingdom
4,17850.0,United Kingdom
...,...,...
541904,12680.0,France
541905,12680.0,France
541906,12680.0,France
541907,12680.0,France


#### Filter the dataframe for rows whose index label contains the digit 6

In [36]:
# like='6' is a substring test on the label, so every index label that
# contains the character '6' is kept (6, 16, 26, ...).
raw_data.filter(like='6', axis='index')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
16,536367,22622,BOX OF VINTAGE ALPHABET BLOCKS,2,2010-12-01 08:34:00,9.95,13047.0,United Kingdom
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France
36,536370,22659,LUNCH BOX I LOVE LONDON,24,2010-12-01 08:45:00,1.95,12583.0,France
46,536371,22086,PAPER CHAIN KIT 50'S CHRISTMAS,80,2010-12-01 09:00:00,2.55,13748.0,United Kingdom
...,...,...,...,...,...,...,...,...
541869,581585,22481,BLACK TEA TOWEL CLASSIC DESIGN,12,2011-12-09 12:31:00,0.39,15804.0,United Kingdom
541876,581585,84945,MULTI COLOUR SILVER T-LIGHT HOLDER,24,2011-12-09 12:31:00,0.85,15804.0,United Kingdom
541886,581585,22398,MAGNETS PACK OF 4 SWALLOWS,12,2011-12-09 12:31:00,0.39,15804.0,United Kingdom
541896,581587,22555,PLASTERS IN TIN STRONGMAN,12,2011-12-09 12:50:00,1.65,12680.0,France


#### Filter where the index label contains 5 or 8 using regex.

In [37]:
# The pattern is applied with re.search, so a label matches when a 5 or an 8
# appears anywhere in it (5, 8, 15, 18, 25, ...).
raw_data.filter(regex='5|8', axis='index')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
15,536367,22623,BOX OF VINTAGE JIGSAW BLOCKS,3,2010-12-01 08:34:00,4.95,13047.0,United Kingdom
18,536367,21755,LOVE BUILDING BLOCK WORD,3,2010-12-01 08:34:00,5.95,13047.0,United Kingdom
25,536369,21756,BATH BUILDING BLOCK WORD,3,2010-12-01 08:35:00,5.95,13047.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
