# Filtering/Subsetting


- [*loc*](#Selecting-Rows-Using-loc)   
- [*iloc*](#Selecting-Rows-with-iloc)  
- [*Boolean Filters*](#Selecting-Rows-with-Boolean-Filters)       
  - Using the AND Operator  
  - Using the OR Operator  
- [Selecting on Multiple Values in a **List**](#Selecting-on-Multiple-Values-in-a-List)    
- [Selecting Based on index or Column *Content*](#Selecting-Rows-Based-on-index-or-Column-Content)  
  
   

In [1]:
import pandas as pd 

In [2]:
# Load the Olympics medal table from the CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='Data/Olympics.csv')

# Preview the first five rows (five is the default for head())
df.head()

Unnamed: 0,Rank,Country,Gold,Silver,Bronze,Total
0,1,United States (USA),46,37,38,121
1,2,Great Britain (GBR),27,23,17,67
2,3,China (CHN),26,18,26,70
3,4,Russia (RUS),19,17,19,55
4,5,Germany (GER),17,10,15,42


In [3]:
# Row labels right after loading: the default integer positions assigned by pandas
df.index.to_numpy()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86], dtype=int64)

# Selecting Rows Using *loc*
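In order to select rows by label with *loc*, the DataFrame needs a meaningful Index! Every DataFrame has an Index, but by default it is just the integer row positions (0, 1, 2, ...), so we first set the Index to a column with useful labels.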

In [4]:
# Only the default integer row labels so far -- no meaningful Index has been set yet
df.head(3)

Unnamed: 0,Rank,Country,Gold,Silver,Bronze,Total
0,1,United States (USA),46,37,38,121
1,2,Great Britain (GBR),27,23,17,67
2,3,China (CHN),26,18,26,70


In [5]:
# Create an Index:  Set it to be the Country Column
# set_index returns a new DataFrame, so the result is assigned back to df;
# the assignment form is preferred over inplace=True and keeps the call chainable.
# Note: Country stops being a column afterwards, so running this cell a second
# time without reloading the data raises a KeyError.
df = df.set_index('Country')

df.head(10)

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42
South Korea (KOR),8,9,3,9,21
Italy (ITA),9,8,12,8,28
Australia (AUS),10,8,11,10,29


In [6]:
# Show every label held by the new DataFrame Index (the country names)
df.index.to_numpy()

array(['United States (USA)', 'Great Britain (GBR)', 'China (CHN)',
       'Russia (RUS)', 'Germany (GER)', 'Japan (JPN)', 'France (FRA)',
       'South Korea (KOR)', 'Italy (ITA)', 'Australia (AUS)',
       'Netherlands (NED)', 'Hungary (HUN)', 'Brazil (BRA)*',
       'Spain (ESP)', 'Kenya (KEN)', 'Jamaica (JAM)', 'Croatia (CRO)',
       'Cuba (CUB)', 'New Zealand (NZL)', 'Canada (CAN)',
       'Uzbekistan (UZB)', 'Kazakhstan (KAZ)', 'Colombia (COL)',
       'Switzerland (SUI)', 'Iran (IRI)', 'Greece (GRE)',
       'Argentina (ARG)', 'Denmark (DEN)', 'Sweden (SWE)',
       'South Africa (RSA)', 'Ukraine (UKR)', 'Serbia (SRB)',
       'Poland (POL)', 'North Korea (PRK)', 'Belgium (BEL)',
       'Thailand (THA)', 'Slovakia (SVK)', 'Georgia (GEO)',
       'Azerbaijan (AZE)', 'Belarus (BLR)', 'Turkey (TUR)',
       'Armenia (ARM)', 'Czech Republic (CZE)', 'Ethiopia (ETH)',
       'Slovenia (SLO)', 'Indonesia (INA)', 'Romania (ROU)',
       'Bahrain (BRN)', 'Vietnam (VIE)', 'Chinese Taipei

In [7]:
# Use loc with the Country index for Great Britain
# A single label returns that row as a Series keyed by the column names
df.loc['Great Britain (GBR)']

Rank       2
Gold      27
Silver    23
Bronze    17
Total     67
Name: Great Britain (GBR), dtype: int64

In [8]:
# Show the first six rows as a reference for the label slices that follow
df.head(6)

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41


In [9]:
# China and everything after
# The open-ended label slice starts at 'China (CHN)' and runs to the last row;
# head() limits the display to the first five rows of that slice
df.loc['China (CHN)': ].head()

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42


In [10]:
# Germany to Italy
# A loc label slice includes BOTH endpoints, so the Italy row is part of the result
df.loc['Germany (GER)': 'Italy (ITA)']

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42
South Korea (KOR),8,9,3,9,21
Italy (ITA),9,8,12,8,28


In [11]:
# Show the first ten rows again as a reference for the next slices
df.head(10)

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42
South Korea (KOR),8,9,3,9,21
Italy (ITA),9,8,12,8,28
Australia (AUS),10,8,11,10,29


In [12]:
# Slicing from the beginning through France (the France row itself is included)
df.loc[ :'France (FRA)']

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42


In [13]:
# Slicing from France to the end
# head() limits the display to the first five rows of the slice
df.loc['France (FRA)': ].head()

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
France (FRA),7,10,18,14,42
South Korea (KOR),8,9,3,9,21
Italy (ITA),9,8,12,8,28
Australia (AUS),10,8,11,10,29
Netherlands (NED),11,8,7,4,19


# Selecting Rows with *iloc*  
*iloc* uses the row number position (starting with 0!)

In [14]:
# Show the first three rows; for iloc they sit at positions 0, 1 and 2
df.head(3)

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70


In [15]:
# Selecting the first row of the dataframe (position 0), returned as a Series
df.iloc[0] 

Rank        1
Gold       46
Silver     37
Bronze     38
Total     121
Name: United States (USA), dtype: int64

In [16]:
# Select the second row of the dataframe (position 1), returned as a Series
df.iloc[1] 

Rank       2
Gold      27
Silver    23
Bronze    17
Total     67
Name: Great Britain (GBR), dtype: int64

In [17]:
# Select All rows but the first row
# The positional slice starts at position 1; head() limits the display to five rows
df.iloc[1:].head() 

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41


# Selecting Rows with Boolean Filters

In [18]:
# Show the first ten rows as a reference for the filters below
df.head(10)

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70
Russia (RUS),4,19,17,19,55
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42
South Korea (KOR),8,9,3,9,21
Italy (ITA),9,8,12,8,28
Australia (AUS),10,8,11,10,29


### Display the top three countries by Rank

In [19]:
# Keep only the rows whose Rank is 3 or lower
# Equivalent boolean-mask form: df[df['Rank'] <= 3]
df.query('Rank <= 3')

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70


### Display countries with more than 20 Gold Medals

In [20]:
# Display only countries with more than 20 Gold Medals
# Equivalent boolean-mask form: df[df['Gold'] > 20]
df.query('Gold > 20')

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States (USA),1,46,37,38,121
Great Britain (GBR),2,27,23,17,67
China (CHN),3,26,18,26,70


### *and* Operator

In [21]:
# Display countries with Ranks greater than 5 and less than or equal to 10
# Both conditions must hold for a row to be kept
# Equivalent boolean-mask form: df[(df['Rank'] > 5) & (df['Rank'] <= 10)]
df.query('(Rank > 5) and (Rank <= 10)')

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Japan (JPN),6,12,8,21,41
France (FRA),7,10,18,14,42
South Korea (KOR),8,9,3,9,21
Italy (ITA),9,8,12,8,28
Australia (AUS),10,8,11,10,29


### *or* Operator

In [22]:
# Display countries with a Rank of  5 or 6
# A row is kept when either condition holds
# Equivalent boolean-mask form: df[(df['Rank'] == 5) | (df['Rank'] == 6)]
df.query('(Rank == 5) or (Rank == 6)')

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41


# Selecting on Multiple Values in a List

In [23]:
# Keep the rows whose Rank matches any value in the list
# Equivalent boolean-mask form: df[df['Rank'].isin([5, 6])]
df.query("Rank in [5, 6]")

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany (GER),5,17,10,15,42
Japan (JPN),6,12,8,21,41


# Selecting Rows Based on index or Column Content

#### Searching a dataframe's index values

In [25]:
# All the Countries with ia in their name
# str.contains yields one True/False value per index label, so ia_index is a
# boolean mask. regex=False makes 'ia' a literal substring match instead of a
# regular-expression pattern (the default).
ia_index = df.index.str.contains('ia', regex=False)

# Filter the rows with the mask and show the first five matches
df[ia_index].head()

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Russia (RUS),4,19,17,19,55
Australia (AUS),10,8,11,10,29
Croatia (CRO),17,5,3,2,10
Colombia (COL),23,3,2,3,8
Serbia (SRB),32,2,4,2,8


#### Searching a dataframe's column/field values

In [26]:
# First remove the index for df = moving it back to being the column/field Country
# drop=False keeps the old index as a regular column; the result is assigned
# back to df rather than modifying the frame with inplace=True.
df = df.reset_index(drop=False)
df.head()

Unnamed: 0,Country,Rank,Gold,Silver,Bronze,Total
0,United States (USA),1,46,37,38,121
1,Great Britain (GBR),2,27,23,17,67
2,China (CHN),3,26,18,26,70
3,Russia (RUS),4,19,17,19,55
4,Germany (GER),5,17,10,15,42


In [27]:
# Now Search the Country field values
# str.contains yields a boolean Series (one True/False per row), so ia_index is
# a boolean mask. regex=False makes 'ia' a literal substring match instead of a
# regular-expression pattern (the default).
ia_index = df['Country'].str.contains('ia', regex=False)

# Filter the rows with the mask and show the first five matches
df[ia_index].head()

Unnamed: 0,Country,Rank,Gold,Silver,Bronze,Total
3,Russia (RUS),4,19,17,19,55
9,Australia (AUS),10,8,11,10,29
16,Croatia (CRO),17,5,3,2,10
22,Colombia (COL),23,3,2,3,8
31,Serbia (SRB),32,2,4,2,8
