# Unit 3 - Conditional selection




In [1]:
import pandas as pd

In [2]:
# Vaccinations CSV, read straight from the `master` branch of the
# owid/covid-19-data GitHub repository. The remote file can change between
# runs, so outputs recorded in this notebook may differ after re-execution.
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv'
vacc_df = pd.read_csv(filepath_or_buffer=url)

### Selection with one condition

Our condition: only data from Israel

In [3]:
# Element-wise comparison: yields one boolean per row of the `location` column.
vacc_df.loc[:, 'location'] == 'Israel'

0         False
1         False
2         False
3         False
4         False
          ...  
157991    False
157992    False
157993    False
157994    False
157995    False
Name: location, Length: 157996, dtype: bool

This creates a boolean Series: `True` for every row whose `location` is `Israel`, and `False` otherwise.

We can pass this boolean Series to the dataframe to select only the rows where it is `True`:

In [4]:
# Boolean-mask selection: keep only the rows where the comparison is True.
vacc_df[vacc_df.loc[:, 'location'] == 'Israel']

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
68378,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.00,0.00,0.00,,,,
68379,Israel,ISR,2020-12-20,7471.0,7467.0,4.0,,7396.0,7396.0,0.08,0.08,0.00,,783.0,7396.0,0.078
68380,Israel,ISR,2020-12-21,32394.0,32390.0,4.0,,24923.0,16160.0,0.34,0.34,0.00,,1710.0,16160.0,0.171
68381,Israel,ISR,2020-12-22,77062.0,77058.0,4.0,,44668.0,25662.0,0.82,0.82,0.00,,2716.0,25662.0,0.272
68382,Israel,ISR,2020-12-23,139981.0,139977.0,4.0,,62919.0,34976.0,1.48,1.48,0.00,,3702.0,34976.0,0.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69175,Israel,ISR,2023-02-24,18642265.0,6722543.0,6158848.0,5760874.0,37.0,162.0,197.29,71.15,65.18,60.97,17.0,11.0,0.000
69176,Israel,ISR,2023-02-25,18642270.0,6722546.0,6158848.0,5760876.0,5.0,145.0,197.29,71.15,65.18,60.97,15.0,10.0,0.000
69177,Israel,ISR,2023-02-26,18642484.0,6722558.0,6158868.0,5761058.0,214.0,159.0,197.30,71.15,65.18,60.97,17.0,11.0,0.000
69178,Israel,ISR,2023-02-27,18642689.0,6722569.0,6158893.0,5761227.0,205.0,162.0,197.30,71.15,65.18,60.97,17.0,11.0,0.000


Another way - since we're interested in all of the rows, we don't need the `loc`

In [5]:
# Same filter, building the mask with plain column indexing instead of `.loc`;
# `.head(2)` limits the displayed result to the first two matching rows.
vacc_df[vacc_df['location'] == 'Israel'].head(2)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
68378,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,
68379,Israel,ISR,2020-12-20,7471.0,7467.0,4.0,,7396.0,7396.0,0.08,0.08,0.0,,783.0,7396.0,0.078


Another way - since the column name contains no spaces, we can access the column as an attribute (`vacc_df.location`)

In [6]:
# Same filter again: the column is read as an attribute (`vacc_df.location`)
# and the resulting boolean mask is passed to `.loc` as the row selector.
vacc_df.loc[vacc_df.location == 'Israel'].head(2)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
68378,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,
68379,Israel,ISR,2020-12-20,7471.0,7467.0,4.0,,7396.0,7396.0,0.08,0.08,0.0,,783.0,7396.0,0.078


### Selection using `query`

In [7]:
# `query` takes the condition as a string: the column name is written bare
# and the string value is quoted inside the expression.
vacc_df.query('location == "Israel"').head(2)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
68378,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,
68379,Israel,ISR,2020-12-20,7471.0,7467.0,4.0,,7396.0,7396.0,0.08,0.08,0.0,,783.0,7396.0,0.078


### Selection using more than one condition

In [8]:
# Combine two conditions with `|` (element-wise OR). Each comparison is wrapped
# in its own parentheses because `|` binds more tightly than `==`.
two_countries = vacc_df[
    (vacc_df.location == 'Israel')
    | (vacc_df.location == 'Denmark')
]
len(two_countries)

1614

### <span style="color:blue"> Exercise:</span>
> Select two countries using `query`
>
> What is the length of `two_countries`? (How many rows?)

### Selection using a variable

Find the country with the maximum vaccinations:

In [9]:
# Largest value found in the `total_vaccinations` column, over every row.
max_vacc = vacc_df.loc[:, 'total_vaccinations'].max()
max_vacc

13312958715.0

but is this correct?\
What country has the maximum vaccinations?

In [10]:
# `@max_vacc` refers to the Python variable `max_vacc` from inside the query
# string; the result is the row(s) whose `total_vaccinations` equals it.
vacc_df.query('total_vaccinations == @max_vacc')

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
156078,World,OWID_WRL,2023-02-28,13312960000.0,5556363000.0,5107791000.0,2739484000.0,,,166.93,69.67,64.05,34.35,,,


This is the `World` row, which is not a country... we want the data without the `World` rows.


### Selection using part of the string

In [11]:
# Boolean Series: True where the `location` string contains the text 'Wo'.
vacc_df.location.str.contains('Wo')

0         False
1         False
2         False
3         False
4         False
          ...  
157991    False
157992    False
157993    False
157994    False
157995    False
Name: location, Length: 157996, dtype: bool

In [12]:
# Use the substring mask as a row filter: keep rows whose location contains 'Wo'.
vacc_df[vacc_df.location.str.contains('Wo')]

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
155260,World,OWID_WRL,2020-12-02,0.000000e+00,0.000000e+00,,,0.0,0.0,0.00,0.00,,,0.0,0.0,0.0
155261,World,OWID_WRL,2020-12-03,0.000000e+00,0.000000e+00,,,1.0,1.0,0.00,0.00,,,0.0,1.0,0.0
155262,World,OWID_WRL,2020-12-04,1.000000e+00,1.000000e+00,,,1.0,1.0,0.00,0.00,,,0.0,1.0,0.0
155263,World,OWID_WRL,2020-12-05,1.000000e+00,1.000000e+00,,,1.0,1.0,0.00,0.00,,,0.0,1.0,0.0
155264,World,OWID_WRL,2020-12-06,1.000000e+00,1.000000e+00,,,1.0,1.0,0.00,0.00,,,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156074,World,OWID_WRL,2023-02-24,1.331029e+10,5.555324e+09,5.106773e+09,2.738176e+09,,,166.90,69.66,64.03,34.33,,,
156075,World,OWID_WRL,2023-02-25,1.331061e+10,5.555342e+09,5.106896e+09,2.738352e+09,,,166.90,69.66,64.04,34.34,,,
156076,World,OWID_WRL,2023-02-26,1.331260e+10,5.556292e+09,5.107729e+09,2.739255e+09,,,166.93,69.67,64.05,34.35,,,
156077,World,OWID_WRL,2023-02-27,1.331266e+10,5.556296e+09,5.107735e+09,2.739311e+09,,,166.93,69.67,64.05,34.35,,,


Wait.. how do we know there aren't any other countries with `Wo` in their name?

In [13]:
# List the distinct location names that contain 'Wo'. Rows and the column are
# selected in a single `.loc` call instead of filtering the whole dataframe
# and then indexing the result a second time.
vacc_df.loc[vacc_df['location'].str.contains('Wo'), 'location'].unique()

array(['World'], dtype=object)

### Select using `!=`


Remove the world data:

In [14]:
# Keep every row whose location is anything other than 'World'.
vacc_df_noWorld = vacc_df.loc[vacc_df.location != 'World']

In [15]:
# Sanity check: the filtered frame should contain no 'World' rows, so an empty
# result is expected. The mask is built from `vacc_df_noWorld` itself so that
# it shares the index of the frame being filtered, instead of relying on label
# alignment with a mask computed on a different dataframe.
vacc_df_noWorld.loc[vacc_df_noWorld.location == 'World']

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred


In [16]:
# Row count of the original, unfiltered dataframe.
# NOTE(review): presumably meant for comparison with len(vacc_df_noWorld) — confirm.
len(vacc_df)

157996

Find the country with the maximum vaccinations:

In [17]:
# Recompute the maximum on the frame without the 'World' rows; this overwrites
# the value previously stored in `max_vacc`.
max_vacc = vacc_df_noWorld.loc[:, 'total_vaccinations'].max()
max_vacc

8987651826.0

In [18]:
# Row(s) of the filtered frame whose `total_vaccinations` equals `max_vacc`.
# NOTE: in the recorded output this is 'Asia' (iso_code OWID_ASI), so dropping
# only 'World' still leaves non-country aggregate rows in the data.
vacc_df_noWorld.query('total_vaccinations == @max_vacc')

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
8343,Asia,OWID_ASI,2023-02-28,8987652000.0,3665997000.0,3432526000.0,1781602000.0,,,190.36,77.65,72.7,37.73,,,


### <span style="color:blue"> Exercise:</span>

> Find the average `total_vaccinations` in `location=Asia`

----
### <span style="color:blue"> Exercise:</span>


> Select the number of daily vaccinations in Israel on date 2021-02-06 
>
> How many countries have more than 50000000 vaccinations?

---
## Summary of the functions in this unit:


>* `.index.values` - the row index labels of a dataframe (or of a selected part of it)
>* `.str.contains` - returns a boolean Series marking which values of a string column contain a given substring (or pattern)
>* `.max` - maximum value
>* `.mean` - average value
>* `.count` - the number of non-missing (non-NaN) values
>* `len()` - dataframe length