# Unit 3 - Conditional selection




In [1]:
import pandas as pd

In [2]:
# Vaccinations CSV from the owid/covid-19-data GitHub repository.
# NOTE: the file is downloaded every time this cell runs (network access needed),
# so the outputs may differ if the upstream file changes.
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv'
vacc_df = pd.read_csv(url)

### Selection with one condition

Our condition: only data from Israel

In [3]:
# Compare every value of the 'location' column (all rows) with 'Israel', element by element.
vacc_df.loc[:,'location'] == 'Israel'

0         False
1         False
2         False
3         False
4         False
          ...  
186366    False
186367    False
186368    False
186369    False
186370    False
Name: location, Length: 186371, dtype: bool

This creates a boolean Series: `True` for every row whose `location` is Israel, `False` otherwise.

We can pass this boolean Series to the dataframe to select only the rows where it is `True`:

In [4]:
# Use the boolean comparison as a row filter: only rows where it is True are kept.
vacc_df[vacc_df.loc[:,'location'] == 'Israel']

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
81520,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.00,0.00,0.00,,,,
81521,Israel,ISR,2020-12-20,7472.0,7468.0,4.0,,7397.0,7397.0,0.08,0.08,0.00,,783.0,7397.0,0.078
81522,Israel,ISR,2020-12-21,32395.0,32391.0,4.0,,24923.0,16160.0,0.34,0.34,0.00,,1710.0,16160.0,0.171
81523,Israel,ISR,2020-12-22,77064.0,77060.0,4.0,,44669.0,25663.0,0.82,0.82,0.00,,2716.0,25663.0,0.272
81524,Israel,ISR,2020-12-23,139981.0,139977.0,4.0,,62917.0,34976.0,1.48,1.48,0.00,,3702.0,34976.0,0.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82382,Israel,ISR,2023-04-30,18648958.0,6723106.0,6159805.0,5766047.0,,46.0,197.36,71.15,65.19,61.02,5.0,2.0,0.000
82383,Israel,ISR,2023-05-01,18649010.0,6723107.0,6159810.0,5766093.0,52.0,36.0,197.36,71.15,65.19,61.02,4.0,2.0,0.000
82384,Israel,ISR,2023-05-02,18649126.0,6723117.0,6159820.0,5766189.0,116.0,50.0,197.37,71.15,65.19,61.02,5.0,3.0,0.000
82385,Israel,ISR,2023-05-03,18649185.0,6723118.0,6159822.0,5766245.0,59.0,52.0,197.37,71.15,65.19,61.02,6.0,3.0,0.000


Another way - since we want all of the rows of the column, we can select the column directly with `[]` and don't need the `loc`:

In [5]:
# Same filter, selecting the column with [] instead of .loc;
# head(2) limits the display to the first two matching rows.
vacc_df[vacc_df['location'] == 'Israel'].head(2)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
81520,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,
81521,Israel,ISR,2020-12-20,7472.0,7468.0,4.0,,7397.0,7397.0,0.08,0.08,0.0,,783.0,7397.0,0.078


Another way - since the column name contains no spaces, we can access the column as an attribute with dot notation (`vacc_df.location`):

In [6]:
# Same filter again, this time reading the column as an attribute (vacc_df.location)
# and passing the boolean result to .loc as the row selector.
vacc_df.loc[vacc_df.location == 'Israel'].head(2)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
81520,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,
81521,Israel,ISR,2020-12-20,7472.0,7468.0,4.0,,7397.0,7397.0,0.08,0.08,0.0,,783.0,7397.0,0.078


### Selection using `query`

In [7]:
# query takes the condition as a string; the value "Israel" is wrapped in double
# quotes because the whole expression is already inside single quotes.
vacc_df.query('location == "Israel"').head(2)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
81520,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,
81521,Israel,ISR,2020-12-20,7472.0,7468.0,4.0,,7397.0,7397.0,0.08,0.08,0.0,,783.0,7397.0,0.078


### Selection using more than one condition

In [8]:
# Rows where location is Israel OR Denmark.
# Each comparison needs its own parentheses because | binds more tightly than ==.
two_countries = vacc_df[(vacc_df.location == 'Israel') | (vacc_df.location == 'Denmark')]

In [9]:
# Rows where location is Israel AND total_vaccinations equals 75.
# Each comparison needs its own parentheses because & binds more tightly than ==.
vacc_df[(vacc_df.location == 'Israel') & (vacc_df.total_vaccinations == 75)]

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
81520,Israel,ISR,2020-12-19,75.0,71.0,4.0,,,,0.0,0.0,0.0,,,,


### <span style="color:blue"> Exercise:</span>
> Select two countries using `query`
>
> What is the length of `two_countries`? (How many rows?)

### Selection using a variable

Find the country with the maximum vaccinations:

In [10]:
# Largest value found in the total_vaccinations column, across all rows of vacc_df.
max_vacc = vacc_df['total_vaccinations'].max()
# Last expression in the cell, so its value is displayed.
max_vacc

13532617012.0

but is this correct?\
What country has the maximum vaccinations?

In [11]:
# Inside a query string, the @ prefix refers to a Python variable (here max_vacc)
# instead of a column name.
vacc_df.query('total_vaccinations == @max_vacc')

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
184036,World,OWID_WRL,2024-01-07,13532620000.0,5630594000.0,5177748000.0,2802250000.0,,,169.69,70.6,64.92,35.14,,,


This row is the `World` aggregate, not a country - we want the data without the World rows.


### Selection using part of the string

In [12]:
# Boolean Series: True for each row whose location contains the text 'Wo'.
vacc_df['location'].str.contains('Wo')

0         False
1         False
2         False
3         False
4         False
          ...  
186366    False
186367    False
186368    False
186369    False
186370    False
Name: location, Length: 186371, dtype: bool

In [13]:
# Filter the dataframe with the 'Wo' mask: only rows whose location contains 'Wo' are kept.
vacc_df[vacc_df['location'].str.contains('Wo')]

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
182905,World,OWID_WRL,2020-12-02,0.000000e+00,0.000000e+00,,,0.0,0.0,0.00,0.0,,,0.0,0.0,0.0
182906,World,OWID_WRL,2020-12-03,0.000000e+00,0.000000e+00,,,1.0,1.0,0.00,0.0,,,0.0,1.0,0.0
182907,World,OWID_WRL,2020-12-04,1.000000e+00,1.000000e+00,,,1.0,1.0,0.00,0.0,,,0.0,1.0,0.0
182908,World,OWID_WRL,2020-12-05,1.000000e+00,1.000000e+00,,,1.0,1.0,0.00,0.0,,,0.0,1.0,0.0
182909,World,OWID_WRL,2020-12-06,1.000000e+00,1.000000e+00,,,1.0,1.0,0.00,0.0,,,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184032,World,OWID_WRL,2024-01-03,1.353260e+10,5.630594e+09,5.177748e+09,2.802230e+09,,,169.69,70.6,64.92,35.14,,,
184033,World,OWID_WRL,2024-01-04,1.353261e+10,5.630594e+09,5.177748e+09,2.802241e+09,,,169.69,70.6,64.92,35.14,,,
184034,World,OWID_WRL,2024-01-05,1.353262e+10,5.630594e+09,5.177748e+09,2.802249e+09,,,169.69,70.6,64.92,35.14,,,
184035,World,OWID_WRL,2024-01-06,1.353262e+10,5.630594e+09,5.177748e+09,2.802250e+09,,,169.69,70.6,64.92,35.14,,,


Wait.. how do we know there aren't any other countries with `Wo` in their name?

In [14]:
# Distinct location values among the rows whose location contains 'Wo'.
# A single .loc[row_mask, column] call selects rows and column in one step,
# instead of chaining two separate [] lookups.
vacc_df.loc[vacc_df['location'].str.contains('Wo'), 'location'].unique()

array(['World'], dtype=object)

### Select using `!=`


Remove the world data:

In [15]:
# Keep every row whose location is not 'World'; vacc_df itself is left unchanged.
vacc_df_noWorld = vacc_df.loc[vacc_df.location != 'World']
# Optional check: len(vacc_df_noWorld) gives the number of rows that remain.

In [16]:
# Sanity check: the filtered frame should contain no 'World' rows,
# so this is expected to display an empty dataframe.
# The mask is built from vacc_df_noWorld itself so that it has exactly the same
# index as the frame being filtered (no reliance on implicit index alignment).
vacc_df_noWorld.loc[vacc_df_noWorld.location == 'World']

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred


In [17]:
# Row count of the original vacc_df, which still includes the World rows
# (the filtered data lives in the separate vacc_df_noWorld variable).
len(vacc_df)

186371

Find the location with the maximum vaccinations, this time without the World rows:

In [18]:
# Recompute the maximum on the frame without the World rows.
# NOTE: this rebinds max_vacc, replacing the value computed earlier from the full vacc_df.
max_vacc = vacc_df_noWorld['total_vaccinations'].max()
# Last expression in the cell, so its value is displayed.
max_vacc

9084319350.0

In [19]:
# Look up the row(s) of the filtered frame whose total_vaccinations equals max_vacc
# (@ refers to the Python variable).
vacc_df_noWorld.query('total_vaccinations == @max_vacc')

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
11064,Asia,OWID_ASI,2024-01-07,9084319000.0,3688364000.0,3461818000.0,1811766000.0,,,192.41,78.12,73.32,38.37,,,


### <span style="color:blue"> Exercise:</span>

> Find the average `total_vaccinations` in `location=Asia`

----
### <span style="color:blue"> Exercise:</span>


> Select the number of daily vaccinations in Israel on date 2021-02-06 
>
> How many countries have more than 50000000 vaccinations?

---
## Summary of the functions in this unit:


>* `.index.values` - the row index labels of the selected part of the dataframe
>* `.str.contains` - returns a boolean Series marking which values of a string column contain the given text (used to select rows)
>* `.max` - maximum value
>* `.mean` - average value
>* `.count` - the number of rows that contain a non-missing value
>* `len()` - dataframe length