#  Normalisation challenges

### Challenge 1 - prepare dataset for normalisation 
---
1. Read Covid vaccination data from the `by_country` sheet in the Excel file at this link : https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true
2. Use .info() to find out which columns have missing values
3. Remove all rows with missing data in the total_vaccination column
4. Find the median vaccinations per hundred 
5. display the mean people vaccinated per hundred for each country in descending order
6. find the range of total_vaccinations across the dataframe 


**Test output**:  
1. dataframe is saved in a variable
2. 
```
RangeIndex: 14994 entries, 0 to 14993
Data columns (total 15 columns):
    Column                               Non-Null Count  Dtype         
                                
 0   country                              14994 non-null  object        
 1   iso_code                             14994 non-null  object        
 2   date                                 14994 non-null  datetime64[ns]
 3   total_vaccinations                   9011 non-null   float64       
 4   people_vaccinated                    8370 non-null   float64       
 5   people_fully_vaccinated              6158 non-null   float64       
 6   daily_vaccinations_raw               7575 non-null   float64       
 7   daily_vaccinations                   14796 non-null  float64       
 8   total_vaccinations_per_hundred       9011 non-null   float64       
 9   people_vaccinated_per_hundred        8370 non-null   float64       
 10  people_fully_vaccinated_per_hundred  6158 non-null   float64       
 11  daily_vaccinations_per_million       14796 non-null  float64       
 12  vaccines                             14994 non-null  object        
 13  source_name                          14994 non-null  object        
 14  source_website                       14994 non-null  object        
dtypes: datetime64[ns](1), float64(9), object(5)
memory usage: 1.7+ MB
```
3. 9011 rows × 15 columns
4. 7.78
5. 
```
country
Gibraltar       72.172462
Maldives        51.276087
Israel          48.931008
Seychelles      48.233333
Aruba           42.155000
                  ...    
Guinea           0.677000
Sierra Leone     0.530000
Namibia          0.317143
South Africa     0.261667
Albania          0.080000
Name: people_vaccinated_per_hundred, Length: 104, dtype: float64
```
6. 275338000.0


In [None]:
import pandas as pd

def prepare_dataset():
  excel_url = 'https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true'
  
  # 1. Read Covid vaccination data from the by_country sheet
  country_vaccine_df = pd.read_excel(excel_url, sheet_name='by_country')
  
  #2. Use .info() to find out which columns have missing values
  #print(country_vaccine_df.info())
  print(country_vaccine_df.columns[country_vaccine_df.isna().any()].tolist())

  #3. Remove all rows with missing data in the total_vaccination column
  country_vaccine_df.dropna(subset = ["total_vaccinations"], inplace=True)
  if (country_vaccine_df["total_vaccinations"].isna().values.any() == True):
    print(f'dropna did not drop all missing rows in column total_vaccinations')
  #print(country_vaccine_df.info())

  #4. Find the median vaccinations per hundred
  # FIXME Outputs do not match expected values
  cname = "total_vaccinations_per_hundred"
  country_vaccine_df.dropna(subset = [cname], inplace=True)
  print(f' Median vaccinations_per_hundred = {country_vaccine_df[cname].median()}')

  #5. display the mean people vaccinated per hundred for each country in descending order
  df_vaccine_per_hun_country = country_vaccine_df[["total_vaccinations_per_hundred", "country"]]
  df_country = df_vaccine_per_hun_country.groupby("country").median()
  df_country.sort_values(by=["total_vaccinations_per_hundred"], ascending=[False], inplace=True)
  print(f' {df_country}')

  #6. find the range of total_vaccinations across the dataframe
  cname = "total_vaccinations"
  print(f' Range of total_vaccinations = {country_vaccine_df[cname].max() - country_vaccine_df[cname].min() }')
  

prepare_dataset()

['total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'daily_vaccinations_raw', 'daily_vaccinations', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 'daily_vaccinations_per_million']
 Median vaccinations_per_hundred = 6.3
                               total_vaccinations_per_hundred
country                                                     
Gibraltar                                            109.250
Israel                                                90.160
Seychelles                                            64.350
Turks and Caicos Islands                              62.760
United Arab Emirates                                  62.370
...                                                      ...
Niger                                                  0.005
Papua New Guinea                                       0.005
Cameroon                                               0.000
Democratic Republic of Congo  

### Challenge 2 - normalise daily vaccinations 
---

1. Find the median daily vaccinations per million 
2. Write a function to normalise daily vaccinations per million, where values greater than or equal to median = 1 and values less than median = 0 

**Test output**: 

1. 1915.5
2. 
```
0        0
6        0
22       0
44       0
59       0
        ..
14989    0
14990    0
14991    0
14992    0
14993    0
Name: daily_vaccinations_per_million, Length: 9011, dtype: int64
```

In [None]:
import pandas as pd

# Remove NaN in a column and return updated df
# FIXME Need to pass a string as cname
def clean_col(df, **kwds):
  cname= kwds['cname']
  print(f'clean_col: Cname = {cname}')
  if (df[cname].isna().values.any() == True):
    df.dropna(subset=[cname], inplace=True)
    # Check for no NaN values
    if (df[cname].isna().values.any() == True) :
      # FIXME How to print error?
      print(f'clean_col: Could not drop all NaN from {cname}')
    
  return df

def df_normalise(val, **kwds):
  med_val = kwds['median']
  if val >= med_val:
    return 1
  else:
    return 0

def normalise_daily_vaccinations():
  excel_url = 'https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true'
  
  # 1. Read Covid vaccination data from the by_country sheet
  country_vaccine_df = pd.read_excel(excel_url, sheet_name='by_country')
  
  #2. Remove all rows with missing data in the daily_vaccinations_per_million column
  cname = "daily_vaccinations_per_million"
  #df_clean = country_vaccine_df.apply(clean_col, cname=cname1) 
  
  #3. Find the median daily vaccinations per million
  # Get only daily_vaccinations_per_million
  print(len(country_vaccine_df[cname]))
  #df_dvpm = country_vaccine_df.dropna(subset=[cname])
  df_dvpm = country_vaccine_df[cname].dropna()
  print(df_dvpm.head(10))
  print(len(df_dvpm))

  #print(df_dvpm)
  dvpm_median = df_dvpm.median()
  print(f'median daily vaccinations per million = {dvpm_median}')

  #5. Write a function to normalise daily vaccinations per million, where values greater than or equal to median = 1 and values less than median = 0
  # Normalise number of houses
  df_normalise_dvpm = df_dvpm.apply(df_normalise, median=dvpm_median)
  # Just checking if some entry has got 1
  print(f'Max normalised value {df_normalise_dvpm.max()}')
  print(df_normalise_dvpm)
   

normalise_daily_vaccinations()

14994
1     35.0
2     35.0
3     35.0
4     35.0
5     35.0
6     35.0
7     41.0
8     46.0
9     52.0
10    57.0
Name: daily_vaccinations_per_million, dtype: float64
14796
median daily vaccinations per million = 1475.0
Max normalised value 1
1        0
2        0
3        0
4        0
5        0
        ..
14989    0
14990    0
14991    0
14992    0
14993    0
Name: daily_vaccinations_per_million, Length: 14796, dtype: int64


### Challenge 3 - Normalising total vaccinations   
---
The United Kingdom has been praised for its fast vaccine rollout. 
1. Find the minimum total vaccinations for the United Kingdom 
2. Save this value in a variable rounded down to an integer
3. Write a function to normalise total_vaccinations column so that all values less than the UK's min are 0 and all values greater than or equal to the UK's min are coded as 1 
4. Display the countries for which total vaccinated is at the same rate or more than the UK

**Test output**:

1. 1402432.0
2. 1402432
3. `df['total_vaccinations']` should output:
```
0        0
6        0
22       0
44       0
59       0
        ..
14989    0
14990    0
14991    0
14992    0
14993    0
Name: total_vaccinations, Length: 9011, dtype: int64
```
4. 
```
array(['Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh',
       'Belgium', 'Brazil', 'Cambodia', 'Canada', 'Chile', 'China',
       'Colombia', 'Czechia', 'Denmark', 'Dominican Republic', 'England',
       'Finland', 'France', 'Germany', 'Greece', 'Hong Kong', 'Hungary',
       'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan',
       'Kazakhstan', 'Malaysia', 'Mexico', 'Morocco', 'Nepal',
       'Netherlands', 'Norway', 'Pakistan', 'Peru', 'Philippines',
       'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Saudi Arabia',
       'Scotland', 'Serbia', 'Singapore', 'Slovakia', 'South Korea',
       'Spain', 'Sweden', 'Switzerland', 'Thailand', 'Turkey',
       'United Arab Emirates', 'United Kingdom', 'United States',
       'Uruguay', 'Wales'], dtype=object)
```




In [None]:
import pandas as pd
import numpy as np

def df_normalise(val, **kwds):
  min_val = kwds['uk_min']
  if val >= min_val:
    return 1
  else:
    return 0

def normalise_total_vaccinations():
  excel_url = 'https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true'
  
  # 1. Read Covid vaccination data from the by_country sheet
  country_vaccine_df = pd.read_excel(excel_url, sheet_name='by_country')
     
  # 1. Find the minimum total vaccinations for the United Kingdom
  # 2. Save this value in a variable rounded down to an integer
  filter_uk = country_vaccine_df['country'] == 'United Kingdom'
  # df_uk is a series
  df_uk = country_vaccine_df.loc[filter_uk, 'total_vaccinations']
  #print(df_uk)
  # min_total_vaccinations_uk is dtype float64
  min_total_vaccinations_uk = int(df_uk.min())
  print(f'minimum total vaccinations for the United Kingdom = {min_total_vaccinations_uk}')
  
  # 3. Write a function to normalise total_vaccinations column so that all values less than the UK's min are 0 and all values greater than or equal to the UK's min are coded as 1
  # Get country and total_vaccinations
  # Type is DataFrame
  df_country_total_vaccinations = country_vaccine_df[['country', 'total_vaccinations']]
  #print(type(df_country_total_vaccinations))

  # Remove all NaN and reset the index
  df_country_total_vaccinations.dropna(inplace=True)
  df_country_total_vaccinations.reset_index(drop=True, inplace=True)
  #print(df_country_total_vaccinations.head(10))

  df_country_total_vaccinations["total_vaccinations"] = df_country_total_vaccinations["total_vaccinations"].apply(df_normalise, uk_min=min_total_vaccinations_uk)
  print(df_country_total_vaccinations.head(10))

  # 4. Display the countries for which total vaccinated is at the same rate or more than the UK
  #df_compare is of type DataFrameGroupBy
  df_compare = df_country_total_vaccinations.groupby("total_vaccinations") #("country")
  
  #df_compare.sort_values(by=["total_vaccinations"], ascending=[False], inplace=True)
  print(df_compare.head(10))

  print(f'Iterating through df_compare\n')
  df1 = df_compare.get_group(1)
  print(df1['country'].unique())
    

normalise_total_vaccinations()

minimum total vaccinations for the United Kingdom = 1402432
       country  total_vaccinations
0  Afghanistan                   0
1  Afghanistan                   0
2  Afghanistan                   0
3  Afghanistan                   0
4  Afghanistan                   0
5      Albania                   0
6      Albania                   0
7      Albania                   0
8      Albania                   0
9      Albania                   0
         country  total_vaccinations
0    Afghanistan                   0
1    Afghanistan                   0
2    Afghanistan                   0
3    Afghanistan                   0
4    Afghanistan                   0
5        Albania                   0
6        Albania                   0
7        Albania                   0
8        Albania                   0
9        Albania                   0
160    Argentina                   1
161    Argentina                   1
162    Argentina                   1
163    Argentina                   1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Challenge 4 - create new series of total vaccinations for each manufacturer
---

To create a new column in your dataframe:

`df['new_column'] = ...`

For example:

* to duplicate an existing column
  * `df['new_column'] = df['old_column']`
* to add two columns together 
  * `df['new_column'] = df['column1'] + df['column2']`
* to make a percentages column 
  * `df['new_column'] = (df['column1']/df['column1].sum()) * 100`

  
1. read data from 'by_manufacturer' sheet from Covid data 
2. find the sum of total vaccinations for each manufacturer
3. create a new column that has the total vaccinations as a percentage of the overall sum of total vaccinations 
4. find the median percentage 
5. create a new column called 'normalised_percentages' which duplicates the percentages column
6. normalise the normalised_percentages column so that any values greater than or equal to the median percentage = 1 and any lesser than = 0 


**Test output**:

1.
2. 
```
vaccine
Johnson&Johnson        264839828
Moderna               5548036383
Oxford/AstraZeneca     539433203
Pfizer/BioNTech       8690461304
Sinovac                604660293
Name: total_vaccinations, dtype: int64
```
3. 
```
	location	date	vaccine	total_vaccinations	percentages
0	Chile	2020-12-24	Pfizer/BioNTech	420	0.000003
1	Chile	2020-12-25	Pfizer/BioNTech	5198	0.000033
2	Chile	2020-12-26	Pfizer/BioNTech	8338	0.000053
3	Chile	2020-12-27	Pfizer/BioNTech	8649	0.000055
4	Chile	2020-12-28	Pfizer/BioNTech	8649	0.000055
...	...	...	...	...	...
3291	United States	2021-05-01	Moderna	105947940	0.677095
3292	United States	2021-05-01	Pfizer/BioNTech	129013657	0.824504
3293	United States	2021-05-02	Johnson&Johnson	8374395	0.053519
3294	United States	2021-05-02	Moderna	106780082	0.682413
3295	United States	2021-05-02	Pfizer/BioNTech	130252779	0.832423
3296 rows × 5 columns
```
4. 0.0011110194374896931
5. 
6. 
```
	location	date	vaccine	total_vaccinations	percentages	normalise	normalised
0	Chile	2020-12-24	Pfizer/BioNTech	420	0.000003	0.000003	0
1	Chile	2020-12-25	Pfizer/BioNTech	5198	0.000033	0.000033	0
2	Chile	2020-12-26	Pfizer/BioNTech	8338	0.000053	0.000053	0
3	Chile	2020-12-27	Pfizer/BioNTech	8649	0.000055	0.000055	0
4	Chile	2020-12-28	Pfizer/BioNTech	8649	0.000055	0.000055	0
...	...	...	...	...	...	...	...
3291	United States	2021-05-01	Moderna	105947940	0.677095	0.677095	1
3292	United States	2021-05-01	Pfizer/BioNTech	129013657	0.824504	0.824504	1
3293	United States	2021-05-02	Johnson&Johnson	8374395	0.053519	0.053519	1
3294	United States	2021-05-02	Moderna	106780082	0.682413	0.682413	1
3295	United States	2021-05-02	Pfizer/BioNTech	130252779	0.832423	0.832423	1
3296 rows × 7 columns
```



In [9]:
import pandas as pd
import numpy as np

def df_normalise(val, **kwds):
  med_val = kwds['med']
  if val >= med_val:
    return 1
  else:
    return 0

def vaccinations_per_manufacture():
  excel_url = 'https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true'
  
  # 1. Read Covid vaccination data from the by_country sheet
  country_vaccine_df = pd.read_excel(excel_url, sheet_name='by_manufacturer')

  #2. find the sum of total vaccinations for each manufacturer
  # Cleanup the data first, just drop all NaN
  country_vaccine_df.dropna(inplace=True)

  gby1 = country_vaccine_df.groupby('vaccine')
  #print(gby1.first())
  print(gby1['total_vaccinations'].sum())

  #3. create a new column that has the total vaccinations as a percentage of the overall sum of total vaccinations
  sum_total_vaccinations = country_vaccine_df['total_vaccinations'].sum()
  print(f'Sum of total vaccinations = {sum_total_vaccinations}')
  country_vaccine_df['total_vaccinations_%'] = (country_vaccine_df['total_vaccinations']/sum_total_vaccinations) * 100
  print(country_vaccine_df.head(5))

  #4. find the median percentage
  median_percentage = country_vaccine_df["total_vaccinations_%"].median()
  print(f'Median total_vaccinations_% = {median_percentage} ')

  #5. create a new column called 'normalised_percentages' which duplicates the percentages column
  country_vaccine_df['normalised_percentages'] = country_vaccine_df['total_vaccinations_%']

  #6. normalise the normalised_percentages column so that any values greater than or equal to the median percentage = 1 and any lesser than = 0
  country_vaccine_df["normalised_percentages"] = country_vaccine_df["normalised_percentages"].apply(df_normalise, med=median_percentage)
  print(country_vaccine_df)


vaccinations_per_manufacture()

vaccine
Johnson&Johnson        264839828
Moderna               5548036383
Oxford/AstraZeneca     539433203
Pfizer/BioNTech       8690461304
Sinovac                604660293
Name: total_vaccinations, dtype: int64
Sum of total vaccinations = 15647431011
  location       date  ... total_vaccinations  total_vaccinations_%
0    Chile 2020-12-24  ...                420              0.000003
1    Chile 2020-12-25  ...               5198              0.000033
2    Chile 2020-12-26  ...               8338              0.000053
3    Chile 2020-12-27  ...               8649              0.000055
4    Chile 2020-12-28  ...               8649              0.000055

[5 rows x 5 columns]
Median total_vaccinations_% = 0.0011110194374896931 
           location       date  ... total_vaccinations_%  normalised_percentages
0             Chile 2020-12-24  ...             0.000003                       0
1             Chile 2020-12-25  ...             0.000033                       0
2             Chile 20