# Session 05 - Tutorial. Data Assembly in Pandas

## This session will cover:

1. Concatenating data
2. Advance merging of data sets
3. Grouping

# 1. Let's load some libraries

In [30]:
import pandas as pd

# 2. Let's load some datasets

In [31]:
## The air_quality_no2_long.csv data set provides 𝑁𝑂2 values for the measurement stations FR04014, BETR801 and London Westminster in respectively Paris, Antwerp and London.
air_quality_no2 = pd.read_csv('https://www.dropbox.com/s/70230oct6p0ovnv/air_quality_no2_long.csv?dl=1',parse_dates=True)

In [32]:
air_quality_no2.head()

Unnamed: 0,city,country,date.utc,location,parameter,value,unit
0,Paris,FR,2019-06-21 00:00:00+00:00,FR04014,no2,20.0,µg/m³
1,Paris,FR,2019-06-20 23:00:00+00:00,FR04014,no2,21.8,µg/m³
2,Paris,FR,2019-06-20 22:00:00+00:00,FR04014,no2,26.5,µg/m³
3,Paris,FR,2019-06-20 21:00:00+00:00,FR04014,no2,24.9,µg/m³
4,Paris,FR,2019-06-20 20:00:00+00:00,FR04014,no2,21.4,µg/m³


In [33]:
## The air_quality_pm25_long.csv data set provides 𝑃𝑀25 values for the measurement stations FR04014, BETR801 and London Westminster in respectively Paris, Antwerp and London.

air_quality_pm25 = pd.read_csv('https://www.dropbox.com/s/d0ef5l5rm95fkdx/air_quality_pm25_long.csv?dl=1',parse_dates=True)

In [34]:
air_quality_pm25.head()

Unnamed: 0,city,country,date.utc,location,parameter,value,unit
0,Antwerpen,BE,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0,µg/m³
1,Antwerpen,BE,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5,µg/m³
2,Antwerpen,BE,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5,µg/m³
3,Antwerpen,BE,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0,µg/m³
4,Antwerpen,BE,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5,µg/m³


In [35]:
## The air quality measurement station coordinates are stored in a data file air_quality_stations.csv

stations_coord = pd.read_csv("https://www.dropbox.com/s/1wd3n5m1chg1b1k/air_quality_stations.csv?dl=1")

In [36]:
stations_coord.head()

Unnamed: 0,location,coordinates.latitude,coordinates.longitude
0,BELAL01,51.23619,4.38522
1,BELHB23,51.1703,4.341
2,BELLD01,51.10998,5.00486
3,BELLD02,51.12038,5.02155
4,BELR833,51.32766,4.36226


In [37]:
## The air quality parameters metadata are stored in a data file air_quality_parameters.csv

air_quality_parameters = pd.read_csv("https://www.dropbox.com/s/qnp2myzjbukpbgj/air_quality_parameters.csv?dl=1")

In [38]:
air_quality_parameters

Unnamed: 0,id,description,name
0,bc,Black Carbon,BC
1,co,Carbon Monoxide,CO
2,no2,Nitrogen Dioxide,NO2
3,o3,Ozone,O3
4,pm10,Particulate matter less than 10 micrometers in...,PM10
5,pm25,Particulate matter less than 2.5 micrometers i...,PM2.5
6,so2,Sulfur Dioxide,SO2


# 3. Concatenation

## Combine the measurements of 𝑁𝑂2 "air_quality_no2" and 𝑃𝑀25 "air_quality_pm25" , two dataframe with a similar structure, in a single dataframe

In [41]:
air_quality = pd.concat([air_quality_pm25, air_quality_no2],ignore_index=True)

# The number of rows of the outcome is equal to the sum of the number of rows of input
print(air_quality.shape,air_quality_pm25.shape,air_quality_no2.shape)

(3178, 7) (1110, 7) (2068, 7)


In [42]:
air_quality

Unnamed: 0,city,country,date.utc,location,parameter,value,unit
0,Antwerpen,BE,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0,µg/m³
1,Antwerpen,BE,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5,µg/m³
2,Antwerpen,BE,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5,µg/m³
3,Antwerpen,BE,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0,µg/m³
4,Antwerpen,BE,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5,µg/m³
...,...,...,...,...,...,...,...
3173,London,GB,2019-05-07 06:00:00+00:00,London Westminster,no2,26.0,µg/m³
3174,London,GB,2019-05-07 04:00:00+00:00,London Westminster,no2,16.0,µg/m³
3175,London,GB,2019-05-07 03:00:00+00:00,London Westminster,no2,19.0,µg/m³
3176,London,GB,2019-05-07 02:00:00+00:00,London Westminster,no2,19.0,µg/m³


# 4. Join

## Add the station coordinates "stations_coord" to the corresponding rows in the measurements dataframe "air_quality".


In [43]:
air_quality_with_station_cooord=pd.merge(air_quality,stations_coord,on='location')
air_quality_with_station_cooord

Unnamed: 0,city,country,date.utc,location,parameter,value,unit,coordinates.latitude,coordinates.longitude
0,Antwerpen,BE,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0,µg/m³,51.20966,4.43182
1,Antwerpen,BE,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5,µg/m³,51.20966,4.43182
2,Antwerpen,BE,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5,µg/m³,51.20966,4.43182
3,Antwerpen,BE,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0,µg/m³,51.20966,4.43182
4,Antwerpen,BE,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5,µg/m³,51.20966,4.43182
...,...,...,...,...,...,...,...,...,...
4177,Paris,FR,2019-05-07 03:00:00+00:00,FR04014,no2,50.4,µg/m³,48.83722,2.39390
4178,Paris,FR,2019-05-07 02:00:00+00:00,FR04014,no2,27.7,µg/m³,48.83724,2.39390
4179,Paris,FR,2019-05-07 02:00:00+00:00,FR04014,no2,27.7,µg/m³,48.83722,2.39390
4180,Paris,FR,2019-05-07 01:00:00+00:00,FR04014,no2,25.0,µg/m³,48.83724,2.39390


## Add the parameter full description and name, provided by the parameters metadata dataframe "air_quality_parameters", to the measurements dataframe


In [44]:
air_quality_with_station_cooord_params=pd.merge(air_quality,air_quality_parameters,left_on='parameter',right_on='id')
air_quality_with_station_cooord_params

Unnamed: 0,city,country,date.utc,location,parameter,value,unit,id,description,name
0,Antwerpen,BE,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0,µg/m³,pm25,Particulate matter less than 2.5 micrometers i...,PM2.5
1,Antwerpen,BE,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5,µg/m³,pm25,Particulate matter less than 2.5 micrometers i...,PM2.5
2,Antwerpen,BE,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5,µg/m³,pm25,Particulate matter less than 2.5 micrometers i...,PM2.5
3,Antwerpen,BE,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0,µg/m³,pm25,Particulate matter less than 2.5 micrometers i...,PM2.5
4,Antwerpen,BE,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5,µg/m³,pm25,Particulate matter less than 2.5 micrometers i...,PM2.5
...,...,...,...,...,...,...,...,...,...,...
3173,London,GB,2019-05-07 06:00:00+00:00,London Westminster,no2,26.0,µg/m³,no2,Nitrogen Dioxide,NO2
3174,London,GB,2019-05-07 04:00:00+00:00,London Westminster,no2,16.0,µg/m³,no2,Nitrogen Dioxide,NO2
3175,London,GB,2019-05-07 03:00:00+00:00,London Westminster,no2,19.0,µg/m³,no2,Nitrogen Dioxide,NO2
3176,London,GB,2019-05-07 02:00:00+00:00,London Westminster,no2,19.0,µg/m³,no2,Nitrogen Dioxide,NO2


# 5. Grouping

## Find the average value of pm25 and no2 in each city over time

In [45]:
air_quality_with_station_cooord_params.groupby(['city','parameter']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
city,parameter,Unnamed: 2_level_1
Antwerpen,no2,25.778947
Antwerpen,pm25,21.50495
London,no2,24.77709
London,pm25,8.993062
Paris,no2,27.740538


In [46]:
# we can reset the index so we have the index starting from zero:
air_quality_with_station_cooord_params.groupby(['city','parameter']).mean().reset_index()

Unnamed: 0,city,parameter,value
0,Antwerpen,no2,25.778947
1,Antwerpen,pm25,21.50495
2,London,no2,24.77709
3,London,pm25,8.993062
4,Paris,no2,27.740538


## Find the average value of pm25 and no2 in each station over time

In [47]:
air_quality_with_station_cooord_params.groupby(['location','parameter']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
location,parameter,Unnamed: 2_level_1
BETR801,no2,25.778947
BETR801,pm25,21.50495
FR04014,no2,27.740538
London Westminster,no2,24.77709
London Westminster,pm25,8.993062
