In [1]:
import pandas as pd
import ee
from tqdm import tqdm # Progress bar
from datetime import timedelta
import datetime

In [2]:
ee.Initialize() # Run each session

In [3]:
UK_PM10 = pd.read_csv('UK_PM10.csv')

In [4]:
######################################## Renaming columns to make clearer #############################################
######################################## This has already been done for UK_main #############################################

UK_PM10 = UK_PM10.rename(columns={"date.utc": "Date", "coordinates.latitude": "Lat", "coordinates.longitude":"Long", "value": "PM10"})
UK_PM10['Date'] = UK_PM10['Date'].astype(str).str[:10]
UK_PM10 = UK_PM10.drop(columns=['unit', 'parameter'], axis =1)
UK_PM10 = UK_PM10[['location', 'Date', 'city', 'country', 'PM10', 'Lat','Long']]
UK_PM10['PM10'] = UK_PM10.groupby(['Date','location'])['PM10'].transform(lambda x: x.mean())
UK_PM10 = UK_PM10.groupby(['Date', 'location']).first().reset_index()

In [9]:
UK_PM10.head()

Unnamed: 0,Date,location,city,country,PM10,Lat,Long
0,2018-09-27,Aberdeen,Aberdeen,GB,6.764706,57.15736,-2.094278
1,2018-09-27,Armagh Roadside,Armagh,GB,13.941176,54.353728,-6.654558
2,2018-09-27,Auchencorth Moss,Auchencorth,GB,3.533333,55.79216,-3.2429
3,2018-09-27,Barnstaple A39,Barnstaple,GB,9.764706,51.074793,-4.041924
4,2018-09-27,Belfast Centre,Belfast,GB,13.529412,54.59965,-5.928833


In [5]:
UK_PM10 = UK_PM10[UK_PM10['country'] == 'GB']

In [6]:
# Locations of Interest

locations = UK_PM10.groupby('location').first().reset_index()
locations.head()

Unnamed: 0,location,Date,city,country,PM10,Lat,Long
0,Aberdeen,2018-09-27,Aberdeen,GB,6.764706,57.15736,-2.094278
1,Armagh Roadside,2018-09-27,Armagh,GB,13.941176,54.353728,-6.654558
2,Auchencorth Moss,2018-09-27,Auchencorth,GB,3.533333,55.79216,-3.2429
3,Barnstaple A39,2018-09-27,Barnstaple,GB,9.764706,51.074793,-4.041924
4,Belfast Centre,2018-09-27,Belfast,GB,13.529412,54.59965,-5.928833


In [7]:
############################ A utility function to pull data for a set of locations ######################################
def sample(im, prop, lats, lons, scale=100, reducer=ee.Reducer.first(), tileScale=4):
    points = []
    for lat, lon in zip(lats, lons):
        xy = ee.Geometry.Point([lon, lat])
        points.append(xy.buffer(scale))
    vals = im.reduceRegions(collection=ee.FeatureCollection(points), scale=scale, reducer=reducer, tileScale=tileScale).getInfo()
    if prop == '':
        return [v['properties'] for v in vals['features']]
    return [v['properties'][prop] for v in vals['features']]

#############################################################################################################################

In [8]:
############################ Population Density ############################################################################
def add_static_vars(df, scale = 2000):
    lights = ee.ImageCollection("NOAA/DMSP-OLS/CALIBRATED_LIGHTS_V4").filter(ee.Filter.date('2010-01-01', '2018-03-08')).first()
    pop = ee.ImageCollection("CIESIN/GPWv411/GPW_UNWPP-Adjusted_Population_Density").filter(ee.Filter.date('2010-01-01', '2018-03-08')).first()
    ims = [lights, pop]

    for im in tqdm(ims):
        for i, reducer in enumerate([ee.Reducer.mean(), ee.Reducer.min(), ee.Reducer.max()]):
            sampled_values = sample(im, '', df['Lat'].values, df['Long'].values, reducer=reducer)
            for k in sampled_values[0].keys():
                arr = ['mean', 'min', 'max']
                df[k+'_'+ arr[i]] = [sv[k] if k in sv.keys() else None for sv in sampled_values]
                if k == arr[i]:
                    df = df.rename(columns={k+'_'+ arr[i]:'pop_density2010'+'_'+ arr[i]})
    return df

locations_w_population = add_static_vars(locations.copy())

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.18s/it]


In [17]:
locations_w_population.head()

Unnamed: 0,location,Date,city,country,PM10,Lat,Long,avg_vis_mean,cf_cvg_mean,avg_vis_min,cf_cvg_min,avg_vis_max,cf_cvg_max,pop_density2010_mean,pop_density2010_min,pop_density2010_max
0,Aberdeen,2018-09-27,Aberdeen,GB,6.764706,57.15736,-2.094278,334.982815,13.0,332.663116,13,332.663116,13,8788.521484,8788.521484,8788.521484
1,Armagh Roadside,2018-09-27,Armagh,GB,13.941176,54.353728,-6.654558,59.158239,12.0,40.966034,12,74.85836,12,1216.19812,1216.19812,1216.19812
2,Auchencorth Moss,2018-09-27,Auchencorth,GB,3.533333,55.79216,-3.2429,12.667045,48.0,12.667045,48,12.667045,48,7.703879,7.703878,7.703879
3,Barnstaple A39,2018-09-27,Barnstaple,GB,9.764706,51.074793,-4.041924,37.478218,62.0,37.478218,62,37.478218,62,3878.105249,3026.625244,4439.924805
4,Belfast Centre,2018-09-27,Belfast,GB,13.529412,54.59965,-5.928833,288.115729,15.0,284.262726,15,289.223877,15,2318.533691,2303.730713,2303.730713


In [9]:
locations_w_population = locations_w_population.drop(columns=['Date', 'PM10', 'Lat', 'Long', 'city','country'], axis=1)

In [10]:
world_w_population = pd.merge(UK_PM10, locations_w_population, on='location')

In [11]:
################################################## Vegetation index ########################################################
def add_static_vars2(df, scale = 8000):
    vegs = ee.ImageCollection("NOAA/CDR/AVHRR/NDVI/V5").filter(ee.Filter.date('2018-09-27', '2021-04-01')).first()
    veg_list = [vegs]

    for veg in tqdm(veg_list):
        for i, reducer in enumerate([ee.Reducer.mean(), ee.Reducer.min(), ee.Reducer.max()]):
            sampled_values = sample(veg, '', df['Lat'].values, df['Long'].values, reducer=reducer, scale=scale)
            for k in sampled_values[0].keys():
                arr = ['mean', 'min', 'max']
                df[k+'_'+str(scale)+'_' + arr[i]] = [sv[k] if k in sv.keys() else None for sv in sampled_values]
                if k == arr[i]:
                    df = df.rename(columns={k+'_'+str(scale)+'_' + arr[i]:'veg_index'+'_'+str(scale)+'_' + arr[i]})
    return df

locations_w_veg = add_static_vars2(locations.copy())

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.86s/it]


In [12]:
locations_w_veg = locations_w_veg.drop(['Date', 'PM10', 'Lat', 'Long', 'city','country'], axis = 1)
locations_w_veg.head(3)

Unnamed: 0,location,NDVI_8000_mean,QA_8000_mean,TIMEOFDAY_8000_mean,NDVI_8000_min,QA_8000_min,TIMEOFDAY_8000_min,NDVI_8000_max,QA_8000_max,TIMEOFDAY_8000_max
0,Aberdeen,1220.846695,-310.897349,1556.695652,-288,-32632,1428,3437,16768,1596
1,Armagh Roadside,-120.392099,16514.0,1574.598391,-513,16514,1595,222,16514,1595
2,Auchencorth Moss,318.642001,16514.0,1513.44186,0,16514,1428,981,16514,1595


In [13]:
world_w_veg = pd.merge(UK_PM10, locations_w_veg, on='location')

In [14]:
world_w_veg.head(3)

Unnamed: 0,Date,location,city,country,PM10,Lat,Long,NDVI_8000_mean,QA_8000_mean,TIMEOFDAY_8000_mean,NDVI_8000_min,QA_8000_min,TIMEOFDAY_8000_min,NDVI_8000_max,QA_8000_max,TIMEOFDAY_8000_max
0,2018-09-27,Aberdeen,Aberdeen,GB,6.764706,57.15736,-2.094278,1220.846695,-310.897349,1556.695652,-288,-32632,1428,3437,16768,1596
1,2018-09-28,Aberdeen,Aberdeen,GB,8.190476,57.15736,-2.094278,1220.846695,-310.897349,1556.695652,-288,-32632,1428,3437,16768,1596
2,2018-09-29,Aberdeen,Aberdeen,GB,7.388889,57.15736,-2.094278,1220.846695,-310.897349,1556.695652,-288,-32632,1428,3437,16768,1596


In [15]:
################################################ Elevation ###################################################################

def add_static_vars3(df, scale = 2000):
    elevations = ee.Image('USGS/GMTED2010').select('be75')
    elevation_list = [elevations]

    for elevation in tqdm(elevation_list):
        for i, reducer in enumerate([ee.Reducer.mean(), ee.Reducer.min(), ee.Reducer.max()]):
            sampled_values = sample(elevation, '', df['Lat'].values, df['Long'].values, reducer=reducer, scale=scale)
            for k in sampled_values[0].keys():
                arr = ['mean', 'min', 'max']
                df[k+'_'+str(scale)+'_' + arr[i]] = [sv[k] if k in sv.keys() else None for sv in sampled_values]
                if k == arr[i]:
                    df = df.rename(columns={k+'_'+str(scale)+'_' + arr[i]:'elevation'+'_'+str(scale)+'_' + arr[i]})
    return df

locations_w_elevation = add_static_vars3(locations.copy())

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.54s/it]


In [16]:
locations_w_elevation = locations_w_elevation.drop(['Date', 'PM10', 'Lat', 'Long', 'city','country'],axis = 1)
locations_w_elevation.head(3)

Unnamed: 0,location,elevation_2000_mean,elevation_2000_min,elevation_2000_max
0,Aberdeen,21.609085,5,31
1,Armagh Roadside,51.233138,41,72
2,Auchencorth Moss,255.626686,230,273


In [17]:
world_w_elevation = pd.merge(UK_PM10, locations_w_elevation, on='location')
world_w_elevation.head(3)

##############################################################################################################################

Unnamed: 0,Date,location,city,country,PM10,Lat,Long,elevation_2000_mean,elevation_2000_min,elevation_2000_max
0,2018-09-27,Aberdeen,Aberdeen,GB,6.764706,57.15736,-2.094278,21.609085,5,31
1,2018-09-28,Aberdeen,Aberdeen,GB,8.190476,57.15736,-2.094278,21.609085,5,31
2,2018-09-29,Aberdeen,Aberdeen,GB,7.388889,57.15736,-2.094278,21.609085,5,31


In [32]:
############################################## Sentinel 5 Data ###############################################################




# Image Collections
gfs = ee.ImageCollection("NOAA/GFS0P25") # Weather data

S5p_collections = {} # Sentinel 5p data, which comes in multiple collections
for COL in ['L3_NO2', 'L3_O3', 'L3_CO', 'L3_AER_AI']: # 
    S5p_collections[COL] = ee.ImageCollection('COPERNICUS/S5P/OFFL/'+COL).map(lambda image: image.addBands\
                                                                              (image.metadata('system:time_start')))
# Properties for each image we want to keep
s5p_props = {
    'L3_NO2':['NO2_column_number_density', 'tropospheric_NO2_column_number_density', 'stratospheric_NO2_column_number_density',\
              'NO2_slant_column_number_density', 'tropopause_pressure'],
    'L3_O3':['O3_column_number_density'],
    'L3_CO':['CO_column_number_density', 'H2O_column_number_density', 'cloud_height'],
    'L3_AER_AI':['absorbing_aerosol_index']
}

def add_timeseries(df, dates, reducer=ee.Reducer.first()):
    # Prepare dataframe with date x city
    date_col = []
    location_col = []
    for d in dates:
        for c in df.location.unique():
            date_col.append(d)
            location_col.append(c)

    data = pd.DataFrame({
        'Date':date_col,
        'location':location_col
    })
    data = pd.merge(data, df[['location', 'Lat', 'Long']], how='left', on='location')
    
    for d in tqdm(dates):
        # Weather is easy - a single image from the right date
        weather_image = gfs.filter(ee.Filter.date(str(d.date()), str((d+timedelta(days=1)).date()))).first() # Filter to get the relevant image
        # For the sentinel data, we get images from each collection and merge them
        s5p_images = []
        for COL in ['L3_NO2', 'L3_O3', 'L3_CO', 'L3_AER_AI']:
            collection = S5p_collections[COL].filter(ee.Filter.date(str((d-timedelta(days=5)).date()), str(d.date())))
            image = collection.qualityMosaic('system:time_start') # The most recent image
            image = image.select(s5p_props[COL])
            s5p_images.append(image)
        s5p_image = ee.ImageCollection(s5p_images).toBands() # Merge into one image
    
        # Sample the weather data
        samples = sample(weather_image, '', df['Lat'].values, df['Long'].values, reducer=reducer)
        for prop in samples[0].keys():
            data.loc[data.Date==d, prop] = [p[prop] for p in samples]
            
        # Sample the sentinel data
        samples = sample(s5p_image, '', df['Lat'].values, df['Long'].values)
        for prop in samples[0].keys():
            data.loc[data.Date==d, prop] = [p[prop] for p in samples]
            
    return data

In [54]:
dates_2018 = pd.date_range('2018-09-27','2018-12-31' , freq='1D')
dates_2019 = pd.date_range('2019-01-01','2019-12-31' , freq='1D')
dates_2020 = pd.date_range('2020-01-01','2020-03-31' , freq='1D')

In [53]:
UK_PM10.iloc[17950:]

Unnamed: 0,Date,location,city,country,PM10,Lat,Long
17950,2019-12-31,York Fishergate,York,GB,22.583333,53.951889,-1.075861
17951,2020-01-01,Aberdeen,Aberdeen,GB,10.500000,57.157360,-2.094278
17952,2020-01-01,Armagh Roadside,Armagh,GB,46.260870,54.353728,-6.654558
17953,2020-01-01,Auchencorth Moss,Auchencorth,GB,8.041667,55.792160,-3.242900
17954,2020-01-01,Barnstaple A39,Barnstaple,GB,24.875000,51.074793,-4.041924
...,...,...,...,...,...,...,...
24575,2020-03-31,Warrington,Warrington,GB,11.000000,53.389280,-2.615358
24576,2020-03-31,Wigan Centre,Wigan,GB,9.000000,53.549140,-2.638139
24577,2020-03-31,Wirral Tranmere,Liverpool,GB,9.000000,53.372870,-3.022722
24578,2020-03-31,Wrexham,Wrexham,GB,11.000000,53.042220,-3.002778


In [55]:
# 2018 - index 0 to 1106
UK1_PM10 = UK_PM10.iloc[:1106]

# 2019 - index 1106 to 17951
UK2_PM10 = UK_PM10.iloc[1106:5106]
UK3_PM10 = UK_PM10.iloc[5106:9106]
UK4_PM10 = UK_PM10.iloc[9106:13106]
UK5_PM10 = UK_PM10.iloc[13106:17106]
UK6_PM10 = UK_PM10.iloc[17106:17951]

# 2020
UK7_PM10 = UK_PM10.iloc[17951:21951]
UK8_PM10 = UK_PM10.iloc[21951:]

In [56]:
ts1_PM10 = add_timeseries(UK1_PM10, dates_2018)
ts1_PM10.to_csv('ts1_PM10_train.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [06:14<00:00,  3.90s/it]


In [57]:
ts2_PM10 = add_timeseries(UK2_PM10, dates_2019)
ts2_PM10.to_csv('ts2_PM10_train.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [59:06<00:00,  9.72s/it]


In [58]:
ts3_PM10 = add_timeseries(UK3_PM10, dates_2019)
ts3_PM10.to_csv('ts3_PM10_train.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [57:40<00:00,  9.48s/it]


In [59]:
ts4_PM10 = add_timeseries(UK4_PM10, dates_2019)
ts4_PM10.to_csv('ts4_PM10_train.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [52:28<00:00,  8.63s/it]


In [60]:
ts5_PM10 = add_timeseries(UK5_PM10, dates_2019)
ts5_PM10.to_csv('ts5_PM10_train.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [52:09<00:00,  8.57s/it]


In [61]:
ts6_PM10 = add_timeseries(UK6_PM10, dates_2019)
ts6_PM10.to_csv('ts6_PM10_train.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [20:33<00:00,  3.38s/it]


In [62]:
ts7_PM10 = add_timeseries(UK7_PM10, dates_2020)
ts7_PM10.to_csv('ts7_PM10_train.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [13:47<00:00,  9.10s/it]


In [63]:
ts8_PM10 = add_timeseries(UK8_PM10, dates_2020)
ts8_PM10.to_csv('ts8_PM10_train.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [09:57<00:00,  6.57s/it]


In [56]:
ts1_PM10 = pd.read_csv('ts1_PM10_train.csv')
ts1_PM10_groupby = ts1_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts1_PM10_groupby = ts1_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts1_PM10_groupby.to_csv('ts1_PM10_train_groupby.csv')
ts1_PM10_groupby.shape

(2016, 20)

In [57]:
ts2_PM10 = pd.read_csv('ts2_PM10_train.csv')
ts2_PM10_groupby = ts2_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts2_PM10_groupby = ts2_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts2_PM10_groupby.to_csv('ts2_PM10_train_groupby.csv')
ts2_PM10_groupby.shape

(13140, 20)

In [58]:
ts3_PM10 = pd.read_csv('ts3_PM10_train.csv')
ts3_PM10_groupby = ts3_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts3_PM10_groupby = ts3_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts3_PM10_groupby.to_csv('ts3_PM10_train_groupby.csv')
ts3_PM10_groupby.shape

(25550, 20)

In [59]:
ts4_PM10 = pd.read_csv('ts4_PM10_train.csv')
ts4_PM10_groupby = ts4_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts4_PM10_groupby = ts4_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts4_PM10_groupby.to_csv('ts4_PM10_train_groupby.csv')
ts4_PM10_groupby.shape

(27010, 20)

In [60]:
ts5_PM10 = pd.read_csv('ts5_PM10_train.csv')
ts5_PM10_groupby = ts5_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts5_PM10_groupby = ts5_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts5_PM10_groupby.to_csv('ts5_PM10_train_groupby.csv')
ts5_PM10_groupby.shape

(27010, 20)

In [61]:
ts6_PM10 = pd.read_csv('ts6_PM10_train.csv')
ts6_PM10_groupby = ts6_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts6_PM10_groupby = ts6_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts6_PM10_groupby.to_csv('ts6_PM10_train_groupby.csv')
ts6_PM10_groupby.shape

(26645, 20)

In [62]:
ts7_PM10 = pd.read_csv('ts7_PM10_train.csv')
ts7_PM10_groupby = ts7_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts7_PM10_groupby = ts7_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts7_PM10_groupby.to_csv('ts7_PM10_train_groupby.csv')
ts7_PM10_groupby.shape

(7007, 20)

In [63]:
ts8_PM10 = pd.read_csv('ts8_PM10_train.csv')
ts8_PM10_groupby = ts8_PM10.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))
ts8_PM10_groupby = ts8_PM10_groupby.drop('Unnamed: 0',axis = 1)
ts8_PM10_groupby.to_csv('ts8_PM10_train_groupby.csv')
ts8_PM10_groupby.shape

(7007, 20)

In [64]:
group = [ts1_PM10_groupby,ts2_PM10_groupby,ts3_PM10_groupby,ts4_PM10_groupby,ts5_PM10_groupby,ts6_PM10_groupby,ts7_PM10_groupby,ts8_PM10_groupby]
group_timeseries = pd.concat(group)
group_timeseries.to_csv('ts_PM10_groupby_all.csv', index=False)

In [65]:
group_timeseries

Unnamed: 0,location,Date,Lat,Long,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,0_NO2_column_number_density,0_NO2_slant_column_number_density,0_stratospheric_NO2_column_number_density,0_tropopause_pressure,0_tropospheric_NO2_column_number_density,1_O3_column_number_density,2_CO_column_number_density,2_H2O_column_number_density,2_cloud_height,3_absorbing_aerosol_index
0,Aberdeen,2018-09-27,57.157360,-2.094278,23.255320,89.427661,0.008697,13.653708,3.786878,3.440247,0.000057,0.000138,0.000039,12898.770623,0.000015,0.113651,0.028194,1337.110130,324.160284,-1.390420
1,Aberdeen,2018-09-28,57.157360,-2.094278,8.919149,85.722955,0.005021,6.282986,1.911551,-0.805813,0.000064,0.000137,0.000040,14478.561874,0.000015,0.122597,0.027934,1330.971686,309.657850,-1.537680
2,Aberdeen,2018-09-29,57.157360,-2.094278,15.993617,84.827661,0.005108,6.751541,2.340752,1.596252,0.000073,0.000188,0.000041,13524.785691,0.000019,0.132621,0.032483,537.321218,34.176636,-0.453005
3,Aberdeen,2018-09-30,57.157360,-2.094278,10.525532,86.162767,0.005997,8.624053,4.391677,-0.440455,0.000060,0.000198,0.000045,13389.998598,0.000021,0.124832,0.030707,501.726865,465.395359,-0.640018
4,Aberdeen,2018-10-01,57.157360,-2.094278,9.844311,86.417022,0.004921,5.851817,3.032515,-1.645190,0.000063,0.000188,0.000048,20685.284353,0.000018,0.146016,0.030200,647.773253,738.735490,-0.868015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7002,York Fishergate,2020-03-27,53.951889,-1.075861,7.505556,80.300001,0.003905,3.595739,-1.800300,-1.595661,0.000122,0.000232,0.000042,16588.585195,0.000076,0.156670,0.039273,533.693178,435.087312,-1.030407
7003,York Fishergate,2020-03-28,53.951889,-1.075861,9.391667,79.602779,0.004016,4.181032,-2.311500,-3.736741,0.000093,0.000197,0.000041,16782.219951,0.000051,0.161038,0.039598,645.047130,379.933920,-1.372574
7004,York Fishergate,2020-03-29,53.951889,-1.075861,5.772222,70.813890,0.003082,2.306375,-2.491352,-4.997044,0.000064,0.000141,0.000041,17581.644063,0.000042,0.171794,0.038786,486.002414,1115.962200,-1.264324
7005,York Fishergate,2020-03-30,53.951889,-1.075861,9.119445,75.413890,0.003118,1.585222,0.495904,-2.529340,0.000065,0.000133,0.000042,18408.356228,0.000031,0.173615,0.039768,381.810120,1802.998179,-1.625593


In [66]:
group_timeseries = pd.read_csv('ts_PM10_groupby_all.csv')

In [67]:
group_timeseries.columns

Index(['location', 'Date', 'Lat', 'Long',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground', '0_NO2_column_number_density',
       '0_NO2_slant_column_number_density',
       '0_stratospheric_NO2_column_number_density', '0_tropopause_pressure',
       '0_tropospheric_NO2_column_number_density',
       '1_O3_column_number_density', '2_CO_column_number_density',
       '2_H2O_column_number_density', '2_cloud_height',
       '3_absorbing_aerosol_index'],
      dtype='object')

In [35]:
# Vegetation & Elevation

veg_elev = pd.merge(world_w_veg, world_w_elevation, on = ['location', 'Date', 'Lat', 'Long'])

In [37]:
veg_elev = veg_elev.drop(['city_y', 'country_y', 'PM10_y'], axis = 1)
veg_elev = veg_elev.rename(columns = {'city_x':'city', 'country_x':'country', 'PM10_x':'PM10'})

veg_elev.columns

Index(['Date', 'location', 'city', 'country', 'PM10', 'Lat', 'Long',
       'NDVI_8000_mean', 'QA_8000_mean', 'TIMEOFDAY_8000_mean',
       'NDVI_8000_min', 'QA_8000_min', 'TIMEOFDAY_8000_min', 'NDVI_8000_max',
       'QA_8000_max', 'TIMEOFDAY_8000_max', 'elevation_2000_mean',
       'elevation_2000_min', 'elevation_2000_max'],
      dtype='object')

In [38]:
# Vegetation, Population & Elevation

veg_elev_pop = pd.merge(veg_elev, world_w_population, on = ['location', 'Date'])

In [40]:
veg_elev_pop = veg_elev_pop.drop(['city_y', 'country_y', 'PM10_y', 'Lat_y', 'Long_y'], axis = 1)
veg_elev_pop = veg_elev_pop.rename(columns = {'city_x':'city', 'country_x':'country', 'PM10_x':'PM10', 'Lat_x':'Lat', 'Long_x':'Long'})

veg_elev_pop.columns

Index(['Date', 'location', 'city', 'country', 'PM10', 'Lat', 'Long',
       'NDVI_8000_mean', 'QA_8000_mean', 'TIMEOFDAY_8000_mean',
       'NDVI_8000_min', 'QA_8000_min', 'TIMEOFDAY_8000_min', 'NDVI_8000_max',
       'QA_8000_max', 'TIMEOFDAY_8000_max', 'elevation_2000_mean',
       'elevation_2000_min', 'elevation_2000_max', 'avg_vis_mean',
       'cf_cvg_mean', 'avg_vis_min', 'cf_cvg_min', 'avg_vis_max', 'cf_cvg_max',
       'pop_density2010_mean', 'pop_density2010_min', 'pop_density2010_max'],
      dtype='object')

In [72]:
# Vegetation, Population, Sentinel & Elevation

veg_elev_pop_sat = pd.merge(veg_elev_pop, group_timeseries, on = ['location', 'Date'])

In [73]:
veg_elev_pop_sat = veg_elev_pop_sat.drop(['Lat_y', 'Long_y'], axis = 1)
veg_elev_pop_sat = veg_elev_pop_sat.rename(columns = {'Lat_x':'Lat', 'Long_x':'Long'})

In [74]:
veg_elev_pop_sat.columns

Index(['Date', 'location', 'city', 'country', 'PM10', 'Lat', 'Long',
       'NDVI_8000_mean', 'QA_8000_mean', 'TIMEOFDAY_8000_mean',
       'NDVI_8000_min', 'QA_8000_min', 'TIMEOFDAY_8000_min', 'NDVI_8000_max',
       'QA_8000_max', 'TIMEOFDAY_8000_max', 'elevation_2000_mean',
       'elevation_2000_min', 'elevation_2000_max', 'avg_vis_mean',
       'cf_cvg_mean', 'avg_vis_min', 'cf_cvg_min', 'avg_vis_max', 'cf_cvg_max',
       'pop_density2010_mean', 'pop_density2010_min', 'pop_density2010_max',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground', '0_NO2_column_number_density',
       '0_NO2_slant_column_number_density',
       '0_stratospheric_NO2_column_number_density', '0_tropopause_pressure',
       '0_tropospheric_NO2_column_number_density',
       '1_O3_column_number_density'

In [75]:
veg_elev_pop_sat.shape

(91925, 44)

In [77]:
data_PM10_all = veg_elev_pop_sat.groupby(['location', 'Date'], as_index=False).mean(lambda x: set(x))

In [78]:
data_PM10_all.columns

Index(['location', 'Date', 'PM10', 'Lat', 'Long', 'NDVI_8000_mean',
       'QA_8000_mean', 'TIMEOFDAY_8000_mean', 'NDVI_8000_min', 'QA_8000_min',
       'TIMEOFDAY_8000_min', 'NDVI_8000_max', 'QA_8000_max',
       'TIMEOFDAY_8000_max', 'elevation_2000_mean', 'elevation_2000_min',
       'elevation_2000_max', 'avg_vis_mean', 'cf_cvg_mean', 'avg_vis_min',
       'cf_cvg_min', 'avg_vis_max', 'cf_cvg_max', 'pop_density2010_mean',
       'pop_density2010_min', 'pop_density2010_max',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground', '0_NO2_column_number_density',
       '0_NO2_slant_column_number_density',
       '0_stratospheric_NO2_column_number_density', '0_tropopause_pressure',
       '0_tropospheric_NO2_column_number_density',
       '1_O3_column_number_density', '2_CO_column_numb

In [79]:
data_PM10_all.shape

(24580, 42)

In [82]:
data_PM10_all.to_csv('data_PM10_all_train.csv')