In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# Root directory holding one sub-directory per WeatherBench variable.
# Set the WEATHERBENCH_DATADIR environment variable to run on another machine;
# the original cluster path stays the default. os.path.join(..., '') guarantees
# the trailing separator that later f-strings such as f'{DATADIR}constants/*.nc'
# rely on.
DATADIR = os.path.join(
    os.environ.get(
        'WEATHERBENCH_DATADIR',
        '/rds/general/user/mc4117/home/WeatherBench/data/',
    ),
    '',
)

In [3]:
# List the contents of DATADIR (IPython shell escape; $DATADIR is filled in
# from the Python variable of the same name).
! ls $DATADIR

10m_u_component_of_wind  potential_vorticity	       total_cloud_cover
10m_v_component_of_wind  relative_humidity	       total_precipitation
2m_temperature		 specific_humidity	       u_component_of_wind
constants		 temperature		       v_component_of_wind
geopotential		 temperature_850	       vorticity
geopotential_500	 toa_incident_solar_radiation


In [4]:
def covariance(x, y, dim=None):
    """Covariance of ``x`` and ``y`` along ``dim`` over pairwise-complete points.

    Parameters
    ----------
    x, y : xarray.DataArray
        Arrays to compare. Points where either array is null are excluded
        from the means, the sum of products and the divisor alike.
    dim : str or sequence of str, optional
        Dimension(s) to reduce over; passed unchanged to ``sum``, ``mean``
        and ``xr.dot``.

    Returns
    -------
    xarray.DataArray
        Sum of products of the demeaned inputs divided by the number of
        jointly valid points (no degrees-of-freedom correction).
    """
    valid_values = x.notnull() & y.notnull()
    valid_count = valid_values.sum(dim)

    # Restrict both inputs to the jointly valid points *before* demeaning.
    # Otherwise each mean is taken over that array's own valid points while
    # the divisor counts only the shared ones, which skews the result
    # whenever the two NaN patterns differ.
    x = x.where(valid_values)
    y = y.where(valid_values)

    # Masked points become NaN after demeaning; zero them so they add
    # nothing to the dot product.
    demeaned_x = (x - x.mean(dim)).fillna(0)
    demeaned_y = (y - y.mean(dim)).fillna(0)

    return xr.dot(demeaned_x, demeaned_y, dims=dim) / valid_count

def correlation(x, y, dim=None):
    """Pearson correlation of ``x`` and ``y`` along ``dim``.

    Parameters
    ----------
    x, y : xarray.DataArray
        Arrays to compare. Points where either array is null are ignored.
    dim : str or sequence of str, optional
        Dimension(s) to reduce over; forwarded to ``covariance`` and ``std``.

    Returns
    -------
    xarray.DataArray
        ``covariance(x, y, dim)`` divided by the product of the two standard
        deviations.
    """
    # TODO: dim should default to the intersection of x.dims and y.dims

    # Mask both inputs to the jointly valid points so that the covariance and
    # both standard deviations are computed over the same set of samples.
    valid_values = x.notnull() & y.notnull()
    x = x.where(valid_values)
    y = y.where(valid_values)

    return covariance(x, y, dim) / (x.std(dim) * y.std(dim))

In [5]:
def _open_variable(folder):
    """Open all ``*.nc`` files in ``DATADIR/<folder>`` as a single dataset.

    ``folder`` is the name of a sub-directory of DATADIR; the files are
    combined with ``combine='by_coords'``.
    """
    return xr.open_mfdataset(f'{DATADIR}{folder}/*.nc', combine='by_coords')


# One dataset per variable directory; the names on the left are used by the
# cells below.
u_wind = _open_variable('10m_u_component_of_wind')
v_wind = _open_variable('10m_v_component_of_wind')
temp_2m = _open_variable('2m_temperature')
constants = _open_variable('constants')
geo_500 = _open_variable('geopotential_500')
pot_vort = _open_variable('potential_vorticity')
rel_hum = _open_variable('relative_humidity')
spec_hum = _open_variable('specific_humidity')
temp850 = _open_variable('temperature_850')
solar = _open_variable('toa_incident_solar_radiation')
cloud_cover = _open_variable('total_cloud_cover')
precip = _open_variable('total_precipitation')
vort = _open_variable('vorticity')

In [6]:
# Each candidate predictor paired with the label used for it in the results
# table. Multi-level fields are sampled at the 500 and 850 levels.
named_fields = [
    ('u_wind', u_wind.u10),
    ('v_wind', v_wind.v10),
    ('temp_2m', temp_2m.t2m),
    ('geo_500', geo_500.z),
    ('pot_vort_500', pot_vort.pv.sel(level=500)),
    ('pot_vort_850', pot_vort.pv.sel(level=850)),
    ('rel_hum_500', rel_hum.r.sel(level=500)),
    ('rel_hum_850', rel_hum.r.sel(level=850)),
    ('spec_hum_500', spec_hum.q.sel(level=500)),
    ('spec_hum_850', spec_hum.q.sel(level=850)),
    ('temp850', temp850.t),
    ('solar', solar.tisr),
    ('cloud_cover', cloud_cover.tcc),
    ('precip', precip.tp),
    ('vort_500', vort.vo.sel(level=500)),
    ('vort_850', vort.vo.sel(level=850)),
]

# Parallel lists kept under their original names because later cells extend
# and read them.
variables = [field for _, field in named_fields]
variables_str = [label for label, _ in named_fields]

# Correlation of every candidate with the two target fields; `.values`
# evaluates each result to a NumPy value.
correlation_tmp_850 = [correlation(field, temp850.t).values for field in variables]
correlation_z500 = [correlation(field, geo_500.z).values for field in variables]


In [7]:
def _mean_spatial_correlation(static_field, series, step=100):
    """Mean correlation of ``static_field`` with sampled time slices of ``series``.

    Takes the slices at time indices ``0, step, 2*step, ...`` (``len(series) //
    step`` slices in total), correlates each with ``static_field`` and returns
    the mean of those correlations.
    """
    n_samples = len(series) // step  # plain int maths; np.int no longer exists in NumPy >= 1.24
    return np.mean([
        correlation(static_field, series.isel(time=step * k)).values
        for k in range(n_samples)
    ])


# The constant fields have no time axis here, so they are correlated against
# individual time slices of each target and the results averaged.
# NOTE: this cell appends to the lists built above, so re-running it without
# re-running the previous cell adds the same three rows again.
for _name in ('orography', 'lsm', 'slt'):
    _field = constants[_name]
    variables_str.append(_name)
    correlation_tmp_850.append(_mean_spatial_correlation(_field, temp850.t))
    correlation_z500.append(_mean_spatial_correlation(_field, geo_500.z))


In [8]:
# One row per variable. All of variables_str is used: slicing off the last
# entry left the final correlation row ('slt') without a name. Building from
# a dict also makes pandas raise if the three lists ever differ in length.
df = pd.DataFrame({
    'Variable names': variables_str,
    'Corr tmp_850': np.asarray(correlation_tmp_850, dtype=float),
    'Corr z500': np.asarray(correlation_z500, dtype=float),
})

def color_negative_red(value):
    """Return a CSS ``color`` declaration chosen by the magnitude of ``value``.

    Despite the name, the sign is ignored: ``|value| < 0.25`` gives red,
    ``|value| > 0.5`` gives green, and anything else gives black.
    """
    magnitude = abs(value)

    if magnitude > 0.5:
        shade = 'green'
    elif magnitude < 0.25:
        shade = 'red'
    else:
        shade = 'black'

    return 'color: %s' % shade

# Colour the two correlation columns cell by cell. Styler.applymap was renamed
# to Styler.map in pandas 2.1 and the old name is deprecated, so use the new
# method when it exists and fall back to the old one otherwise.
_styler = df.style
_style_cells = getattr(_styler, 'map', None) or _styler.applymap
_style_cells(color_negative_red, subset=['Corr tmp_850', 'Corr z500'])



Unnamed: 0,Variable names,Corr tmp_850,Corr z500
0,u_wind,-0.135888,-0.184077
1,v_wind,-0.050567,-0.032529
2,temp_2m,0.946879,0.881812
3,geo_500,0.934679,1.0
4,pot_vort_500,0.149889,0.20243
5,pot_vort_850,0.231328,0.219269
6,rel_hum_500,-0.412299,-0.459123
7,rel_hum_850,-0.351307,-0.338311
8,spec_hum_500,0.535203,0.504809
9,spec_hum_850,0.760842,0.735187


Solar radiation and cloud cover are closely inversely correlated, so only one of them is needed.
As solar radiation simply follows the daily pattern of the sun, it could be used as a variable to account for diurnal variations.