In the important variables the correlation increases as the number of days increases

In [1]:
import xarray as xr

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
DATADIR = '/rds/general/user/mc4117/home/WeatherBench/data/'  

In [3]:
! ls $DATADIR

10m_u_component_of_wind  potential_vorticity	       total_cloud_cover
10m_v_component_of_wind  relative_humidity	       total_precipitation
2m_temperature		 specific_humidity	       u_component_of_wind
constants		 temperature		       v_component_of_wind
geopotential		 temperature_850	       vorticity
geopotential_500	 toa_incident_solar_radiation


In [4]:
def covariance(x, y, dim=None):
    valid_values = x.notnull() & y.notnull()
    valid_count = valid_values.sum(dim)

    demeaned_x = (x - x.mean(dim)).fillna(0)
    demeaned_y = (y - y.mean(dim)).fillna(0)
    
    return xr.dot(demeaned_x, demeaned_y, dims=dim) / valid_count

def correlation(x, y, dim=None):
    # dim should default to the intersection of x.dims and y.dims
    return covariance(x, y, dim) / (x.std(dim) * y.std(dim))

In [5]:
def color_negative_red(value):

    if abs(value) < 0.25:
        color = 'red'
    elif abs(value) > 0.5:
        color = 'green'
    else:
        color = 'black'

    return 'color: %s' % color

In [6]:
u_wind = xr.open_mfdataset(f'{DATADIR}10m_u_component_of_wind/*.nc', combine='by_coords')
v_wind = xr.open_mfdataset(f'{DATADIR}10m_v_component_of_wind/*.nc', combine='by_coords')
temp_2m = xr.open_mfdataset(f'{DATADIR}2m_temperature/*.nc', combine='by_coords')
constants = xr.open_mfdataset(f'{DATADIR}constants/*.nc', combine='by_coords')
geo_500 = xr.open_mfdataset(f'{DATADIR}geopotential_500/*.nc', combine='by_coords')
pot_vort = xr.open_mfdataset(f'{DATADIR}potential_vorticity/*.nc', combine='by_coords')
rel_hum = xr.open_mfdataset(f'{DATADIR}relative_humidity/*.nc', combine='by_coords')
spec_hum = xr.open_mfdataset(f'{DATADIR}specific_humidity/*.nc', combine='by_coords')
temp850 = xr.open_mfdataset(f'{DATADIR}temperature_850/*.nc', combine='by_coords')
solar = xr.open_mfdataset(f'{DATADIR}toa_incident_solar_radiation/*.nc', combine='by_coords')
cloud_cover = xr.open_mfdataset(f'{DATADIR}total_cloud_cover/*.nc', combine='by_coords')
precip = xr.open_mfdataset(f'{DATADIR}total_precipitation/*.nc', combine='by_coords')
vort = xr.open_mfdataset(f'{DATADIR}vorticity/*.nc', combine='by_coords')

In [7]:
geo_500_north = geo_500.sel(lat=slice(23, 90))
geo_500_south = geo_500.sel(lat=slice(-90, -23))

t850_north = temp850.sel(lat=slice(23, 90))
t850_south = temp850.sel(lat=slice(-90, -23))

In [8]:
variables = [
             u_wind.u10, v_wind.v10, temp_2m.t2m, geo_500.z, pot_vort.pv.sel(level = 500), pot_vort.pv.sel(level=850),
             rel_hum.r.sel(level = 500), rel_hum.r.sel(level=850), spec_hum.q.sel(level=500),
             spec_hum.q.sel(level=850), temp850.t, solar.tisr, cloud_cover.tcc, precip.tp, 
             vort.vo.sel(level=500), vort.vo.sel(level=850)
            ]
variables_str = [
             'u_wind', 'v_wind', 'temp_2m', 'geo_500', 'pot_vort_500', 'pot_vort_850',
             'rel_hum_500', 'rel_hum_850', 'spec_hum_500',
             'spec_hum_850', 'temp850', 'solar', 'cloud_cover', 'precip', 
             'vort_500', 'vort_850'
            ]

def correlation_variables_table(data_list, y, lead_time):
    correlation_list = []
    
    y_corr = y.sel(time = slice(y.time[lead_time], None))
    
    for i in variables:
        i_known = i.sel(time = slice(None, i.time[-(lead_time +1)]))        
        correlation_list.append(correlation(i_known, y_corr).values)    

    return correlation_list


In [9]:
temp850_corr = correlation_variables_table(variables, temp850.t, lead_time = 0)
temp850_corr_north = correlation_variables_table(variables, t850_north.t, lead_time = 0)
temp850_corr_south = correlation_variables_table(variables, t850_south.t, lead_time = 0)

print('temp done')

geo500_corr = correlation_variables_table(variables, geo_500.z, lead_time = 0)
geo500_corr_north = correlation_variables_table(variables, geo_500_north.z, lead_time = 0)
geo500_corr_south = correlation_variables_table(variables, geo_500_south.z, lead_time = 0)

temp done


In [10]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(temp850_corr, columns = ['Corr_tmp_850 full']), pd.DataFrame(temp850_corr_north, columns = ['Corr_tmp_850 north']), pd.DataFrame(temp850_corr_south, columns = ['Corr_tmp_850 south'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_tmp_850 full', 'Corr_tmp_850 north', 'Corr_tmp_850 south'])

Unnamed: 0,Variable names,Corr_tmp_850 full,Corr_tmp_850 north,Corr_tmp_850 south
0,u_wind,-0.135888,-0.052226,0.140799
1,v_wind,-0.050567,0.068968,-0.234561
2,temp_2m,0.946879,0.725881,1.028051
3,geo_500,0.934679,0.742316,0.89827
4,pot_vort_500,0.149889,-0.239135,0.295109
5,pot_vort_850,0.231328,-0.04255,0.474614
6,rel_hum_500,-0.412299,-0.371132,-0.41884
7,rel_hum_850,-0.351307,-0.38509,-0.437378
8,spec_hum_500,0.535203,0.402129,0.279819
9,spec_hum_850,0.760842,0.522441,0.470096


In [11]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(geo500_corr, columns = ['Corr_geo_500 full']), pd.DataFrame(geo500_corr_north, columns = ['Corr_geo_500 north']), pd.DataFrame(geo500_corr_south, columns = ['Corr_geo_500 south'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_geo_500 full', 'Corr_geo_500 north', 'Corr_geo_500 south'])

Unnamed: 0,Variable names,Corr_geo_500 full,Corr_geo_500 north,Corr_geo_500 south
0,u_wind,-0.184077,-0.078783,0.056526
1,v_wind,-0.032529,0.044927,-0.148696
2,temp_2m,0.881812,0.676212,0.887969
3,geo_500,1.0,0.799011,1.00923
4,pot_vort_500,0.20243,-0.265711,0.305473
5,pot_vort_850,0.219269,-0.033909,0.361575
6,rel_hum_500,-0.459123,-0.420907,-0.486803
7,rel_hum_850,-0.338311,-0.396513,-0.4167
8,spec_hum_500,0.504809,0.379073,0.273226
9,spec_hum_850,0.735187,0.490002,0.451204


In [9]:
constant_variables_str = []
corr_z500_const = []
corr_z500_north_const = []
corr_z500_south_const = []

constant_variables_str.append('orography')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_z500_north_const.append(np.mean([correlation(constants.orography, geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.orography, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.orography, geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


constant_variables_str.append('lsm')
#corr_tmp_850_const.append(np.mean([correlation(constants.lsm, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_z500_north_const.append(np.mean([correlation(constants.lsm, geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.lsm, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.lsm, geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


constant_variables_str.append('slt')
#corr_tmp_850_const.append(np.mean([correlation(constants.slt, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_z500_north_const.append(np.mean([correlation(constants.slt, geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.slt, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.slt, geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


constant_variables_str.append('lat')
corr_z500_north_const.append(np.mean([correlation(constants.lat2d, geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.lat2d, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.lat2d, geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))

constant_variables_str.append('lon')
corr_z500_north_const.append(np.mean([correlation(constants.lon2d, geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.lon2d, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.lon2d, geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


#df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']), pd.DataFrame(corr_tmp_850_const, columns = ['Corr tmp_850']), pd.DataFrame(corr_z500_const, columns = ['Corr z500'])], axis = 1)

df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_z500_const, columns = ['Corr z500'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr z500'])


Unnamed: 0,Variable names,Corr z500
0,orography,-0.325298
1,lsm,-0.189865
2,slt,0.047839
3,lat,0.236026
4,lon,0.006956


In [11]:
df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_z500_const, columns = ['Corr z500']), pd.DataFrame(corr_z500_north_const, columns = ['Corr z500 north']), pd.DataFrame(corr_z500_south_const, columns = ['Corr z500 south'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr z500', 'Corr z500 north', 'Corr z500 south'])

Unnamed: 0,Variable names,Corr z500,Corr z500 north,Corr z500 south
0,orography,-0.325298,0.132768,-0.617528
1,lsm,-0.189865,0.122213,-0.450104
2,slt,0.047839,0.166779,-0.106196
3,lat,0.236026,-0.322735,0.339856
4,lon,0.006956,-0.010264,0.026402
