In the important variables the correlation increases as the number of days increases

In [1]:
import xarray as xr

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
DATADIR = '/rds/general/user/mc4117/home/WeatherBench/data/'  

In [3]:
! ls $DATADIR

10m_u_component_of_wind  potential_vorticity	       total_cloud_cover
10m_v_component_of_wind  relative_humidity	       total_precipitation
2m_temperature		 specific_humidity	       u_component_of_wind
constants		 temperature		       v_component_of_wind
geopotential		 temperature_850	       vorticity
geopotential_500	 toa_incident_solar_radiation


In [18]:
def covariance(x, y, dim=None):
    x_arr = x
    y_arr = y
    valid_values = x_arr.notnull() & y_arr.notnull()
    valid_count = valid_values.sum(dim)

    demeaned_x = (x_arr - x_arr.mean(dim)).fillna(0)
    demeaned_y = (y_arr - y_arr.mean(dim)).fillna(0)
    
    return xr.dot(demeaned_x, demeaned_y, dims=dim) / valid_count

def correlation(x, y, dim=None):
    # dim should default to the intersection of x.dims and y.dims
    return covariance(x, y, dim) / (x.std(dim) * y.std(dim))

In [5]:
def color_negative_red(value):

    if abs(value) < 0.25:
        color = 'red'
    elif abs(value) > 0.5:
        color = 'green'
    else:
        color = 'black'

    return 'color: %s' % color

In [6]:
u_wind_10m = xr.open_mfdataset(f'{DATADIR}10m_u_component_of_wind/*.nc', combine='by_coords')
v_wind_10m = xr.open_mfdataset(f'{DATADIR}10m_v_component_of_wind/*.nc', combine='by_coords')
u_wind = xr.open_mfdataset(f'{DATADIR}u_component_of_wind/*.nc', combine='by_coords')
v_wind = xr.open_mfdataset(f'{DATADIR}v_component_of_wind/*.nc', combine='by_coords')
temp_2m = xr.open_mfdataset(f'{DATADIR}2m_temperature/*.nc', combine='by_coords')
constants = xr.open_mfdataset(f'{DATADIR}constants/*.nc', combine='by_coords')
geo_500 = xr.open_mfdataset(f'{DATADIR}geopotential_500/*.nc', combine='by_coords')
pot_vort = xr.open_mfdataset(f'{DATADIR}potential_vorticity/*.nc', combine='by_coords')
rel_hum = xr.open_mfdataset(f'{DATADIR}relative_humidity/*.nc', combine='by_coords')
spec_hum = xr.open_mfdataset(f'{DATADIR}specific_humidity/*.nc', combine='by_coords')
temp850 = xr.open_mfdataset(f'{DATADIR}temperature_850/*.nc', combine='by_coords')
solar = xr.open_mfdataset(f'{DATADIR}toa_incident_solar_radiation/*.nc', combine='by_coords')
cloud_cover = xr.open_mfdataset(f'{DATADIR}total_cloud_cover/*.nc', combine='by_coords')
precip = xr.open_mfdataset(f'{DATADIR}total_precipitation/*.nc', combine='by_coords')
vort = xr.open_mfdataset(f'{DATADIR}vorticity/*.nc', combine='by_coords')

In [7]:
geo_500_north = geo_500.sel(lat=slice(0, 90))
geo_500_south = geo_500.sel(lat=slice(-90, 0))

t850_north = temp850.sel(lat=slice(0, 90))
t850_south = temp850.sel(lat=slice(-90, 0))

In [16]:
variables = [
             u_wind.u.sel(level=500), u_wind.u.sel(level=850), v_wind.v.sel(level=500), v_wind.v.sel(level=850), 
             u_wind_10m.u10, v_wind_10m.v10, temp_2m.t2m, geo_500.z, pot_vort.pv.sel(level = 500), pot_vort.pv.sel(level=850),
             rel_hum.r.sel(level = 500), rel_hum.r.sel(level=850), spec_hum.q.sel(level=500),
             spec_hum.q.sel(level=850), temp850.t, solar.tisr, cloud_cover.tcc, precip.tp, 
             vort.vo.sel(level=500), vort.vo.sel(level=850)
            ]
variables_str = [
             'u_wind_500', 'v_wind_500', 'u_wind_850', 'v_wind_850', 'u_wind_10m', 'v_wind_10m', 
             'temp_2m', 'geo_500', 'pot_vort_500', 'pot_vort_850',
             'rel_hum_500', 'rel_hum_850', 'spec_hum_500',
             'spec_hum_850', 'temp850', 'solar', 'cloud_cover', 'precip', 
             'vort_500', 'vort_850'
            ]

def correlation_variables_table(data_list, y, lead_time, north = False, south = False):
    correlation_list = []
    
    y_corr = y.sel(time = slice(y.time[lead_time], None))
    
    for i in variables:
        if north:
            i_known = i.sel(lat = slice(0, 90), time = slice(None, i.time[-(lead_time +1)])) 
        elif south:
            i_known = i.sel(lat = slice(-90, 0), time = slice(None, i.time[-(lead_time +1)])) 
        else:
            i_known = i.sel(time = slice(None, i.time[-(lead_time +1)])) 
        correlation_list.append(correlation(i_known, y_corr).values)    

    return correlation_list


In [19]:
temp850_corr = correlation_variables_table(variables, temp850.t, lead_time = 72)
#temp850_corr_north = correlation_variables_table(variables, t850_north.t, lead_time = 72, north = True)
#temp850_corr_south = correlation_variables_table(variables, t850_south.t, lead_time = 72, south = True)

print('temp done')

geo500_corr = correlation_variables_table(variables, geo_500.z, lead_time = 72)
#geo500_corr_north = correlation_variables_table(variables, geo_500_north.z, lead_time = 0, north = True)
#geo500_corr_south = correlation_variables_table(variables, geo_500_south.z, lead_time = 0, south = True)

temp done


In [20]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(temp850_corr, columns = ['Corr_tmp_850 full'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_tmp_850 full'])

Unnamed: 0,Variable names,Corr_tmp_850 full
0,u_wind_500,-0.059584
1,v_wind_500,-0.100996
2,u_wind_850,-0.015567
3,v_wind_850,-0.042132
4,u_wind_10m,-0.135853
5,v_wind_10m,-0.050571
6,temp_2m,0.946909
7,geo_500,0.934707
8,pot_vort_500,0.150038
9,pot_vort_850,0.231432


In [21]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(geo500_corr, columns = ['Corr_geo_500 full'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_geo_500 full'])

Unnamed: 0,Variable names,Corr_geo_500 full
0,u_wind_500,-0.067465
1,v_wind_500,-0.147559
2,u_wind_850,-0.001903
3,v_wind_850,-0.029064
4,u_wind_10m,-0.184053
5,v_wind_10m,-0.032521
6,temp_2m,0.881828
7,geo_500,1.000012
8,pot_vort_500,0.202548
9,pot_vort_850,0.219346


In [22]:
temp850_corr = correlation_variables_table(variables, temp850.t, lead_time = 120)

print('temp done')

geo500_corr = correlation_variables_table(variables, geo_500.z, lead_time = 120)

temp done


In [23]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(temp850_corr, columns = ['Corr_tmp_850 full'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_tmp_850 full'])

Unnamed: 0,Variable names,Corr_tmp_850 full
0,u_wind_500,-0.059588
1,v_wind_500,-0.100973
2,u_wind_850,-0.015579
3,v_wind_850,-0.042146
4,u_wind_10m,-0.135824
5,v_wind_10m,-0.050574
6,temp_2m,0.946927
7,geo_500,0.934722
8,pot_vort_500,0.15014
9,pot_vort_850,0.231494


In [24]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(geo500_corr, columns = ['Corr_geo_500 full'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_geo_500 full'])

Unnamed: 0,Variable names,Corr_geo_500 full
0,u_wind_500,-0.067485
1,v_wind_500,-0.147548
2,u_wind_850,-0.001903
3,v_wind_850,-0.029069
4,u_wind_10m,-0.184033
5,v_wind_10m,-0.032517
6,temp_2m,0.881841
7,geo_500,1.000023
8,pot_vort_500,0.202622
9,pot_vort_850,0.219394


In [10]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(temp850_corr, columns = ['Corr_tmp_850 full']), pd.DataFrame(temp850_corr_north, columns = ['Corr_tmp_850 north']), pd.DataFrame(temp850_corr_south, columns = ['Corr_tmp_850 south'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_tmp_850 full', 'Corr_tmp_850 north', 'Corr_tmp_850 south'])

Unnamed: 0,Variable names,Corr_tmp_850 full,Corr_tmp_850 north,Corr_tmp_850 south
0,u_wind,-0.135888,-0.177559,-0.102267
1,v_wind,-0.050567,0.035523,-0.091267
2,temp_2m,0.946879,0.938749,0.955072
3,geo_500,0.934679,0.948019,0.928605
4,pot_vort_500,0.149889,-0.493474,0.561674
5,pot_vort_850,0.231328,-0.133179,0.349039
6,rel_hum_500,-0.412299,-0.394094,-0.442111
7,rel_hum_850,-0.351307,-0.305744,-0.377034
8,spec_hum_500,0.535203,0.550061,0.515343
9,spec_hum_850,0.760842,0.746082,0.776319


In [11]:
df = pd.concat([pd.DataFrame(variables_str, columns = ['Variable names']), pd.DataFrame(geo500_corr, columns = ['Corr_geo_500 full']), pd.DataFrame(geo500_corr_north, columns = ['Corr_geo_500 north']), pd.DataFrame(geo500_corr_south, columns = ['Corr_geo_500 south'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr_geo_500 full', 'Corr_geo_500 north', 'Corr_geo_500 south'])

Unnamed: 0,Variable names,Corr_geo_500 full,Corr_geo_500 north,Corr_geo_500 south
0,u_wind,-0.184077,-0.212922,-0.160877
1,v_wind,-0.032529,0.012091,-0.034199
2,temp_2m,0.881812,0.896621,0.871599
3,geo_500,1.0,1.0,1.0
4,pot_vort_500,0.20243,-0.523387,0.573379
5,pot_vort_850,0.219269,-0.120584,0.288803
6,rel_hum_500,-0.459123,-0.438679,-0.499901
7,rel_hum_850,-0.338311,-0.298311,-0.355875
8,spec_hum_500,0.504809,0.525167,0.490178
9,spec_hum_850,0.735187,0.727431,0.756699


In [12]:
constant_variables_str = []
corr_z500_const = []
corr_z500_north_const = []
corr_z500_south_const = []

constant_variables_str.append('orography')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_z500_north_const.append(np.mean([correlation(constants.orography.sel(lat = slice(0,90)), geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.orography, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.orography.sel(lat = slice(-90, 0)), geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


constant_variables_str.append('lsm')
#corr_tmp_850_const.append(np.mean([correlation(constants.lsm, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_z500_north_const.append(np.mean([correlation(constants.lsm.sel(lat = slice(0,90)), geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.lsm, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.lsm.sel(lat = slice(-90, 0)), geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


constant_variables_str.append('slt')
#corr_tmp_850_const.append(np.mean([correlation(constants.slt, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_z500_north_const.append(np.mean([correlation(constants.slt.sel(lat = slice(0,90)), geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.slt, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.slt.sel(lat = slice(-90, 0)), geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


constant_variables_str.append('lat')
corr_z500_north_const.append(np.mean([correlation(constants.lat2d.sel(lat = slice(0,90)), geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.lat2d, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.lat2d.sel(lat = slice(-90,0)), geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))

constant_variables_str.append('lon')
corr_z500_north_const.append(np.mean([correlation(constants.lon2d.sel(lat = slice(0,90)), geo_500_north.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_north.z)/100))]))
corr_z500_const.append(np.mean([correlation(constants.lon2d, geo_500.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500.z)/100))]))
corr_z500_south_const.append(np.mean([correlation(constants.lon2d.sel(lat = slice(-90, 0)), geo_500_south.z.isel(time = np.int(100*i))).values for i in range(np.int(len(geo_500_south.z)/100))]))


#df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']), pd.DataFrame(corr_tmp_850_const, columns = ['Corr tmp_850']), pd.DataFrame(corr_z500_const, columns = ['Corr z500'])], axis = 1)

df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_z500_const, columns = ['Corr z500'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr z500'])


Unnamed: 0,Variable names,Corr z500
0,orography,-0.325298
1,lsm,-0.189865
2,slt,0.047839
3,lat,0.236026
4,lon,0.006956


In [13]:
df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_z500_const, columns = ['Corr z500']), pd.DataFrame(corr_z500_north_const, columns = ['Corr z500 north']), pd.DataFrame(corr_z500_south_const, columns = ['Corr z500 south'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr z500', 'Corr z500 north', 'Corr z500 south'])

Unnamed: 0,Variable names,Corr z500,Corr z500 north,Corr z500 south
0,orography,-0.325298,0.049917,-0.475823
1,lsm,-0.189865,-0.010477,-0.364817
2,slt,0.047839,0.045921,-0.014253
3,lat,0.236026,-0.895613,0.928474
4,lon,0.006956,-0.007477,0.017626


In [14]:
constant_variables_str = []
corr_t850_const = []
corr_t850_north_const = []
corr_t850_south_const = []

constant_variables_str.append('orography')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_north_const.append(np.mean([correlation(constants.orography.sel(lat = slice(0,90)), t850_north.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_north.t)/100))]))
corr_t850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_south_const.append(np.mean([correlation(constants.orography.sel(lat = slice(-90,0)), t850_south.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_south.t)/100))]))


constant_variables_str.append('lsm')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_north_const.append(np.mean([correlation(constants.lsm.sel(lat = slice(0,90)), t850_north.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_north.t)/100))]))
corr_t850_const.append(np.mean([correlation(constants.lsm, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_south_const.append(np.mean([correlation(constants.lsm.sel(lat = slice(-90,0)), t850_south.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_south.t)/100))]))


constant_variables_str.append('slt')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_north_const.append(np.mean([correlation(constants.slt.sel(lat = slice(0,90)), t850_north.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_north.t)/100))]))
corr_t850_const.append(np.mean([correlation(constants.slt, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_south_const.append(np.mean([correlation(constants.slt.sel(lat = slice(-90,0)), t850_south.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_south.t)/100))]))

constant_variables_str.append('lat')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_north_const.append(np.mean([correlation(constants.lat2d.sel(lat = slice(0,90)), t850_north.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_north.t)/100))]))
corr_t850_const.append(np.mean([correlation(constants.lat2d, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_south_const.append(np.mean([correlation(constants.lat2d.sel(lat = slice(-90,0)), t850_south.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_south.t)/100))]))

constant_variables_str.append('lon')
#corr_tmp_850_const.append(np.mean([correlation(constants.orography, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_north_const.append(np.mean([correlation(constants.lon2d.sel(lat = slice(0,90)), t850_north.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_north.t)/100))]))
corr_t850_const.append(np.mean([correlation(constants.lon2d, temp850.t.isel(time = np.int(100*i))).values for i in range(np.int(len(temp850.t)/100))]))
corr_t850_south_const.append(np.mean([correlation(constants.lon2d.sel(lat = slice(-90,0)), t850_south.t.isel(time = np.int(100*i))).values for i in range(np.int(len(t850_south.t)/100))]))

df2 = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_t850_const, columns = ['Corr t850']), pd.DataFrame(corr_t850_north_const, columns = ['Corr t850 north']), pd.DataFrame(corr_t850_south_const, columns = ['Corr t850 south'])], axis = 1)

df2.style.applymap(color_negative_red, subset=['Corr t850', 'Corr t850 north', 'Corr t850 south'])


Unnamed: 0,Variable names,Corr t850,Corr t850 north,Corr t850 south
0,orography,-0.385886,0.084928,-0.627422
1,lsm,-0.214283,0.037092,-0.451531
2,slt,0.04049,0.085191,-0.046446
3,lat,0.167379,-0.895283,0.940024
4,lon,-0.001363,-0.06448,0.041545


In [1]:
def correlation_table(data1, data2_level, lead_time):

    data1_correct = data1.sel(time = slice(data1.time[lead_time], None))
    data2_known = data2_level.sel(time = slice(None, data2_level.time[-(lead_time +1)]))
    
    level_values = data2_known.level.values
    
    level_sets = [data2_known.sel(level = i) for i in level_values]
    
    corr = [correlation(i, data1_correct).values for i in level_sets]

    df = pd.concat([pd.DataFrame(level_values, columns = ['Levels']), pd.DataFrame(corr, columns = ['Correlation_' + str(lead_time)])], axis = 1)
    
    return level_values, corr, df

In [9]:
level_values_temp_0, corr_temp_0, df_temp_0 = correlation_table(temp850.t, pot_vort.pv, 0)

In [13]:
level_north, corr_temp_north, df_temp_north = correlation_table(t850_north.t, pot_vort.pv, 0)

In [14]:
level_south, corr_temp_south, df_temp_south = correlation_table(t850_south.t, pot_vort.pv, 0)

In [19]:
df = pd.concat([df_temp_0, df_temp_north.drop('Levels', axis = 1), df_temp_south.drop('Levels', axis =1)], axis =1)
df

Unnamed: 0,Levels,Correlation_0,Correlation_0.1,Correlation_0.2
0,50,0.181532,-0.164776,0.240234
1,100,0.155364,-0.262731,0.270999
2,150,0.140742,-0.326759,0.299715
3,200,0.111629,-0.36358,0.297244
4,250,0.071633,-0.48056,0.344711
5,300,0.053479,-0.49623,0.333421
6,400,0.085946,-0.364737,0.287475
7,500,0.149889,-0.239135,0.295109
8,600,0.15961,-0.039953,0.316152
9,700,0.247689,-0.032645,0.51006


In [23]:
level_geo_0, corr_geo_0, df_geo_0 = correlation_table(geo_500.z, pot_vort.pv, 0)
level_geo_north, corr_geo_north, df_geo_north = correlation_table(geo_500_north.z, pot_vort.pv, 0)
level_geo_south, corr_geo_south, df_geo_south = correlation_table(geo_500_south.z, pot_vort.pv, 0)

In [24]:
df = pd.concat([df_geo_0, df_geo_north.drop('Levels', axis = 1), df_geo_south.drop('Levels', axis =1)], axis =1)
df

Unnamed: 0,Levels,Correlation_0,Correlation_0.1,Correlation_0.2
0,50,0.2522,-0.174027,0.272803
1,100,0.237994,-0.272077,0.323329
2,150,0.224988,-0.338581,0.350332
3,200,0.196488,-0.389472,0.362647
4,250,0.154033,-0.534518,0.432857
5,300,0.130113,-0.560919,0.431081
6,400,0.14587,-0.412442,0.341333
7,500,0.20243,-0.265711,0.305473
8,600,0.154837,-0.039979,0.23898
9,700,0.219506,-0.028493,0.358381
