For the important variables, the correlation increases as the number of days increases.

In [22]:
import xarray as xr

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [23]:
# Root directory of the WeatherBench data; later cells append
# '<variable>/*.nc' to this prefix, so it must end with a slash.
DATADIR = '/Volumes/mc4117/home/WeatherBench/data/'  

In [24]:
! ls $DATADIR

10m_u_component_of_wind      temperature
10m_v_component_of_wind      temperature_850
2m_temperature               toa_incident_solar_radiation
constants                    total_cloud_cover
geopotential                 total_precipitation
geopotential_500             u_component_of_wind
potential_vorticity          v_component_of_wind
relative_humidity            vorticity
specific_humidity


In [4]:
def covariance(x, y, dim=None):
    """Return the covariance of ``x`` and ``y`` over ``dim``, ignoring NaNs pairwise.

    Parameters
    ----------
    x, y : xarray.DataArray
        Fields to compare. They are combined with xarray arithmetic, so they
        are aligned on their shared coordinate labels before anything else.
    dim : str or sequence of str, optional
        Dimension(s) to reduce over. ``None`` is passed straight through to
        the xarray reductions.

    Returns
    -------
    xarray.DataArray
        Sum of products of the de-meaned fields divided by the number of
        jointly valid samples (i.e. no degrees-of-freedom correction). The
        result is NaN/inf wherever that count is zero.
    """
    valid_values = x.notnull() & y.notnull()
    valid_count = valid_values.sum(dim)

    # Keep only samples that are valid in BOTH inputs before taking the means;
    # otherwise a point missing in y would still shift the mean of x (and vice
    # versa) and bias the covariance.
    x_valid = x.where(valid_values)
    y_valid = y.where(valid_values)

    # Masked samples become 0 after de-meaning so they add nothing to the sum.
    demeaned_x = (x_valid - x_valid.mean(dim)).fillna(0)
    demeaned_y = (y_valid - y_valid.mean(dim)).fillna(0)

    # NOTE(review): recent xarray releases spell this keyword ``dim``; confirm
    # against the installed version before renaming.
    return xr.dot(demeaned_x, demeaned_y, dims=dim) / valid_count

def correlation(x, y, dim=None):
    """Return the Pearson correlation of ``x`` and ``y`` over ``dim``.

    Parameters
    ----------
    x, y : xarray.DataArray
        Fields to correlate; aligned on shared coordinate labels.
    dim : str or sequence of str, optional
        Dimension(s) to reduce over, forwarded unchanged to ``covariance`` and
        to the standard-deviation reductions.
        TODO: dim should default to the intersection of x.dims and y.dims.

    Returns
    -------
    xarray.DataArray
        ``covariance(x, y) / (std(x) * std(y))`` evaluated on the samples that
        are valid in both inputs.
    """
    # Restrict both inputs to the jointly valid (and jointly aligned) samples
    # so the standard deviations describe exactly the data the covariance is
    # computed from. Without this, std() would be taken over each input's own
    # full extent, e.g. a global field correlated with a regional one.
    valid_values = x.notnull() & y.notnull()
    x_valid = x.where(valid_values)
    y_valid = y.where(valid_values)

    return covariance(x_valid, y_valid, dim) / (x_valid.std(dim) * y_valid.std(dim))

In [5]:
def color_negative_red(value):
    """Return a CSS colour rule reflecting the magnitude of ``value``.

    Magnitudes below 0.25 map to red, magnitudes above 0.5 map to green and
    everything else (including exactly 0.25 and 0.5) maps to black. Only the
    absolute value matters, despite the function's name.
    """
    magnitude = abs(value)

    if magnitude > 0.5:
        shade = 'green'
    elif magnitude < 0.25:
        shade = 'red'
    else:
        shade = 'black'

    return f'color: {shade}'

In [25]:
# Open every NetCDF file of the two target variables as one dataset each,
# letting xarray stitch the files together by their coordinates.
geo_500, temp850 = (
    xr.open_mfdataset(f'{DATADIR}{folder}/*.nc', combine='by_coords')
    for folder in ('geopotential_500', 'temperature_850')
)

In [26]:
# Restrict both target fields to the samples whose time label falls in 2018.
year = '2018'
temp_arr = temp850['t'].sel(time=year)
geo_arr = geo_500['z'].sel(time=year)

In [6]:
from scipy.stats import pearsonr

# Reference value: SciPy's Pearson correlation over every grid point and
# time step of the 2018 subset, treated as two flat sample vectors.
temp_samples = temp_arr.values.ravel()
geo_samples = geo_arr.values.ravel()
pearsonr(temp_samples, geo_samples)

(0.9323881005250092, 0.0)

In [29]:
# Latitude bands used for the hemisphere subsets.
# NOTE(review): label slices like these presumably rely on `lat` being stored
# in ascending order; confirm against the dataset.
northern_lats, southern_lats = slice(0, 90), slice(-90, 0)

geo_500_north = geo_500.sel(lat=northern_lats)
geo_500_south = geo_500.sel(lat=southern_lats)

t850_north = temp850.sel(lat=northern_lats)
t850_south = temp850.sel(lat=southern_lats)

In [None]:
# Candidate predictor fields keyed by the label shown in the result tables.
# Keeping label and data in one mapping guarantees they cannot drift apart:
# the previous parallel lists labelled u@850 as 'v_wind_500' and v@500 as
# 'u_wind_850'.
named_variables = {
    'u_wind_500': u_wind.u.sel(level=500),
    'u_wind_850': u_wind.u.sel(level=850),
    'v_wind_500': v_wind.v.sel(level=500),
    'v_wind_850': v_wind.v.sel(level=850),
    'u_wind_10m': u_wind_10m.u10,
    'v_wind_10m': v_wind_10m.v10,
    'temp_2m': temp_2m.t2m,
    'geo_500': geo_500.z,
    'pot_vort_500': pot_vort.pv.sel(level = 500),
    'pot_vort_850': pot_vort.pv.sel(level=850),
    'rel_hum_500': rel_hum.r.sel(level = 500),
    'rel_hum_850': rel_hum.r.sel(level=850),
    'spec_hum_500': spec_hum.q.sel(level=500),
    'spec_hum_850': spec_hum.q.sel(level=850),
    'temp850': temp850.t,
    'solar': solar.tisr,
    'cloud_cover': cloud_cover.tcc,
    'precip': precip.tp,
    'vort_500': vort.vo.sel(level=500),
    'vort_850': vort.vo.sel(level=850),
}

# Parallel lists kept for the cells below; both follow the mapping's order.
variables = list(named_variables.values())
variables_str = list(named_variables)

def correlation_variables_table(data_list, y, lead_time, north = False, south = False):
    """Correlate each field in ``data_list`` with ``y`` taken ``lead_time`` steps later.

    Parameters
    ----------
    data_list : iterable of xarray.DataArray
        Predictor fields, each with a ``time`` dimension.
    y : xarray.DataArray
        Target field. When ``north``/``south`` is set, pass a target that is
        already restricted to that band; only the predictors are cut here.
    lead_time : int
        Offset, in positions along ``time``, between a predictor sample and
        the target sample it is paired with.
    north, south : bool
        Restrict the predictors to ``lat`` in [0, 90] or [-90, 0]
        respectively. ``north`` wins if both are set.

    Returns
    -------
    list
        One correlation value (``.values`` of the reduced array) per entry of
        ``data_list``, in the same order.

    Raises
    ------
    IndexError
        If ``lead_time`` is negative or not smaller than the number of shared
        time steps.
    """
    if north:
        lat_band = slice(0, 90)
    elif south:
        lat_band = slice(-90, 0)
    else:
        lat_band = None

    correlation_list = []

    # Iterate the argument, not the notebook-global ``variables`` list.
    for predictor in data_list:
        if lat_band is not None:
            predictor = predictor.sel(lat = lat_band)

        # Put predictor and target on their shared coordinate labels so the
        # positional offset below pairs the intended samples.
        predictor, target = xr.align(predictor, y, join = 'inner')

        n_times = target.sizes['time']
        if not 0 <= lead_time < n_times:
            raise IndexError(
                f'lead_time {lead_time} is out of range for {n_times} time steps'
            )

        y_corr = target.isel(time = slice(lead_time, None))
        i_known = predictor.isel(time = slice(0, n_times - lead_time))

        # xarray arithmetic matches samples by time LABEL. Without relabelling,
        # the two offset series would be re-aligned onto their overlapping
        # timestamps and the lead time would silently collapse to zero.
        i_known = i_known.assign_coords(time = y_corr.time.values)

        correlation_list.append(correlation(i_known, y_corr).values)

    return correlation_list


In [None]:
# Correlate every candidate predictor with temp850.t using a lead of 72
# positions along the time axis.
temp850_corr = correlation_variables_table(variables, temp850.t, lead_time = 72)
# Hemisphere-restricted variants (disabled). Later table cells read
# temp850_corr_north / temp850_corr_south, which only exist if these run.
#temp850_corr_north = correlation_variables_table(variables, t850_north.t, lead_time = 72, north = True)
#temp850_corr_south = correlation_variables_table(variables, t850_south.t, lead_time = 72, south = True)

# Progress marker between the two computations.
print('temp done')

# Same computation with geo_500.z as the target.
geo500_corr = correlation_variables_table(variables, geo_500.z, lead_time = 72)
# NOTE(review): the disabled geo variants use lead_time = 0 rather than 72 --
# confirm which lead is intended before re-enabling them.
#geo500_corr_north = correlation_variables_table(variables, geo_500_north.z, lead_time = 0, north = True)
#geo500_corr_south = correlation_variables_table(variables, geo_500_south.z, lead_time = 0, south = True)

In [None]:
# Two-column table: predictor label next to its correlation with temp850.
name_column = pd.DataFrame(variables_str, columns=['Variable names'])
corr_column = pd.DataFrame(temp850_corr, columns=['Corr_tmp_850 full'])
df = pd.concat([name_column, corr_column], axis=1)

# Colour each correlation by magnitude when the table is rendered.
df.style.applymap(color_negative_red, subset=['Corr_tmp_850 full'])

In [None]:
# Two-column table: predictor label next to its correlation with geo_500.
name_column = pd.DataFrame(variables_str, columns=['Variable names'])
corr_column = pd.DataFrame(geo500_corr, columns=['Corr_geo_500 full'])
df = pd.concat([name_column, corr_column], axis=1)

# Colour each correlation by magnitude when the table is rendered.
df.style.applymap(color_negative_red, subset=['Corr_geo_500 full'])

In [None]:
# Repeat the predictor/target correlations with a lead of 120 positions along
# the time axis; this overwrites the results of the previous run.
temp850_corr = correlation_variables_table(variables, temp850.t, lead_time = 120)

# Progress marker between the two computations.
print('temp done')

geo500_corr = correlation_variables_table(variables, geo_500.z, lead_time = 120)

In [None]:
# Rebuild the temp850 table from whatever temp850_corr currently holds.
name_column = pd.DataFrame(variables_str, columns=['Variable names'])
corr_column = pd.DataFrame(temp850_corr, columns=['Corr_tmp_850 full'])
df = pd.concat([name_column, corr_column], axis=1)

# Colour each correlation by magnitude when the table is rendered.
df.style.applymap(color_negative_red, subset=['Corr_tmp_850 full'])

In [None]:
# Rebuild the geo_500 table from whatever geo500_corr currently holds.
name_column = pd.DataFrame(variables_str, columns=['Variable names'])
corr_column = pd.DataFrame(geo500_corr, columns=['Corr_geo_500 full'])
df = pd.concat([name_column, corr_column], axis=1)

# Colour each correlation by magnitude when the table is rendered.
df.style.applymap(color_negative_red, subset=['Corr_geo_500 full'])

In [None]:
# Side-by-side temp850 correlations: full grid, northern and southern subsets.
# Requires temp850_corr_north / temp850_corr_south to exist already; the
# visible cell that would compute them has those lines commented out.
table_parts = [
    pd.DataFrame(variables_str, columns=['Variable names']),
    pd.DataFrame(temp850_corr, columns=['Corr_tmp_850 full']),
    pd.DataFrame(temp850_corr_north, columns=['Corr_tmp_850 north']),
    pd.DataFrame(temp850_corr_south, columns=['Corr_tmp_850 south']),
]
df = pd.concat(table_parts, axis=1)

# Colour all three correlation columns by magnitude.
highlighted = ['Corr_tmp_850 full', 'Corr_tmp_850 north', 'Corr_tmp_850 south']
df.style.applymap(color_negative_red, subset=highlighted)

In [None]:
# Side-by-side geo_500 correlations: full grid, northern and southern subsets.
# Requires geo500_corr_north / geo500_corr_south to exist already; the
# visible cell that would compute them has those lines commented out.
table_parts = [
    pd.DataFrame(variables_str, columns=['Variable names']),
    pd.DataFrame(geo500_corr, columns=['Corr_geo_500 full']),
    pd.DataFrame(geo500_corr_north, columns=['Corr_geo_500 north']),
    pd.DataFrame(geo500_corr_south, columns=['Corr_geo_500 south']),
]
df = pd.concat(table_parts, axis=1)

# Colour all three correlation columns by magnitude.
highlighted = ['Corr_geo_500 full', 'Corr_geo_500 north', 'Corr_geo_500 south']
df.style.applymap(color_negative_red, subset=highlighted)

In [None]:
# Correlation between each time-invariant field and z500, averaged over
# snapshots taken every SAMPLE_STRIDE-th time step (keeps the cost manageable).
SAMPLE_STRIDE = 100

# Table label -> time-invariant field, in display order.
constant_fields = {
    'orography': constants.orography,
    'lsm': constants.lsm,
    'slt': constants.slt,
    'lat': constants.lat2d,
    'lon': constants.lon2d,
}

constant_variables_str = []
corr_z500_const = []
corr_z500_north_const = []
corr_z500_south_const = []

# (result list, latitude band applied to the constant field, matching z500 field)
z500_regions = [
    (corr_z500_const, None, geo_500.z),
    (corr_z500_north_const, slice(0, 90), geo_500_north.z),
    (corr_z500_south_const, slice(-90, 0), geo_500_south.z),
]

for const_name, const_field in constant_fields.items():
    constant_variables_str.append(const_name)
    for region_corr, lat_band, z500_field in z500_regions:
        const_region = const_field if lat_band is None else const_field.sel(lat = lat_band)
        # Plain integer arithmetic replaces the ``np.int`` alias, which no
        # longer exists in current NumPy releases.
        n_snapshots = len(z500_field) // SAMPLE_STRIDE
        snapshot_corr = [
            correlation(const_region, z500_field.isel(time = SAMPLE_STRIDE * k)).values
            for k in range(n_snapshots)
        ]
        region_corr.append(np.mean(snapshot_corr))

df = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_z500_const, columns = ['Corr z500'])], axis = 1)

df.style.applymap(color_negative_red, subset=['Corr z500'])


In [None]:
# Constant-field correlations with z500 for the full grid and both hemispheres.
table_parts = [
    pd.DataFrame(constant_variables_str, columns=['Variable names']),
    pd.DataFrame(corr_z500_const, columns=['Corr z500']),
    pd.DataFrame(corr_z500_north_const, columns=['Corr z500 north']),
    pd.DataFrame(corr_z500_south_const, columns=['Corr z500 south']),
]
df = pd.concat(table_parts, axis=1)

# Colour all three correlation columns by magnitude.
highlighted = ['Corr z500', 'Corr z500 north', 'Corr z500 south']
df.style.applymap(color_negative_red, subset=highlighted)

In [None]:
# Correlation between each time-invariant field and t850, averaged over
# snapshots taken every SAMPLE_STRIDE-th time step (keeps the cost manageable).
SAMPLE_STRIDE = 100

# Table label -> time-invariant field, in display order.
constant_fields = {
    'orography': constants.orography,
    'lsm': constants.lsm,
    'slt': constants.slt,
    'lat': constants.lat2d,
    'lon': constants.lon2d,
}

constant_variables_str = []
corr_t850_const = []
corr_t850_north_const = []
corr_t850_south_const = []

# (result list, latitude band applied to the constant field, matching t850 field)
t850_regions = [
    (corr_t850_const, None, temp850.t),
    (corr_t850_north_const, slice(0, 90), t850_north.t),
    (corr_t850_south_const, slice(-90, 0), t850_south.t),
]

for const_name, const_field in constant_fields.items():
    constant_variables_str.append(const_name)
    for region_corr, lat_band, t850_field in t850_regions:
        const_region = const_field if lat_band is None else const_field.sel(lat = lat_band)
        # Plain integer arithmetic replaces the ``np.int`` alias, which no
        # longer exists in current NumPy releases.
        n_snapshots = len(t850_field) // SAMPLE_STRIDE
        snapshot_corr = [
            correlation(const_region, t850_field.isel(time = SAMPLE_STRIDE * k)).values
            for k in range(n_snapshots)
        ]
        region_corr.append(np.mean(snapshot_corr))

df2 = pd.concat([pd.DataFrame(constant_variables_str, columns = ['Variable names']),pd.DataFrame(corr_t850_const, columns = ['Corr t850']), pd.DataFrame(corr_t850_north_const, columns = ['Corr t850 north']), pd.DataFrame(corr_t850_south_const, columns = ['Corr t850 south'])], axis = 1)

df2.style.applymap(color_negative_red, subset=['Corr t850', 'Corr t850 north', 'Corr t850 south'])


In [None]:
def correlation_table(data1, data2_level, lead_time):
    """Correlate ``data1`` with every level of ``data2_level`` at a given lead.

    Parameters
    ----------
    data1 : xarray.DataArray
        Target field with a ``time`` dimension.
    data2_level : xarray.DataArray
        Predictor field with ``time`` and ``level`` dimensions.
    lead_time : int
        Offset, in positions along ``time``, between a predictor sample and
        the target sample it is paired with.

    Returns
    -------
    level_values : numpy.ndarray
        Values of the predictor's ``level`` coordinate.
    corr : list
        One correlation value per level, in the order of ``level_values``.
    df : pandas.DataFrame
        Columns ``Levels`` and ``Correlation_<lead_time>``.

    Raises
    ------
    IndexError
        If ``lead_time`` is negative or not smaller than the number of shared
        time steps.
    """
    # Put target and predictor on their shared coordinate labels so the
    # positional offset below pairs the intended samples.
    target, predictor = xr.align(data1, data2_level, join = 'inner')

    n_times = target.sizes['time']
    if not 0 <= lead_time < n_times:
        raise IndexError(
            f'lead_time {lead_time} is out of range for {n_times} time steps'
        )

    data1_correct = target.isel(time = slice(lead_time, None))
    data2_known = predictor.isel(time = slice(0, n_times - lead_time))

    # xarray arithmetic matches samples by time LABEL. Without relabelling,
    # the two offset series would be re-aligned onto their overlapping
    # timestamps and the lead time would silently collapse to zero.
    data2_known = data2_known.assign_coords(time = data1_correct.time.values)

    level_values = data2_known.level.values

    corr = [
        correlation(data2_known.sel(level = level), data1_correct).values
        for level in level_values
    ]

    df = pd.concat([pd.DataFrame(level_values, columns = ['Levels']), pd.DataFrame(corr, columns = ['Correlation_' + str(lead_time)])], axis = 1)

    return level_values, corr, df

In [None]:
# Per-level correlation of pot_vort.pv with temp850.t at zero lead.
level_values_temp_0, corr_temp_0, df_temp_0 = correlation_table(temp850.t, pot_vort.pv, 0)

In [None]:
# Same table against the northern subset t850_north.t; pot_vort.pv is passed
# unrestricted.
level_north, corr_temp_north, df_temp_north = correlation_table(t850_north.t, pot_vort.pv, 0)

In [None]:
# Same table against the southern subset t850_south.t; pot_vort.pv is passed
# unrestricted.
level_south, corr_temp_south, df_temp_south = correlation_table(t850_south.t, pot_vort.pv, 0)

In [None]:
# Combine the full-grid, northern and southern tables into one. Each input
# names its correlation column 'Correlation_0', so the hemisphere columns are
# relabelled to keep the combined column names unique and selectable.
north_corr = df_temp_north.drop('Levels', axis = 1)
north_corr.columns = [f'{column} north' for column in north_corr.columns]

south_corr = df_temp_south.drop('Levels', axis = 1)
south_corr.columns = [f'{column} south' for column in south_corr.columns]

df = pd.concat([df_temp_0, north_corr, south_corr], axis = 1)
df

In [None]:
# Per-level correlation of pot_vort.pv with geo_500.z at zero lead, for the
# full grid and for the northern / southern subsets of the target.
level_geo_0, corr_geo_0, df_geo_0 = correlation_table(geo_500.z, pot_vort.pv, 0)
level_geo_north, corr_geo_north, df_geo_north = correlation_table(geo_500_north.z, pot_vort.pv, 0)
level_geo_south, corr_geo_south, df_geo_south = correlation_table(geo_500_south.z, pot_vort.pv, 0)

In [None]:
# Combine the full-grid, northern and southern tables into one. Each input
# names its correlation column 'Correlation_0', so the hemisphere columns are
# relabelled to keep the combined column names unique and selectable.
north_corr = df_geo_north.drop('Levels', axis = 1)
north_corr.columns = [f'{column} north' for column in north_corr.columns]

south_corr = df_geo_south.drop('Levels', axis = 1)
south_corr.columns = [f'{column} south' for column in south_corr.columns]

df = pd.concat([df_geo_0, north_corr, south_corr], axis = 1)
df