In [1]:
import xarray as xr

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
DATADIR = '/rds/general/user/mc4117/home/WeatherBench/data/'  

In [3]:
temp = xr.open_mfdataset(f'{DATADIR}temperature/*.nc', combine='by_coords')
temp850 = xr.open_mfdataset(f'{DATADIR}temperature_850/*.nc', combine='by_coords')
spec_humid = xr.open_mfdataset(f'{DATADIR}specific_humidity/*.nc', combine='by_coords')
geo = xr.open_mfdataset(f'{DATADIR}geopotential/*.nc', combine='by_coords')
geo500 = xr.open_mfdataset(f'{DATADIR}geopotential_500/*.nc', combine='by_coords')

In [4]:
def covariance(x, y, dim=None):
    valid_values = x.notnull() & y.notnull()
    valid_count = valid_values.sum(dim)

    demeaned_x = (x - x.mean(dim)).fillna(0)
    demeaned_y = (y - y.mean(dim)).fillna(0)
    
    return xr.dot(demeaned_x, demeaned_y, dims=dim) / valid_count

def correlation(x, y, dim=None):
    # dim should default to the intersection of x.dims and y.dims
    return covariance(x, y, dim) / (x.std(dim) * y.std(dim))

In [29]:
temp_3days_correct = temp.sel(time = slice(temp.time[72], None))
temp_3days_known = temp.sel(time = slice(None, temp.time[-73]))

correlation(temp_3days_known.t.sel(level = 850), temp_3days_correct.t.sel(level = 850)).values

array(1.00002487)

## Correlation to tmp850

In [7]:
level_values = temp.level.values

temp_level_sets = [temp.t.sel(level = i) for i in level_values]

temp_corr = [correlation(i, temp850.t).values for i in temp_level_sets]

In [8]:
pd.concat([pd.DataFrame(temp.level.values, columns = ['Levels']), pd.DataFrame(temp_corr, columns = ['Temperature'])], axis = 1)

Unnamed: 0,Levels,Temperature
0,50,-0.034967
1,100,-0.458015
2,150,-0.284129
3,200,0.269933
4,250,0.762717
5,300,0.908472
6,400,0.94327
7,500,0.95395
8,600,0.963933
9,700,0.979782


In [9]:
level_values = spec_humid.level.values

spec_humid_sets = [spec_humid.q.sel(level = i) for i in level_values]

spec_hum_corr = [correlation(i, temp850.t).values for i in spec_humid_sets]

In [10]:
pd.concat([pd.DataFrame(spec_humid.level.values, columns=['Levels']), pd.DataFrame(spec_hum_corr, columns=['Specific humidity'])], axis = 1)

Unnamed: 0,Levels,Specific humidity
0,50,0.080888
1,100,0.210104
2,150,0.648467
3,200,0.619775
4,250,0.577071
5,300,0.547593
6,400,0.515883
7,500,0.535203
8,600,0.602929
9,700,0.671112


In [11]:
level_values = geo.level.values

geo_sets = [geo.z.sel(level = i) for i in level_values]

geo_corr = [correlation(i, temp850.t).values for i in geo_sets]

In [12]:
pd.concat([pd.DataFrame(geo.level.values, columns = ['levels']), pd.DataFrame(geo_corr, columns = ['Geopotential'])], axis = 1)

Unnamed: 0,levels,Geopotential
0,50,0.727277
1,100,0.902568
2,150,0.944963
3,200,0.951948
4,250,0.952404
5,300,0.950823
6,400,0.945276
7,500,0.93468
8,600,0.915054
9,700,0.87698


## Correlation to z500

In [14]:
level_values = temp.level.values

temp_level_sets = [temp.t.sel(level = i) for i in level_values]

temp_corr_geo = [correlation(i, geo500.z).values for i in temp_level_sets]

pd.concat([pd.DataFrame(temp.level.values, columns = ['Levels']), pd.DataFrame(temp_corr_geo, columns = ['Temperature'])], axis = 1)

Unnamed: 0,Levels,Temperature
0,50,-0.079411
1,100,-0.486386
2,150,-0.330765
3,200,0.201631
4,250,0.720894
5,300,0.894456
6,400,0.943129
7,500,0.953557
8,600,0.955405
9,700,0.946261


In [15]:
level_values = spec_humid.level.values

spec_humid_sets = [spec_humid.q.sel(level = i) for i in level_values]

spec_hum_corr_geo = [correlation(i, geo500.z).values for i in spec_humid_sets]

pd.concat([pd.DataFrame(spec_humid.level.values, columns=['Levels']), pd.DataFrame(spec_hum_corr_geo, columns=['Specific humidity'])], axis = 1)

Unnamed: 0,Levels,Specific humidity
0,50,0.077275
1,100,0.233139
2,150,0.645022
3,200,0.612807
4,250,0.568087
5,300,0.536333
6,400,0.498808
7,500,0.504809
8,600,0.556295
9,700,0.614486


In [None]:
level_values = geo.level.values

geo_sets = [geo.z.sel(level = i) for i in level_values]

geo_corr_geo = [correlation(i, geo500.z).values for i in geo_sets]

pd.concat([pd.DataFrame(geo.level.values, columns = ['levels']), pd.DataFrame(geo_corr_geo, columns = ['Geopotential'])], axis = 1)