# 相关分析和回归分析

相关分析与回归分析都是统计上研究变量之间关系的常用方法，用来判断两组变量之间的统计相关性。相关分析中两组变量的地位是平等的，而回归分析中两个变量的位置一般不能互换（分别为自变量和因变量）。

## 相关分析

实际上相关系数有很多的计算公式和定义，这里只聊聊最常用的皮尔逊相关。

![Image Name](https://cdn.kesci.com/upload/image/qzclzmt316.png?imageView2/0/w/960/h/960)


函数：

scipy.stats.pearsonr(x, y)

x,y分别为变量1和变量2

返回值为Pearson相关系数r 和 p-value

In [1]:
from scipy.stats import pearsonr
import numpy as np
# Two short integer series used to demonstrate scipy.stats.pearsonr.
a = np.array([4, 6, 8, 4, 1, 0, 3, 5, 9, 8, 5, 3, 6,
              7, 4, 9, 8, 3, 1, 5, 6, 4, 0, 1, 8])
b = np.array([9, 8, 5, 9, 3, 6, 2, 7, 7, 6, 3, 0, 1,
              4, 5, 8, 9, 6, 15, 3, 7, 8, 2, 4, 5])

# pearsonr returns the correlation coefficient first and its p-value second.
r, p = pearsonr(a, b)

# Report both numbers rounded to two decimals.
r_rounded = np.round(r, 2)
p_rounded = np.round(p, 2)
print('r={},p={}'.format(r_rounded, p_rounded))

r=0.11,p=0.6


In [1]:
import xarray as xr
from scipy.stats import pearsonr
import numpy as np
import datetime as dt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.mpl.ticker as cticker
import matplotlib.pyplot as plt

# --- Reference temperature series -------------------------------------------
f_t = xr.open_dataset('/home/mw/input/moyu1828/air.mon.mean.nc')
# Keep only the Dec/Jan/Feb time steps, then select 1979-12-01..2020-03-01 and
# the labels 850 / 50:30 / 110:130 on the remaining axes
# (presumably level, lat and lon in that order -- TODO confirm against the file).
djf_t = f_t.time.dt.month.isin([12,1,2])
air_box = f_t.air.loc[djf_t].loc['1979-12-01':'2020-03-01',850,50:30,110:130]
# Average over the two trailing axes, then over each consecutive group of
# three time steps, which leaves 41 values (assumes the months arrive in
# Dec, Jan, Feb order so that every group is one winter).
t_nea = np.array(air_box).mean((1,2)).reshape(41,3).mean((1))

# --- Geopotential height field ----------------------------------------------
f_z = xr.open_dataset('/home/mw/input/moyu1828/hgt.mon.mean.nc')
djf_z = f_z.time.dt.month.isin([12,1,2])
hgt_500 = f_z.hgt.loc[djf_z].loc['1979-12-01':'2020-03-01',500]
# Same three-step grouping as above, keeping the 73 x 144 grid.
z = np.array(hgt_500).reshape(41,3,73,144).mean((1))

lat = f_z.lat
lon = f_z.lon

print(t_nea.shape,z.shape)

# Correlation coefficient and p-value at every grid point.
r = np.zeros((73,144))
p = np.zeros((73,144))

# Pearson's r treats both inputs symmetrically, so the argument order does
# not matter here.
for i, j in np.ndindex(len(lat), len(lon)):
    r[i,j], p[i,j] = pearsonr(z[:,i,j], t_nea)


(41,) (41, 73, 144)


In [2]:
# Region to draw, in degrees: longitudes 0..180, latitudes 0..90.
leftlon, rightlon, lowerlat, upperlat = 0, 180, 0, 90
img_extent = [leftlon, rightlon, lowerlat, upperlat]

# The map itself is centred on 180 degrees; extent, ticks and data are all
# given in plain (0-centred) PlateCarree coordinates.
proj = ccrs.PlateCarree(central_longitude=180)
data_crs = ccrs.PlateCarree()

fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.6], projection=proj)
ax.set_extent(img_extent, crs=data_crs)
# Coastlines are left switched off, as in the original cell:
# ax.add_feature(cfeature.COASTLINE)

# Ticks every 60 degrees of longitude and every 30 degrees of latitude.
xtick_values = np.arange(leftlon, rightlon+60, 60)
ytick_values = np.arange(lowerlat, upperlat+30, 30)
ax.set_xticks(xtick_values, crs=data_crs)
ax.set_yticks(ytick_values, crs=data_crs)
lon_formatter = cticker.LongitudeFormatter()
lat_formatter = cticker.LatitudeFormatter()
ax.xaxis.set_major_formatter(lon_formatter)
ax.yaxis.set_major_formatter(lat_formatter)

# Shaded correlation coefficient in steps of 0.1.
r_levels = np.arange(-1,1.1,0.1)
c1 = ax.contourf(lon, lat, r, zorder=0, levels=r_levels, extend='both',
                 transform=data_crs, cmap=plt.cm.bwr)
# Transparent overlay: dotted hatching in the 0..0.1 band of the p-value,
# no hatching in the 0.1..1 band.
c1b = ax.contourf(lon, lat, p, levels=[0,0.1,1], zorder=1,
                  hatches=['...', None], colors="none", transform=data_crs)

# Horizontal colour bar in its own axes below the map.
position = fig.add_axes([0.3, 0.02,  0.35, 0.025])
fig.colorbar(c1, cax=position, orientation='horizontal', format='%.1f')
plt.show()

## 回归分析

相关分析是回归分析的基础和前提，相关分析需要依靠回归分析来表现变量之间数量相关的具体形式，而回归分析则需要依靠相关分析来表现变量之间数量变化的相关程度，只有当变量之间存在高度相关时，进行回归分析寻求其相关的具体形式才有意义。

In [None]:
y = kx+b

实现最小二乘回归的函数:

scipy.stats.linregress(x, y)

输入值：

自变量x和因变量y

返回值：

1. 回归斜率 slope

2. 回归截距 intercept

3. 相关系数 r

4. p-value

5. 在残差正态性假设下，估计斜率（梯度）的标准误差 stderr

In [3]:
import xarray as xr
# from scipy.stats import pearsonr
from scipy.stats import linregress
import numpy as np
import datetime as dt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.mpl.ticker as cticker
import matplotlib.pyplot as plt

# --- Reference temperature series (the regressor) ---------------------------
f_t = xr.open_dataset('/home/mw/input/moyu1828/air.mon.mean.nc')
# Keep only the Dec/Jan/Feb time steps, then select 1979-12-01..2020-03-01 and
# the labels 850 / 50:30 / 110:130 on the remaining axes
# (presumably level, lat and lon in that order -- TODO confirm against the file).
djf_t = f_t.time.dt.month.isin([12,1,2])
air_box = f_t.air.loc[djf_t].loc['1979-12-01':'2020-03-01',850,50:30,110:130]
# Average over the two trailing axes, then over each consecutive group of
# three time steps, which leaves 41 values (assumes the months arrive in
# Dec, Jan, Feb order so that every group is one winter).
t_nea = np.array(air_box).mean((1,2)).reshape(41,3).mean((1))

# --- Geopotential height field (the response) -------------------------------
f_z = xr.open_dataset('/home/mw/input/moyu1828/hgt.mon.mean.nc')
djf_z = f_z.time.dt.month.isin([12,1,2])
hgt_500 = f_z.hgt.loc[djf_z].loc['1979-12-01':'2020-03-01',500]
# Same three-step grouping as above, keeping the 73 x 144 grid.
z = np.array(hgt_500).reshape(41,3,73,144).mean((1))

lat = f_z.lat
lon = f_z.lon

print(t_nea.shape,z.shape)

# Slope, correlation coefficient and p-value at every grid point.
s = np.zeros((73,144))
r = np.zeros((73,144))
p = np.zeros((73,144))

# Unlike correlation, regression is not symmetric: t_nea is passed as x and
# the local height series as y, so the slope is "change in z per unit t_nea".
# Swapping the two arguments would give a different slope field.
for i, j in np.ndindex(len(lat), len(lon)):
    slope, _, rvalue, pvalue, _ = linregress(t_nea, z[:,i,j])
    s[i,j] = slope
    r[i,j] = rvalue
    p[i,j] = pvalue



(41,) (41, 73, 144)


In [4]:
# Region to draw, in degrees: longitudes 0..180, latitudes 0..90.
leftlon, rightlon, lowerlat, upperlat = 0, 180, 0, 90
img_extent = [leftlon, rightlon, lowerlat, upperlat]

# The map itself is centred on 180 degrees; extent, ticks and data are all
# given in plain (0-centred) PlateCarree coordinates.
proj = ccrs.PlateCarree(central_longitude=180)
data_crs = ccrs.PlateCarree()

fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.6], projection=proj)
ax.set_extent(img_extent, crs=data_crs)
# Coastlines are left switched off, as in the original cell:
# ax.add_feature(cfeature.COASTLINE.with_scale('50m'))

# Ticks every 60 degrees of longitude and every 30 degrees of latitude.
xtick_values = np.arange(leftlon, rightlon+60, 60)
ytick_values = np.arange(lowerlat, upperlat+30, 30)
ax.set_xticks(xtick_values, crs=data_crs)
ax.set_yticks(ytick_values, crs=data_crs)
lon_formatter = cticker.LongitudeFormatter()
lat_formatter = cticker.LatitudeFormatter()
ax.xaxis.set_major_formatter(lon_formatter)
ax.yaxis.set_major_formatter(lat_formatter)

# Shaded regression slope, contour levels -25..25 in steps of 5.
slope_levels = np.arange(-25,30,5)
c1 = ax.contourf(lon, lat, s, zorder=0, levels=slope_levels, extend='both',
                 transform=data_crs, cmap=plt.cm.bwr)
# Transparent overlay: dotted hatching in the 0..0.1 band of the p-value,
# no hatching in the 0.1..1 band.
c1b = ax.contourf(lon, lat, p, levels=[0,0.1,1], zorder=1,
                  hatches=['...', None], colors="none", transform=data_crs)

# Horizontal colour bar in its own axes below the map.
position = fig.add_axes([0.3, 0.02,  0.35, 0.025])
fig.colorbar(c1, cax=position, orientation='horizontal', format='%d')
plt.show()

## 相关关系和因果关系

我们通过回归和相关得到的结果都是统计相关性，也就是仅从数据本身得到的数学上的相关，这不能反映数据之间的因果关系和真实物理意义。因果关系包含相关关系，但是相关关系不能完全反映因果关系。

In [33]:
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt

# Two weakly correlated integer series (same values as the first example).
a = np.array([4, 6, 8, 4, 1, 0, 3, 5, 9, 8, 5, 3, 6,
              7, 4, 9, 8, 3, 1, 5, 6, 4, 0, 1, 8])
b = np.array([9, 8, 5, 9, 3, 6, 2, 7, 7, 6, 3, 0, 1,
              4, 5, 8, 9, 6, 15, 3, 7, 8, 2, 4, 5])

# Superimpose the same linear ramp 2*i + 1 on each series.
ramp_a = 2 * np.arange(a.shape[0]) + 1
ramp_b = 2 * np.arange(b.shape[0]) + 1
a1 = a + ramp_a
b1 = b + ramp_b

fig = plt.figure(figsize=(12,8))

# Left panel: the two original series.
ax = fig.add_axes([0.1, 0.1, 0.4, 0.3])
for series, colour, name in ((a, 'k', 'a'), (b, 'r', 'b')):
    ax.plot(np.arange(series.shape[0]), series, color=colour, linewidth=4, label=name)
ax.legend()

# Right panel: the same series after the common linear ramp was added.
ax2 = fig.add_axes([0.6, 0.1, 0.4, 0.3])
for series, colour, name in ((a1, 'k', 'a1'), (b1, 'r', 'b1')):
    ax2.plot(np.arange(series.shape[0]), series, color=colour, linewidth=4, label=name)
ax2.legend()
plt.show()

# Correlation coefficient of each pair; index 0 of the pearsonr result is r.
r_original = pearsonr(a, b)[0]
print('Rab = {}'.format(r_original))
r_with_trend = pearsonr(a1, b1)[0]
print('Ra1b1 = {}'.format(r_with_trend))

Rab = 0.10956251815325795
Ra1b1 = 0.963179297720568


In [37]:
from scipy.stats import pearsonr
import numpy as np
import matplotlib.pyplot as plt

# Shared periodic component: the ramp 0,1,2,3,4,3,2,1 repeated three times,
# closed with a final 0 (25 points in total).
periodic = np.array([0, 1, 2, 3, 4, 3, 2, 1] * 3 + [0])
# Alternating +1/-1 perturbation; the first point is left untouched.
perturbation = np.array([0] + [1, -1] * 12)

# a and b carry the same cycle but opposite-signed perturbations.
a = periodic + perturbation
b = periodic - perturbation

fig = plt.figure(figsize=(12,8))

# Left panel: the two series including their common periodic component.
ax = fig.add_axes([0.1, 0.1, 0.4, 0.3])
for series, colour, name in ((a, 'k', 'a'), (b, 'r', 'b')):
    ax.plot(np.arange(series.shape[0]), series, color=colour, linewidth=4, label=name)
ax.legend()

# Subtract the same 25-point cycle from both series.
cycle = np.array([0,1,2,3,4,3,2,1,0,1,2,3,4,3,2,1,0,1,2,3,4,3,2,1,0])
a1 = a - cycle
b1 = b - cycle

# Right panel: what is left of each series after the subtraction.
ax2 = fig.add_axes([0.6, 0.1, 0.4, 0.3])
for series, colour, name in ((a1, 'k', 'a1'), (b1, 'r', 'b1')):
    ax2.plot(np.arange(series.shape[0]), series, color=colour, linewidth=4, label=name)
ax2.legend()
plt.show()

# Correlation coefficient of each pair; index 0 of the pearsonr result is r.
r_with_cycle = pearsonr(a, b)[0]
print('Rab = {}'.format(r_with_cycle))
r_without_cycle = pearsonr(a1, b1)[0]
print('Ra1b1 = {}'.format(r_without_cycle))

Rab = 0.24812030075187969
Ra1b1 = -0.9999999999999999


建议在进行相关分析和回归分析之前，还是要先看看数据特征(趋势和周期性会在很大程度上干扰计算结果)，并进行适当的预处理(去趋势，滤波)。