# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Yemen. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Yemen"

In [3]:
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", 
                          "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", 
                          "rCSI", "Rainfalls (mm)"]

df = df.loc[:, df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)]
df.head()

AdminStrata,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,...,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-08-22,90.454758,111.487446,0.207215,31.8098,10.0,0.098889,98.450151,0.164924,12.9544,37.4219,...,93.865658,147.652361,0.207371,39.0431,171.0,0.198394,104.340193,0.181576,41.6606,50.919
2018-08-23,89.92758,110.382403,0.210332,32.3747,10.0,0.098958,98.353086,0.166085,13.0949,37.3462,...,94.296436,145.973629,0.210846,39.3717,147.0,0.198516,104.184601,0.181735,42.6437,50.4159
2018-08-24,89.43678,109.316308,0.213449,33.7721,10.0,0.099033,98.254291,0.167246,13.2354,37.9994,...,94.778817,144.313356,0.214321,36.6621,145.0,0.198655,104.02175,0.181894,43.6268,49.3913
2018-08-25,88.982357,108.289161,0.216565,34.5337,10.0,0.099113,98.153766,0.168407,13.3759,36.0682,...,95.312802,142.671542,0.217796,37.2052,156.0,0.198812,103.851638,0.182052,44.6099,50.3024
2018-08-26,88.564312,107.300961,0.219682,32.3279,10.0,0.099197,98.05151,0.169568,13.5164,38.2185,...,95.89839,141.048187,0.221272,37.0257,164.0,0.198987,103.674267,0.182211,45.593,50.293


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Abyan,Abyan,Abyan,Abyan,Aden,Aden,Aden,Aden,Al Bayda,Al Bayda,...,Sana'a,Sana'a,Shabwah,Shabwah,Shabwah,Shabwah,Taizz,Taizz,Taizz,Taizz
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-08-22,90.454758,111.487446,0.098889,98.450151,103.239156,156.402304,0.072064,99.190492,85.034755,108.16783,...,0.161509,99.498392,81.037435,100.409635,0.104272,99.41175,93.865658,147.652361,0.198394,104.340193
2018-08-23,89.92758,110.382403,0.098958,98.353086,102.928818,154.594533,0.072125,99.208464,84.49981,106.835572,...,0.162572,99.485905,80.327073,99.032806,0.104312,99.381895,94.296436,145.973629,0.198516,104.184601
2018-08-24,89.43678,109.316308,0.099033,98.254291,102.652003,152.798301,0.072188,99.226749,84.052257,105.60444,...,0.163661,99.480545,79.67166,97.728586,0.104355,99.351971,94.778817,144.313356,0.198655,104.02175
2018-08-25,88.982357,108.289161,0.099113,98.153766,102.408712,151.013609,0.072253,99.245345,83.692096,104.474432,...,0.164775,99.482315,79.071198,96.496975,0.104401,99.321979,95.312802,142.671542,0.198812,103.851638
2018-08-26,88.564312,107.300961,0.099197,98.05151,102.198944,149.240456,0.072319,99.264254,83.419328,103.44555,...,0.165916,99.491213,78.525686,95.337974,0.104449,99.291919,95.89839,141.048187,0.198987,103.674267


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
#corr.export_png(f"./output_images/nature_indicators_{COUNTRY}.png")
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.6103,0.257404,0.355852
3 Months Anomaly Rainfalls (%),0.6103,1.0,0.321676,0.505726
NDVI,0.257404,0.321676,1.0,0.791278
NDVI Anomaly,0.355852,0.505726,0.791278,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.610371,0.256718,0.342457
3 Months Anomaly Rainfalls (%),0.610371,1.0,0.308328,0.477129
NDVI,0.256718,0.308328,1.0,0.84372
NDVI Anomaly,0.342457,0.477129,0.84372,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.011448,0.016634,0.033902
3 Months Anomaly Rainfalls (%),0.011448,0.0,0.034115,0.042992
NDVI,0.016634,0.034115,0.0,0.024893
NDVI Anomaly,0.033902,0.042992,0.024893,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    if not np.isnan(np.sum(mtrx)): # nan if the time-series is completly flat (fatalities).
        corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png(f"./output_images/all_indicators_{COUNTRY}.png")
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.6103,0.236153,0.019393,-0.004868,0.257404,0.355852,0.186847,0.27165,-0.101648
3 Months Anomaly Rainfalls (%),0.6103,1.0,0.317509,0.111607,0.025258,0.321676,0.505726,0.258921,0.15455,-0.173187
Exchange rate,0.236153,0.317509,1.0,0.335435,-0.049184,0.461822,0.360308,0.597249,0.032489,0.07298
FCG,0.019393,0.111607,0.335435,1.0,0.018496,0.190707,0.157465,0.288999,-0.103224,0.180914
Fatalities,-0.004868,0.025258,-0.049184,0.018496,1.0,-0.129264,-0.166039,-0.039507,0.00983,0.031123
NDVI,0.257404,0.321676,0.461822,0.190707,-0.129264,1.0,0.791278,0.33012,0.038061,0.014877
NDVI Anomaly,0.355852,0.505726,0.360308,0.157465,-0.166039,0.791278,1.0,0.32414,0.074102,-0.081521
Price cereals and tubers,0.186847,0.258921,0.597249,0.288999,-0.039507,0.33012,0.32414,1.0,-0.055175,0.012815
Rainfalls (mm),0.27165,0.15455,0.032489,-0.103224,0.00983,0.038061,0.074102,-0.055175,1.0,-0.11741
rCSI,-0.101648,-0.173187,0.07298,0.180914,0.031123,0.014877,-0.081521,0.012815,-0.11741,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.610371,0.25896,-0.011137,-0.011976,0.256718,0.342457,0.226675,0.267721,-0.093975
3 Months Anomaly Rainfalls (%),0.610371,1.0,0.338483,0.103525,0.025461,0.308328,0.477129,0.3141,0.165874,-0.190834
Exchange rate,0.25896,0.338483,1.0,0.331731,-0.021182,0.468452,0.358921,0.671934,-0.021747,0.126703
FCG,-0.011137,0.103525,0.331731,1.0,0.012102,0.188945,0.155028,0.290474,-0.128039,0.176237
Fatalities,-0.011976,0.025461,-0.021182,0.012102,1.0,-0.227296,-0.236042,-0.036664,0.036782,0.007243
NDVI,0.256718,0.308328,0.468452,0.188945,-0.227296,1.0,0.84372,0.380214,0.089774,0.012712
NDVI Anomaly,0.342457,0.477129,0.358921,0.155028,-0.236042,0.84372,1.0,0.3874,0.044014,-0.082089
Price cereals and tubers,0.226675,0.3141,0.671934,0.290474,-0.036664,0.380214,0.3874,1.0,0.005516,-0.059475
Rainfalls (mm),0.267721,0.165874,-0.021747,-0.128039,0.036782,0.089774,0.044014,0.005516,1.0,-0.102126
rCSI,-0.093975,-0.190834,0.126703,0.176237,0.007243,0.012712,-0.082089,-0.059475,-0.102126,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),0.0,0.011448,0.014105,0.033661,0.064709,0.016634,0.033902,0.040431,0.011589,0.028758
3 Months Anomaly Rainfalls (%),0.011448,0.0,0.014193,0.041005,0.090873,0.034115,0.042992,0.058228,0.024783,0.030248
Exchange rate,0.014105,0.014193,0.0,0.024963,0.115835,0.038027,0.046128,0.062939,0.025946,0.040823
FCG,0.033661,0.041005,0.024963,0.0,0.033791,0.040918,0.060845,0.025088,0.028047,0.02658
Fatalities,0.064709,0.090873,0.115835,0.033791,0.0,0.139253,0.144039,0.101626,0.030183,0.025532
NDVI,0.016634,0.034115,0.038027,0.040918,0.139253,0.0,0.024893,0.061225,0.049286,0.034875
NDVI Anomaly,0.033902,0.042992,0.046128,0.060845,0.144039,0.024893,0.0,0.06001,0.025147,0.047289
Price cereals and tubers,0.040431,0.058228,0.062939,0.025088,0.101626,0.061225,0.06001,0.0,0.037672,0.035894
Rainfalls (mm),0.011589,0.024783,0.025946,0.028047,0.030183,0.049286,0.025147,0.037672,0.0,0.0219
rCSI,0.028758,0.030248,0.040823,0.02658,0.025532,0.034875,0.047289,0.035894,0.0219,0.0
