# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Yemen. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Yemen"

In [3]:
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/" + COUNTRY + "/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG <= 2', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI >= 19'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG <= 2", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", "rCSI >= 19", "Rainfalls (mm)"]
df = pd.concat([df.loc[:, (slice(None), feature)] for feature in INDICATORS_TO_CONSIDER], axis = 1).sort_index(axis = 1)
df

AdminStrata,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,...,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-08-22,90.454758,111.487446,0.133546,31.809805,10.0,0.098889,98.450151,0.164757,12.9544,37.421880,...,93.865658,147.652361,0.133716,39.043078,171.0,0.198394,104.340193,0.182603,41.6606,50.919038
2018-08-23,89.927580,110.382403,0.136952,32.374660,10.0,0.098958,98.353086,0.166101,13.0949,37.346207,...,94.296436,145.973629,0.137514,39.371670,147.0,0.198516,104.184601,0.182842,42.6437,50.415852
2018-08-24,89.436780,109.316308,0.140358,33.772110,10.0,0.099033,98.254291,0.167444,13.2354,37.999408,...,94.778817,144.313356,0.141312,36.662083,145.0,0.198655,104.021750,0.183081,43.6268,49.391298
2018-08-25,88.982357,108.289161,0.143765,34.533738,10.0,0.099113,98.153766,0.168787,13.3759,36.068234,...,95.312802,142.671542,0.145110,37.205170,156.0,0.198812,103.851638,0.183321,44.6099,50.302392
2018-08-26,88.564312,107.300961,0.147171,32.327892,10.0,0.099197,98.051510,0.170131,13.5164,38.218464,...,95.898390,141.048187,0.148908,37.025723,164.0,0.198987,103.674267,0.183560,45.5930,50.293046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-31,162.025191,200.286329,0.619324,42.763420,48.0,0.172453,144.744706,0.502412,20.4773,38.396000,...,155.333204,162.636261,0.628828,39.207890,33.0,0.255944,125.313621,0.461194,43.7862,51.377280
2020-09-01,159.268848,198.375167,0.622186,43.810070,38.0,0.172852,144.749188,0.505260,20.2776,38.852410,...,152.262088,162.045383,0.631621,38.234510,32.0,0.256475,125.151876,0.464238,44.2681,52.432860
2020-09-02,156.609087,196.526964,0.625047,43.068170,38.0,0.173202,144.721651,0.508108,20.0779,39.113680,...,149.358157,161.481710,0.634414,37.747650,32.0,0.256981,124.969963,0.467282,44.7500,51.407720
2020-09-03,154.045906,194.741718,0.627908,44.793680,38.0,0.173502,144.662094,0.510956,19.8782,39.012960,...,146.621410,160.945243,0.637207,37.762790,33.0,0.257461,124.767879,0.470326,45.2319,52.016400


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Abyan,Abyan,Abyan,Abyan,Aden,Aden,Aden,Aden,Al Bayda,Al Bayda,...,Sana'a,Sana'a,Shabwah,Shabwah,Shabwah,Shabwah,Taizz,Taizz,Taizz,Taizz
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-08-22,90.454758,111.487446,0.098889,98.450151,103.239156,156.402304,0.072064,99.190492,85.034755,108.16783,...,0.161509,99.498392,81.037435,100.409635,0.104272,99.41175,93.865658,147.652361,0.198394,104.340193
2018-08-23,89.92758,110.382403,0.098958,98.353086,102.928818,154.594533,0.072125,99.208464,84.49981,106.835572,...,0.162572,99.485905,80.327073,99.032806,0.104312,99.381895,94.296436,145.973629,0.198516,104.184601
2018-08-24,89.43678,109.316308,0.099033,98.254291,102.652003,152.798301,0.072188,99.226749,84.052257,105.60444,...,0.163661,99.480545,79.67166,97.728586,0.104355,99.351971,94.778817,144.313356,0.198655,104.02175
2018-08-25,88.982357,108.289161,0.099113,98.153766,102.408712,151.013609,0.072253,99.245345,83.692096,104.474432,...,0.164775,99.482315,79.071198,96.496975,0.104401,99.321979,95.312802,142.671542,0.198812,103.851638
2018-08-26,88.564312,107.300961,0.099197,98.05151,102.198944,149.240456,0.072319,99.264254,83.419328,103.44555,...,0.165916,99.491213,78.525686,95.337974,0.104449,99.291919,95.89839,141.048187,0.198987,103.674267


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/nature_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.665813,0.326543,0.381977
3 Months Anomaly Rainfalls (%),0.665813,1.0,0.196075,0.419404
NDVI,0.326543,0.196075,1.0,0.773735
NDVI Anomaly,0.381977,0.419404,0.773735,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.652138,0.288106,0.348847
3 Months Anomaly Rainfalls (%),0.652138,1.0,0.145921,0.347663
NDVI,0.288106,0.145921,1.0,0.819071
NDVI Anomaly,0.348847,0.347663,0.819071,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.013214,0.021556,0.045554
3 Months Anomaly Rainfalls (%),0.013214,0.0,0.060913,0.071691
NDVI,0.021556,0.060913,0.0,0.020842
NDVI Anomaly,0.045554,0.071691,0.020842,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/all_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.665813,0.380389,0.078697,-0.022557,0.326543,0.381977,0.300407,0.214146,-0.09223
3 Months Anomaly Rainfalls (%),0.665813,1.0,0.275855,0.048881,0.036424,0.196075,0.419404,0.214165,0.190976,-0.179952
Exchange rate,0.380389,0.275855,1.0,0.194204,-0.056503,0.368126,0.299596,0.511189,0.201464,0.036487
FCG <= 2,0.078697,0.048881,0.194204,1.0,0.063,0.072247,0.068269,0.168391,-0.034735,0.186525
Fatalities,-0.022557,0.036424,-0.056503,0.063,1.0,-0.130152,-0.174773,-0.058731,-0.013646,0.02074
NDVI,0.326543,0.196075,0.368126,0.072247,-0.130152,1.0,0.773735,0.220194,0.105152,0.011957
NDVI Anomaly,0.381977,0.419404,0.299596,0.068269,-0.174773,0.773735,1.0,0.242642,0.113648,-0.091389
Price cereals and tubers,0.300407,0.214165,0.511189,0.168391,-0.058731,0.220194,0.242642,1.0,0.070338,-0.014635
Rainfalls (mm),0.214146,0.190976,0.201464,-0.034735,-0.013646,0.105152,0.113648,0.070338,1.0,-0.117414
rCSI >= 19,-0.09223,-0.179952,0.036487,0.186525,0.02074,0.011957,-0.091389,-0.014635,-0.117414,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.652138,0.337622,0.019394,-0.073893,0.288106,0.348847,0.374075,0.202061,-0.116995
3 Months Anomaly Rainfalls (%),0.652138,1.0,0.28342,0.040364,0.036715,0.145921,0.347663,0.282371,0.177708,-0.200913
Exchange rate,0.337622,0.28342,1.0,0.216777,-0.138126,0.383036,0.298991,0.619878,0.153206,0.13936
FCG <= 2,0.019394,0.040364,0.216777,1.0,0.077973,0.070691,0.067444,0.208115,-0.072436,0.178476
Fatalities,-0.073893,0.036715,-0.138126,0.077973,1.0,-0.202415,-0.231538,-0.077559,-0.03529,0.016429
NDVI,0.288106,0.145921,0.383036,0.070691,-0.202415,1.0,0.819071,0.255819,0.156037,0.015521
NDVI Anomaly,0.348847,0.347663,0.298991,0.067444,-0.231538,0.819071,1.0,0.298415,0.116125,-0.08463
Price cereals and tubers,0.374075,0.282371,0.619878,0.208115,-0.077559,0.255819,0.298415,1.0,0.121269,0.011536
Rainfalls (mm),0.202061,0.177708,0.153206,-0.072436,-0.03529,0.156037,0.116125,0.121269,1.0,-0.075983
rCSI >= 19,-0.116995,-0.200913,0.13936,0.178476,0.016429,0.015521,-0.08463,0.011536,-0.075983,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),0.0,0.013214,0.021159,0.047416,0.078179,0.021556,0.045554,0.062723,0.015866,0.036376
3 Months Anomaly Rainfalls (%),0.013214,0.0,0.027739,0.054383,0.091191,0.060913,0.071691,0.068786,0.046919,0.039382
Exchange rate,0.021159,0.027739,0.0,0.040155,0.097576,0.053719,0.054195,0.056527,0.028316,0.049972
FCG <= 2,0.047416,0.054383,0.040155,0.0,0.035919,0.049164,0.072743,0.032536,0.044626,0.030678
Fatalities,0.078179,0.091191,0.097576,0.035919,0.0,0.130586,0.148857,0.078797,0.041579,0.047048
NDVI,0.021556,0.060913,0.053719,0.049164,0.130586,0.0,0.020842,0.057908,0.049609,0.039603
NDVI Anomaly,0.045554,0.071691,0.054195,0.072743,0.148857,0.020842,0.0,0.067934,0.028708,0.052251
Price cereals and tubers,0.062723,0.068786,0.056527,0.032536,0.078797,0.057908,0.067934,0.0,0.053419,0.047966
Rainfalls (mm),0.015866,0.046919,0.028316,0.044626,0.041579,0.049609,0.028708,0.053419,0.0,0.031827
rCSI >= 19,0.036376,0.039382,0.049972,0.030678,0.047048,0.039603,0.052251,0.047966,0.031827,0.0
