# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Yemen. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Burkina Faso"

In [3]:
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/" + COUNTRY + "/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", 
                          "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", 
                          "rCSI", "Rainfalls (mm)"]

df = df.loc[:, df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)]
df.head()

AdminStrata,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,...,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-09-08,104.747834,106.271378,0.796155,30.9217,4.0,0.496905,95.011603,0.077833,424.5106,19.4995,...,117.742597,117.51808,0.796155,36.2381,0.0,0.6591,100.49678,0.295372,465.6788,17.7619
2019-09-09,104.800365,106.240116,0.799253,31.1039,4.0,0.500725,95.0447,0.078059,426.2755,19.5674,...,117.536886,117.497259,0.799253,35.8286,0.0,0.660844,100.491038,0.294013,467.8467,17.5612
2019-09-10,104.881777,106.220459,0.802352,32.0303,4.0,0.504537,95.083381,0.078284,428.0404,20.4007,...,117.317641,117.473018,0.802352,36.2542,0.0,0.66256,100.485497,0.292654,470.0146,17.8142
2019-09-11,104.988996,106.210471,0.805451,32.1673,4.0,0.508317,95.128519,0.078509,428.2382,20.4056,...,117.088827,117.445525,0.805451,35.8838,0.0,0.664241,100.480693,0.291296,471.5916,17.3366
2019-09-12,105.122024,106.210151,0.808549,31.7701,4.0,0.512066,95.180114,0.078734,428.436,19.2385,...,116.850446,117.414779,0.808549,34.995,0.0,0.665889,100.476627,0.289937,473.1686,17.2483


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Cascades,Cascades,Cascades,Cascades,Centre,Centre,...,Plateau-Central,Plateau-Central,Sahel,Sahel,Sahel,Sahel,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-09-08,104.747834,106.271378,0.496905,95.011603,120.695958,114.75658,0.639206,97.573089,119.617734,124.616223,...,0.432107,99.596995,107.588258,105.747809,0.285234,98.873611,117.742597,117.51808,0.6591,100.49678
2019-09-09,104.800365,106.240116,0.500725,95.0447,120.653097,114.893976,0.640395,97.538516,118.543365,124.24967,...,0.436231,99.652335,107.621997,105.809922,0.28897,99.15211,117.536886,117.497259,0.660844,100.491038
2019-09-10,104.881777,106.220459,0.504537,95.083381,120.554396,115.026871,0.641564,97.504471,117.477595,123.89301,...,0.440346,99.709246,107.640708,105.877591,0.29272,99.438098,117.317641,117.473018,0.66256,100.485497
2019-09-11,104.988996,106.210471,0.508317,95.128519,120.44291,115.15126,0.642714,97.471181,116.435078,123.546758,...,0.444421,99.765892,107.668223,105.952952,0.296412,99.724997,117.088827,117.445525,0.664241,100.480693
2019-09-12,105.122024,106.210151,0.512066,95.180114,120.318638,115.267143,0.643843,97.438646,115.415814,123.210914,...,0.448457,99.822274,107.704543,106.036006,0.300046,100.012805,116.850446,117.414779,0.665889,100.476627


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/nature_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.391697,0.499572,0.265261
3 Months Anomaly Rainfalls (%),0.391697,1.0,0.300936,0.723721
NDVI,0.499572,0.300936,1.0,0.211057
NDVI Anomaly,0.265261,0.723721,0.211057,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.333895,0.517471,0.32062
3 Months Anomaly Rainfalls (%),0.333895,1.0,0.31179,0.719357
NDVI,0.517471,0.31179,1.0,0.289576
NDVI Anomaly,0.32062,0.719357,0.289576,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.016766,0.019931,0.026266
3 Months Anomaly Rainfalls (%),0.016766,0.0,0.07124,0.015151
NDVI,0.019931,0.07124,0.0,0.191218
NDVI Anomaly,0.026266,0.015151,0.191218,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    if not np.isnan(np.sum(mtrx)): # nan if the time-series is completly flat (fatalities).
        corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/all_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.400234,-0.136278,-0.168116,0.041829,0.491648,0.261971,-0.144959,0.29126,0.422878
3 Months Anomaly Rainfalls (%),0.400234,1.0,-0.264933,-0.09237,-0.048482,0.282125,0.724085,-0.269807,-0.38046,0.383398
Exchange rate,-0.136278,-0.264933,1.0,0.1917,0.242531,-0.562251,-0.154519,-0.547044,-0.247426,-0.385683
FCG,-0.168116,-0.09237,0.1917,1.0,0.010877,-0.604525,-0.024827,0.023751,-0.345928,-0.357319
Fatalities,0.041829,-0.048482,0.242531,0.010877,1.0,-0.125471,-0.079703,-0.202539,-0.044599,-0.148381
NDVI,0.491648,0.282125,-0.562251,-0.604525,-0.125471,1.0,0.184311,0.249726,0.567856,0.647162
NDVI Anomaly,0.261971,0.724085,-0.154519,-0.024827,-0.079703,0.184311,1.0,-0.231702,-0.460831,0.33532
Price cereals and tubers,-0.144959,-0.269807,-0.547044,0.023751,-0.202539,0.249726,-0.231702,1.0,0.478164,0.052859
Rainfalls (mm),0.29126,-0.38046,-0.247426,-0.345928,-0.044599,0.567856,-0.460831,0.478164,1.0,0.240257
rCSI,0.422878,0.383398,-0.385683,-0.357319,-0.148381,0.647162,0.33532,0.052859,0.240257,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.33619,-0.154978,-0.215575,0.022977,0.491258,0.328554,-0.104594,0.288773,0.43483
3 Months Anomaly Rainfalls (%),0.33619,1.0,-0.363496,-0.092257,0.01701,0.303799,0.751918,-0.269775,-0.400019,0.424601
Exchange rate,-0.154978,-0.363496,1.0,0.166973,0.157661,-0.562141,-0.196941,-0.576245,-0.254892,-0.390178
FCG,-0.215575,-0.092257,0.166973,1.0,-0.026069,-0.64285,-0.111064,0.064011,-0.377595,-0.368609
Fatalities,0.022977,0.01701,0.157661,-0.026069,1.0,-0.189059,-0.017419,-0.289786,-0.101159,-0.216671
NDVI,0.491258,0.303799,-0.562141,-0.64285,-0.189059,1.0,0.262325,0.238073,0.568179,0.673952
NDVI Anomaly,0.328554,0.751918,-0.196941,-0.111064,-0.017419,0.262325,1.0,-0.308422,-0.581967,0.388747
Price cereals and tubers,-0.104594,-0.269775,-0.576245,0.064011,-0.289786,0.238073,-0.308422,1.0,0.505593,0.050976
Rainfalls (mm),0.288773,-0.400019,-0.254892,-0.377595,-0.101159,0.568179,-0.581967,0.505593,1.0,0.27183
rCSI,0.43483,0.424601,-0.390178,-0.368609,-0.216671,0.673952,0.388747,0.050976,0.27183,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),0.0,0.017216,0.071843,0.046816,0.104061,0.020776,0.028314,0.062858,0.007617,0.012637
3 Months Anomaly Rainfalls (%),0.017216,0.0,0.075372,0.091058,0.086667,0.072576,0.016412,0.078317,0.033768,0.058903
Exchange rate,0.071843,0.075372,0.0,0.01926,0.121062,0.001379,0.092473,0.008612,0.002092,0.028292
FCG,0.046816,0.091058,0.01926,0.0,0.119963,0.022207,0.168423,0.070358,0.074707,0.089288
Fatalities,0.104061,0.086667,0.121062,0.119963,0.0,0.17322,0.091626,0.125506,0.126856,0.109364
NDVI,0.020776,0.072576,0.001379,0.022207,0.17322,0.0,0.197853,0.014554,0.008255,0.036738
NDVI Anomaly,0.028314,0.016412,0.092473,0.168423,0.091626,0.197853,0.0,0.088265,0.121569,0.138489
Price cereals and tubers,0.062858,0.078317,0.008612,0.070358,0.125506,0.014554,0.088265,0.0,0.024167,0.108708
Rainfalls (mm),0.007617,0.033768,0.002092,0.074707,0.126856,0.008255,0.121569,0.024167,0.0,0.113839
rCSI,0.012637,0.058903,0.028292,0.089288,0.109364,0.036738,0.138489,0.108708,0.113839,0.0
