# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Yemen. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Cameroon"

In [3]:
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI', 'NDVI Anomaly',
       'Population', 'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", 
                          "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", 
                          "rCSI", "Rainfalls (mm)"]

df = df.loc[:, df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)]
df.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Central,Central,...,South-West,South-West,West,West,West,West,West,West,West,West
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-06-30,101.837038,102.942884,26.79795,0.0,0.698378,104.961832,514.7136,16.43836,94.105788,103.036325,...,250.133,57.68452,98.404472,114.494632,39.07455,1.0,0.733825,103.350519,478.4319,20.90831
2019-07-01,101.745991,102.802374,27.08333,0.0,0.700721,104.874987,520.9204,17.0,93.430054,102.645889,...,261.6982,56.5694,97.784255,113.824392,37.46919,1.0,0.735867,103.274955,489.1485,20.0493
2019-07-02,101.651256,102.667455,25.39063,0.0,0.703054,104.788066,527.1272,15.9375,92.797532,102.271692,...,273.2634,57.57078,97.234921,113.185486,37.77065,1.0,0.737881,103.199658,499.8651,19.56696
2019-07-03,101.552834,102.538126,24.86611,0.0,0.705376,104.701069,533.334,15.60826,92.208223,101.913731,...,284.8286,54.00416,96.756472,112.577914,38.15074,1.0,0.739869,103.124628,510.5817,20.51282
2019-07-04,101.450723,102.414388,24.41773,0.0,0.707687,104.613995,539.5408,15.32682,91.662127,101.572009,...,296.3938,54.08571,96.348907,112.001678,37.79831,1.0,0.74183,103.049865,521.2983,20.32333


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Central,Central,Central,Central,East,East,...,South,South,South-West,South-West,South-West,South-West,West,West,West,West
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-06-30,101.837038,102.942884,0.698378,104.961832,94.105788,103.036325,0.806272,101.630051,95.792424,98.517289,...,0.833746,100.566246,88.646429,89.991535,0.819161,100.284568,98.404472,114.494632,0.733825,103.350519
2019-07-01,101.745991,102.802374,0.700721,104.874987,93.430054,102.645889,0.807503,101.617239,95.34998,98.397911,...,0.834372,100.573365,87.94169,89.368156,0.819536,100.28982,97.784255,113.824392,0.735867,103.274955
2019-07-02,101.651256,102.667455,0.703054,104.788066,92.797532,102.271692,0.808719,101.604242,94.924146,98.281927,...,0.83499,100.580691,87.2887,88.816451,0.819911,100.294394,97.234921,113.185486,0.737881,103.199658
2019-07-03,101.552834,102.538126,0.705376,104.701069,92.208223,101.913731,0.809918,101.591059,94.514922,98.169337,...,0.835599,100.588223,86.687459,88.336422,0.820285,100.29829,96.756472,112.577914,0.739869,103.124628
2019-07-04,101.450723,102.414388,0.707687,104.613995,91.662127,101.572009,0.811102,101.577691,94.122309,98.06014,...,0.8362,100.595963,86.137967,87.928067,0.820659,100.301508,96.348907,112.001678,0.74183,103.049865


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
#corr.export_png(f"./output_images/nature_indicators_{COUNTRY}.png")
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.61054,0.360909,0.022488
3 Months Anomaly Rainfalls (%),0.61054,1.0,0.086732,0.32722
NDVI,0.360909,0.086732,1.0,-0.328852
NDVI Anomaly,0.022488,0.32722,-0.328852,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.653719,0.453782,0.066178
3 Months Anomaly Rainfalls (%),0.653719,1.0,0.165051,0.423603
NDVI,0.453782,0.165051,1.0,-0.327369
NDVI Anomaly,0.066178,0.423603,-0.327369,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.020061,0.096457,0.153259
3 Months Anomaly Rainfalls (%),0.020061,0.0,0.112536,0.170862
NDVI,0.096457,0.112536,0.0,0.068751
NDVI Anomaly,0.153259,0.170862,0.068751,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    if not np.isnan(np.sum(mtrx)): # nan if the time-series is completly flat (fatalities).
        corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png(f"./output_images/all_indicators_{COUNTRY}.png")
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1 Month Anomaly Rainfalls (%),1.0,0.600878,-0.166683,-0.120542,0.342868,0.08497,0.357562,-0.117905
3 Months Anomaly Rainfalls (%),0.600878,1.0,-0.092191,-0.013921,0.07057,0.394949,0.006256,0.020339
FCG,-0.166683,-0.092191,1.0,-0.12699,0.149805,0.045007,0.031103,0.091544
Fatalities,-0.120542,-0.013921,-0.12699,1.0,-0.196345,-0.024586,-0.085932,-0.037406
NDVI,0.342868,0.07057,0.149805,-0.196345,1.0,-0.330367,0.79157,-0.288026
NDVI Anomaly,0.08497,0.394949,0.045007,-0.024586,-0.330367,1.0,-0.515188,0.232041
Rainfalls (mm),0.357562,0.006256,0.031103,-0.085932,0.79157,-0.515188,1.0,-0.341293
rCSI,-0.117905,0.020339,0.091544,-0.037406,-0.288026,0.232041,-0.341293,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1 Month Anomaly Rainfalls (%),1.0,0.644945,-0.312985,-0.163721,0.442216,0.178841,0.373286,-0.054307
3 Months Anomaly Rainfalls (%),0.644945,1.0,-0.169572,-0.080716,0.097917,0.653908,-0.033603,0.042977
FCG,-0.312985,-0.169572,1.0,-0.203043,0.166,0.094686,0.13693,0.125755
Fatalities,-0.163721,-0.080716,-0.203043,1.0,-0.212818,-0.04294,-0.238187,-0.060157
NDVI,0.442216,0.097917,0.166,-0.212818,1.0,-0.339519,0.833564,-0.450401
NDVI Anomaly,0.178841,0.653908,0.094686,-0.04294,-0.339519,1.0,-0.518036,0.230515
Rainfalls (mm),0.373286,-0.033603,0.13693,-0.238187,0.833564,-0.518036,1.0,-0.413112
rCSI,-0.054307,0.042977,0.125755,-0.060157,-0.450401,0.230515,-0.413112,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1 Month Anomaly Rainfalls (%),0.0,0.021356,0.137294,0.150388,0.10392,0.131248,0.122448,0.077788
3 Months Anomaly Rainfalls (%),0.021356,0.0,0.137705,0.10656,0.122429,0.143975,0.131065,0.110376
FCG,0.137294,0.137705,0.0,0.059899,0.112579,0.053,0.170655,0.165947
Fatalities,0.150388,0.10656,0.059899,0.0,0.052185,0.087183,0.119599,0.070673
NDVI,0.10392,0.122429,0.112579,0.052185,0.0,0.076367,0.011253,0.166906
NDVI Anomaly,0.131248,0.143975,0.053,0.087183,0.076367,0.0,0.040538,0.089036
Rainfalls (mm),0.122448,0.131065,0.170655,0.119599,0.011253,0.040538,0.0,0.133949
rCSI,0.077788,0.110376,0.165947,0.070673,0.166906,0.089036,0.133949,0.0
