# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Nigeria. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Nigeria"

In [3]:
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/" + COUNTRY + "/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'FCG <= 2', 'Fatalities', 'Lat', 'Lon', 'NDVI', 'NDVI Anomaly',
       'Population', 'Price cereals and tubers', 'Rainfalls (mm)', 'Ramadan',
       'rCSI >= 19'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG <= 2", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", "NDVI", "Price cereals and tubers", "Fatalities", "NDVI Anomaly", "rCSI >= 19", "Rainfalls (mm)"]
df = pd.concat([df.loc[:, (slice(None), feature)] for feature in INDICATORS_TO_CONSIDER], axis = 1).sort_index(axis = 1)
df

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Borno,Borno,Borno,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19,1 Month Anomaly Rainfalls (%),...,rCSI >= 19,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-01-18,100.259677,93.037140,23.586938,0.0,0.334429,96.868029,0.204482,0.0000,43.008741,100.000000,...,35.259528,100.000000,143.249296,30.722900,66.0,0.232465,99.634913,0.128119,0.0000,45.458462
2019-01-19,100.244839,93.977969,22.755425,0.0,0.332291,96.927069,0.204677,0.0000,43.619385,100.000000,...,34.719999,100.000000,143.210813,30.273423,66.0,0.231498,99.533540,0.127451,0.0000,45.499663
2019-01-20,100.230000,95.031538,22.547817,0.0,0.330208,96.985875,0.204873,0.0000,42.481559,100.000000,...,34.653470,100.000000,143.053244,31.047474,70.0,0.230541,99.430884,0.126782,0.0000,44.193480
2019-01-21,100.215191,96.061030,23.758271,0.0,0.328135,97.044913,0.205068,0.0006,42.558962,100.000000,...,34.362899,100.000000,142.693166,31.814861,70.0,0.229593,99.333907,0.126114,0.0000,45.048172
2019-01-22,100.200411,97.066445,22.374279,0.0,0.326072,97.104183,0.205264,0.0012,41.129399,100.000000,...,36.355709,100.000000,142.130581,30.674752,70.0,0.228655,99.242609,0.125446,0.0000,43.359342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-26,77.297220,89.440721,41.470788,0.0,0.649718,96.235517,0.842524,206.0412,16.702326,114.770357,...,11.992484,113.934396,123.454277,39.356354,10.0,0.509697,99.367495,0.491878,121.6882,21.466849
2020-09-27,78.097507,89.420572,45.390480,0.0,0.650146,96.312743,0.841555,204.7794,16.707763,115.029275,...,13.156250,113.501432,123.491188,41.278181,10.0,0.508952,99.450523,0.492387,117.3144,21.713746
2020-09-28,78.983166,89.407210,45.110420,0.0,0.650531,96.387221,0.840585,203.5176,19.095040,115.303064,...,14.377357,113.097442,123.535658,42.569246,10.0,0.508080,99.541990,0.492896,112.9406,24.472861
2020-09-29,79.954197,89.400633,45.623021,0.0,0.650872,96.458949,0.839615,202.2558,20.885011,115.591724,...,14.936559,112.722428,123.587685,43.839630,10.0,0.507082,99.641896,0.493406,108.5668,26.089463


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Borno,Borno,Borno,Borno,Yobe,Yobe,Yobe,Yobe
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2019-01-18,100.259677,93.03714,0.334429,96.868029,100.0,117.367238,0.267711,102.864071,100.0,143.249296,0.232465,99.634913
2019-01-19,100.244839,93.977969,0.332291,96.927069,100.0,117.995892,0.266671,102.866521,100.0,143.210813,0.231498,99.53354
2019-01-20,100.23,95.031538,0.330208,96.985875,100.0,118.633264,0.265653,102.869594,100.0,143.053244,0.230541,99.430884
2019-01-21,100.215191,96.06103,0.328135,97.044913,100.0,119.148946,0.26463,102.874483,100.0,142.693166,0.229593,99.333907
2019-01-22,100.200411,97.066445,0.326072,97.104183,100.0,119.542937,0.263603,102.881188,100.0,142.130581,0.228655,99.242609


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/nature_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.595842,0.345953,0.076218
3 Months Anomaly Rainfalls (%),0.595842,1.0,0.318319,0.167348
NDVI,0.345953,0.318319,1.0,-0.07788
NDVI Anomaly,0.076218,0.167348,-0.07788,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.604473,0.38841,0.124334
3 Months Anomaly Rainfalls (%),0.604473,1.0,0.446368,0.338059
NDVI,0.38841,0.446368,1.0,-0.120673
NDVI Anomaly,0.124334,0.338059,-0.120673,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.000193,0.024009,0.058005
3 Months Anomaly Rainfalls (%),0.000193,0.0,0.055066,0.067709
NDVI,0.024009,0.055066,0.0,0.128062
NDVI Anomaly,0.058005,0.067709,0.128062,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/all_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 Month Anomaly Rainfalls (%),1.0,0.595842,-0.220405,-0.258842,0.345953,0.076218,-0.149659,0.358118,0.223438
3 Months Anomaly Rainfalls (%),0.595842,1.0,-0.306089,-0.052421,0.318319,0.167348,-0.26768,-0.01394,0.10163
FCG <= 2,-0.220405,-0.306089,1.0,0.32695,-0.152326,-0.048154,0.409736,0.085097,-0.214853
Fatalities,-0.258842,-0.052421,0.32695,1.0,-0.180462,0.294727,0.165891,-0.21641,-0.128283
NDVI,0.345953,0.318319,-0.152326,-0.180462,1.0,-0.07788,0.238114,0.541229,-0.179323
NDVI Anomaly,0.076218,0.167348,-0.048154,0.294727,-0.07788,1.0,0.052842,-0.419658,-0.321005
Price cereals and tubers,-0.149659,-0.26768,0.409736,0.165891,0.238114,0.052842,1.0,0.230016,-0.63622
Rainfalls (mm),0.358118,-0.01394,0.085097,-0.21641,0.541229,-0.419658,0.230016,1.0,0.10987
rCSI >= 19,0.223438,0.10163,-0.214853,-0.128283,-0.179323,-0.321005,-0.63622,0.10987,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 Month Anomaly Rainfalls (%),1.0,0.604473,-0.333627,-0.23484,0.38841,0.124334,-0.155359,0.438908,0.208437
3 Months Anomaly Rainfalls (%),0.604473,1.0,-0.410258,-0.084587,0.446368,0.338059,-0.354061,-0.001156,0.069267
FCG <= 2,-0.333627,-0.410258,1.0,0.188191,-0.136421,-0.010587,0.40853,0.078962,-0.091154
Fatalities,-0.23484,-0.084587,0.188191,1.0,-0.125883,0.275208,0.169111,-0.353719,-0.21229
NDVI,0.38841,0.446368,-0.136421,-0.125883,1.0,-0.120673,0.179893,0.549443,-0.284396
NDVI Anomaly,0.124334,0.338059,-0.010587,0.275208,-0.120673,1.0,0.279831,-0.581841,-0.321817
Price cereals and tubers,-0.155359,-0.354061,0.40853,0.169111,0.179893,0.279831,1.0,0.203268,-0.654856
Rainfalls (mm),0.438908,-0.001156,0.078962,-0.353719,0.549443,-0.581841,0.203268,1.0,0.122794
rCSI >= 19,0.208437,0.069267,-0.091154,-0.21229,-0.284396,-0.321817,-0.654856,0.122794,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 Month Anomaly Rainfalls (%),0.0,0.000193,0.043082,0.003886,0.024009,0.058005,0.002586,0.027005,0.018034
3 Months Anomaly Rainfalls (%),0.000193,0.0,0.025592,0.004702,0.055066,0.067709,0.019684,0.042498,0.068179
FCG <= 2,0.043082,0.025592,0.0,0.046789,0.003619,0.07272,0.027065,0.000775,0.033134
Fatalities,0.003886,0.004702,0.046789,0.0,0.07748,0.01528,0.116134,0.05362,0.061177
NDVI,0.024009,0.055066,0.003619,0.07748,0.0,0.128062,0.008495,0.013928,0.030538
NDVI Anomaly,0.058005,0.067709,0.07272,0.01528,0.128062,0.0,0.124801,0.135119,0.067746
Price cereals and tubers,0.002586,0.019684,0.027065,0.116134,0.008495,0.124801,0.0,0.005962,0.002674
Rainfalls (mm),0.027005,0.042498,0.000775,0.05362,0.013928,0.135119,0.005962,0.0,0.003649
rCSI >= 19,0.018034,0.068179,0.033134,0.061177,0.030538,0.067746,0.002674,0.003649,0.0
