# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Yemen. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Nigeria"

In [3]:
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/" + COUNTRY + "/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI', 'NDVI Anomaly',
       'Population', 'Price cereals and tubers', 'Rainfalls (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", 
                          "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", 
                          "rCSI", "Rainfalls (mm)"]

df = df.loc[:, df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)]
df.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Borno,Borno,Borno,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe,Yobe
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),...,rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-01-18,100.259677,93.03714,23.586938,0.0,0.334429,96.868029,0.351367,0.0,43.008741,100.0,...,35.259528,100.0,143.249296,30.7229,66.0,0.232465,99.634913,0.184969,0.0,45.458462
2019-01-19,100.244839,93.977969,22.755425,0.0,0.332291,96.927069,0.350677,0.0,43.619385,100.0,...,34.719999,100.0,143.210813,30.273423,66.0,0.231498,99.53354,0.18624,0.0,45.499663
2019-01-20,100.23,95.031538,22.547817,0.0,0.330208,96.985875,0.349988,0.0,42.481559,100.0,...,34.65347,100.0,143.053244,31.047474,70.0,0.230541,99.430884,0.18751,0.0,44.19348
2019-01-21,100.215191,96.06103,23.758271,0.0,0.328135,97.044913,0.349299,0.0006,42.558962,100.0,...,34.362899,100.0,142.693166,31.814861,70.0,0.229593,99.333907,0.18878,0.0,45.048172
2019-01-22,100.200411,97.066445,22.374279,0.0,0.326072,97.104183,0.348609,0.0012,41.129399,100.0,...,36.355709,100.0,142.130581,30.674752,70.0,0.228655,99.242609,0.190051,0.0,43.359342


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Borno,Borno,Borno,Borno,Yobe,Yobe,Yobe,Yobe
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2019-01-18,100.259677,93.03714,0.334429,96.868029,100.0,117.367238,0.267711,102.864071,100.0,143.249296,0.232465,99.634913
2019-01-19,100.244839,93.977969,0.332291,96.927069,100.0,117.995892,0.266671,102.866521,100.0,143.210813,0.231498,99.53354
2019-01-20,100.23,95.031538,0.330208,96.985875,100.0,118.633264,0.265653,102.869594,100.0,143.053244,0.230541,99.430884
2019-01-21,100.215191,96.06103,0.328135,97.044913,100.0,119.148946,0.26463,102.874483,100.0,142.693166,0.229593,99.333907
2019-01-22,100.200411,97.066445,0.326072,97.104183,100.0,119.542937,0.263603,102.881188,100.0,142.130581,0.228655,99.242609


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
#corr.export_png(f"./output_images/nature_indicators_{COUNTRY}.png")
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.607348,0.264335,0.088873
3 Months Anomaly Rainfalls (%),0.607348,1.0,0.268769,0.225402
NDVI,0.264335,0.268769,1.0,-0.025976
NDVI Anomaly,0.088873,0.225402,-0.025976,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.574781,0.405439,0.217168
3 Months Anomaly Rainfalls (%),0.574781,1.0,0.432348,0.356324
NDVI,0.405439,0.432348,1.0,0.001063
NDVI Anomaly,0.217168,0.356324,0.001063,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.005572,0.043701,0.041836
3 Months Anomaly Rainfalls (%),0.005572,0.0,0.099845,0.054135
NDVI,0.043701,0.099845,0.0,0.135925
NDVI Anomaly,0.041836,0.054135,0.135925,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    if not np.isnan(np.sum(mtrx)): # nan if the time-series is completly flat (fatalities).
        corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png(f"./output_images/all_indicators_{COUNTRY}.png")
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 Month Anomaly Rainfalls (%),1.0,0.607348,-0.248541,-0.206187,0.264335,0.088873,-0.359558,0.379038,0.046697
3 Months Anomaly Rainfalls (%),0.607348,1.0,-0.285934,0.020338,0.268769,0.225402,-0.531196,0.027146,0.046375
FCG,-0.248541,-0.285934,1.0,0.31352,0.032336,0.004366,0.62048,0.089844,-0.03124
Fatalities,-0.206187,0.020338,0.31352,1.0,-0.165533,0.340617,0.202315,-0.214053,-0.062075
NDVI,0.264335,0.268769,0.032336,-0.165533,1.0,-0.025976,0.109699,0.51192,-0.020467
NDVI Anomaly,0.088873,0.225402,0.004366,0.340617,-0.025976,1.0,0.012725,-0.390909,-0.174879
Price cereals and tubers,-0.359558,-0.531196,0.62048,0.202315,0.109699,0.012725,1.0,0.024511,0.000913
Rainfalls (mm),0.379038,0.027146,0.089844,-0.214053,0.51192,-0.390909,0.024511,1.0,0.021745
rCSI,0.046697,0.046375,-0.03124,-0.062075,-0.020467,-0.174879,0.000913,0.021745,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 Month Anomaly Rainfalls (%),1.0,0.574781,-0.394161,-0.164568,0.405439,0.217168,-0.334948,0.464193,0.150397
3 Months Anomaly Rainfalls (%),0.574781,1.0,-0.377393,0.064081,0.432348,0.356324,-0.601071,0.002186,0.126398
FCG,-0.394161,-0.377393,1.0,0.218066,0.057655,0.158326,0.580518,0.076882,0.042504
Fatalities,-0.164568,0.064081,0.218066,1.0,-0.059771,0.323599,0.235736,-0.323216,-0.096649
NDVI,0.405439,0.432348,0.057655,-0.059771,1.0,0.001063,0.119129,0.520103,-0.070581
NDVI Anomaly,0.217168,0.356324,0.158326,0.323599,0.001063,1.0,0.222281,-0.559855,-0.113324
Price cereals and tubers,-0.334948,-0.601071,0.580518,0.235736,0.119129,0.222281,1.0,0.038256,0.077484
Rainfalls (mm),0.464193,0.002186,0.076882,-0.323216,0.520103,-0.559855,0.038256,1.0,0.034582
rCSI,0.150397,0.126398,0.042504,-0.096649,-0.070581,-0.113324,0.077484,0.034582,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 Month Anomaly Rainfalls (%),0.0,0.005572,0.057157,0.028607,0.043701,0.041836,0.022103,0.026285,0.024436
3 Months Anomaly Rainfalls (%),0.005572,0.0,0.060606,0.007162,0.099845,0.054135,0.033243,0.029317,0.110162
FCG,0.057157,0.060606,0.0,0.074127,0.001855,0.1148,0.014186,0.000905,0.037812
Fatalities,0.028607,0.007162,0.074127,0.0,0.105402,0.008557,0.034843,0.03481,0.011148
NDVI,0.043701,0.099845,0.001855,0.105402,0.0,0.135925,0.000458,0.009302,0.017101
NDVI Anomaly,0.041836,0.054135,0.1148,0.008557,0.135925,0.0,0.095713,0.092991,0.011082
Price cereals and tubers,0.022103,0.033243,0.014186,0.034843,0.000458,0.095713,0.0,0.001205,0.066946
Rainfalls (mm),0.026285,0.029317,0.000905,0.03481,0.009302,0.092991,0.001205,0.0,0.004524
rCSI,0.024436,0.110162,0.037812,0.011148,0.017101,0.011082,0.066946,0.004524,0.0
