# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Yemen. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Syria"

In [3]:
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/" + COUNTRY + "/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", 
                          "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", 
                          "rCSI", "Rainfalls (mm)"]

df = df.loc[:, df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)]
df.head()

AdminStrata,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,...,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-29,99.920932,103.281473,0.014694,36.83872,109.5,0.117052,89.349382,0.039965,0.1372,34.0544,...,113.829315,179.577637,0.014121,12.46537,21.0,0.445703,114.29633,0.128021,7.9716,27.31994
2018-09-30,99.894768,102.581411,0.014829,36.83872,99.5,0.116985,89.32613,0.040093,0.15,34.0544,...,113.631431,173.128816,0.014243,12.94964,21.0,0.44519,114.196334,0.128717,8.1647,26.22302
2018-10-01,99.936859,102.137474,0.015061,36.49929,97.5,0.116915,89.301813,0.040317,0.7876,33.67614,...,113.462981,166.727201,0.01443,12.45675,21.0,0.444679,114.087915,0.129239,10.0496,25.22491
2018-10-02,100.047045,101.810225,0.015157,36.37038,97.0,0.116842,89.27532,0.040412,1.4245,31.54574,...,113.241732,160.712897,0.014494,12.18369,21.0,0.44417,113.971913,0.129066,11.5057,27.92877
2018-10-03,100.225326,101.599662,0.015254,38.25699,85.0,0.116765,89.246651,0.040507,2.0614,32.7672,...,112.967684,155.085903,0.014559,12.72342,21.0,0.443665,113.848328,0.128892,12.9618,27.0827


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Aleppo,Aleppo,Aleppo,Aleppo,Ar-Raqqa,Ar-Raqqa,...,Lattakia,Lattakia,Rural Damascus,Rural Damascus,Rural Damascus,Rural Damascus,Tartous,Tartous,Tartous,Tartous
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-29,99.920932,103.281473,0.117052,89.349382,105.993454,133.452545,0.1436,91.675148,100.049355,102.916087,...,0.515435,111.358296,101.030439,102.970369,0.112772,99.005201,113.829315,179.577637,0.445703,114.29633
2018-09-30,99.894768,102.581411,0.116985,89.32613,106.154101,130.698066,0.143704,91.706311,100.060323,102.367125,...,0.515319,111.310457,101.259426,102.765071,0.112783,99.025906,113.631431,173.128816,0.44519,114.196334
2018-10-01,99.936859,102.137474,0.116915,89.301813,106.310695,128.03035,0.143805,91.738026,100.169167,102.077634,...,0.515204,111.256994,101.513572,102.746637,0.112794,99.046382,113.462981,166.727201,0.444679,114.087915
2018-10-02,100.047045,101.810225,0.116842,89.27532,106.443678,125.559649,0.1439,91.76878,100.37589,101.924694,...,0.515084,111.19752,101.792879,102.780557,0.112806,99.066249,113.241732,160.712897,0.44417,113.971913
2018-10-03,100.225326,101.599662,0.116765,89.246651,106.55305,123.285963,0.143991,91.798573,100.680489,101.908306,...,0.514961,111.132034,102.097346,102.86683,0.112819,99.085507,112.967684,155.085903,0.443665,113.848328


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/nature_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.759017,0.195125,0.160703
3 Months Anomaly Rainfalls (%),0.759017,1.0,0.217311,0.230635
NDVI,0.195125,0.217311,1.0,0.708845
NDVI Anomaly,0.160703,0.230635,0.708845,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.769881,0.205866,0.21592
3 Months Anomaly Rainfalls (%),0.769881,1.0,0.243957,0.387722
NDVI,0.205866,0.243957,1.0,0.860262
NDVI Anomaly,0.21592,0.387722,0.860262,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.002555,0.012537,0.052772
3 Months Anomaly Rainfalls (%),0.002555,0.0,0.025992,0.092015
NDVI,0.012537,0.025992,0.0,0.129384
NDVI Anomaly,0.052772,0.092015,0.129384,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/all_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.759017,-0.619515,-0.190199,0.058791,0.195125,0.160703,-0.353461,0.504985,-0.11462
3 Months Anomaly Rainfalls (%),0.759017,1.0,-0.733265,-0.367177,0.035203,0.217311,0.230635,-0.470658,0.37664,-0.218378
Exchange rate,-0.619515,-0.733265,1.0,0.497093,-0.102795,0.049969,-0.029906,0.78985,-0.41454,0.336606
FCG,-0.190199,-0.367177,0.497093,1.0,-0.125717,-0.036069,-0.186494,0.53115,0.032741,0.422791
Fatalities,0.058791,0.035203,-0.102795,-0.125717,1.0,0.048126,0.117147,-0.032519,0.041219,-0.145866
NDVI,0.195125,0.217311,0.049969,-0.036069,0.048126,1.0,0.708845,0.385978,0.45709,0.117395
NDVI Anomaly,0.160703,0.230635,-0.029906,-0.186494,0.117147,0.708845,1.0,0.102843,0.131714,0.056252
Price cereals and tubers,-0.353461,-0.470658,0.78985,0.53115,-0.032519,0.385978,0.102843,1.0,-0.062626,0.338384
Rainfalls (mm),0.504985,0.37664,-0.41454,0.032741,0.041219,0.45709,0.131714,-0.062626,1.0,-0.093556
rCSI,-0.11462,-0.218378,0.336606,0.422791,-0.145866,0.117395,0.056252,0.338384,-0.093556,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.769881,-0.635997,-0.170824,0.134238,0.205866,0.21592,-0.280389,0.497351,-0.110741
3 Months Anomaly Rainfalls (%),0.769881,1.0,-0.77434,-0.351889,0.042822,0.243957,0.387722,-0.4492,0.403527,-0.152307
Exchange rate,-0.635997,-0.77434,1.0,0.571797,-0.129886,-0.023409,-0.125054,0.80795,-0.410242,0.326135
FCG,-0.170824,-0.351889,0.571797,1.0,-0.133843,-0.064123,-0.274118,0.604574,-0.006534,0.383232
Fatalities,0.134238,0.042822,-0.129886,-0.133843,1.0,0.04938,0.159214,-0.015093,0.021504,-0.078427
NDVI,0.205866,0.243957,-0.023409,-0.064123,0.04938,1.0,0.860262,0.390026,0.48836,0.05877
NDVI Anomaly,0.21592,0.387722,-0.125054,-0.274118,0.159214,0.860262,1.0,-0.043738,0.302701,0.096897
Price cereals and tubers,-0.280389,-0.4492,0.80795,0.604574,-0.015093,0.390026,-0.043738,1.0,0.003783,0.373954
Rainfalls (mm),0.497351,0.403527,-0.410242,-0.006534,0.021504,0.48836,0.302701,0.003783,1.0,-0.138894
rCSI,-0.110741,-0.152307,0.326135,0.383232,-0.078427,0.05877,0.096897,0.373954,-0.138894,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),0.0,0.002555,0.006818,0.025443,0.138807,0.012537,0.052772,0.023578,0.002502,0.02694
3 Months Anomaly Rainfalls (%),0.002555,0.0,0.007981,0.022077,0.170994,0.025992,0.092015,0.017292,0.005964,0.037579
Exchange rate,0.006818,0.007981,0.0,0.032618,0.21494,0.028743,0.098513,0.016911,0.000793,0.078601
FCG,0.025443,0.022077,0.032618,0.0,0.120213,0.04829,0.0877,0.025558,0.020914,0.024525
Fatalities,0.138807,0.170994,0.21494,0.120213,0.0,0.02624,0.050347,0.140283,0.067087,0.026827
NDVI,0.012537,0.025992,0.028743,0.04829,0.02624,0.0,0.129384,0.051293,0.019501,0.041801
NDVI Anomaly,0.052772,0.092015,0.098513,0.0877,0.050347,0.129384,0.0,0.104134,0.16016,0.060932
Price cereals and tubers,0.023578,0.017292,0.016911,0.025558,0.140283,0.051293,0.104134,0.0,0.05909,0.077533
Rainfalls (mm),0.002502,0.005964,0.000793,0.020914,0.067087,0.019501,0.16016,0.05909,0.0,0.025931
rCSI,0.02694,0.037579,0.078601,0.024525,0.026827,0.041801,0.060932,0.077533,0.025931,0.0
