# Spearman correlation

In this notebook, we compute the Spearman correlations between the time-series of the administrative regions of Syria. Through this analysis, we can identify time-series with high correlations between them and then decide to exclude some for subsequent analyzes.

In [1]:
import dataframe_image as dfi
import pandas as pd
import numpy as np

In [2]:
COUNTRY = "Syria"

In [3]:
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/" + COUNTRY + "/"

## Time-series dataset

In [4]:
# Load the time-series data (daily interpolation).
df = pd.read_csv(PATH_TO_DATA_FOLDER + COUNTRY + "-day.csv", header = [0, 1], index_col = 0)
df.index.name = "Datetime"
df.index = pd.to_datetime(df.index)
freq = "D"
df.index.freq = freq

In [5]:
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG <= 2', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI >= 19'],
      dtype='object', name='Indicator')

In [6]:
# Consider only some indicators for the analysis with Spearman correlation.
INDICATORS_TO_CONSIDER = ["FCG <= 2", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)", "NDVI", "Price cereals and tubers", "Exchange rate", "Fatalities", "NDVI Anomaly", "rCSI >= 19", "Rainfalls (mm)"]
df = pd.concat([df.loc[:, (slice(None), feature)] for feature in INDICATORS_TO_CONSIDER], axis = 1).sort_index(axis = 1)
df

AdminStrata,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,...,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-29,99.920932,103.281473,0.017435,36.83872,111.0,0.117052,89.349382,0.057275,0.1372,34.05440,...,113.829315,179.577637,0.016755,12.46537,24.0,0.445703,114.296330,0.146942,7.9716,27.31994
2018-09-30,99.894768,102.581411,0.017595,36.83872,101.0,0.116985,89.326130,0.057331,0.1500,34.05440,...,113.631431,173.128816,0.016900,12.94964,24.0,0.445190,114.196334,0.149815,8.1647,26.22302
2018-10-01,99.936859,102.137474,0.017870,36.49929,99.0,0.116915,89.301813,0.057651,0.7876,33.67614,...,113.462981,166.727201,0.017122,12.45675,24.0,0.444679,114.087915,0.152405,10.0496,25.22491
2018-10-02,100.047045,101.810225,0.017985,36.37038,97.0,0.116842,89.275320,0.057914,1.4245,31.54574,...,113.241732,160.712897,0.017198,12.18369,24.0,0.444170,113.971913,0.152121,11.5057,27.92877
2018-10-03,100.225326,101.599662,0.018099,38.25699,85.0,0.116765,89.246651,0.058178,2.0614,32.76720,...,112.967684,155.085903,0.017275,12.72342,24.0,0.443665,113.848328,0.151837,12.9618,27.08270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-31,100.000000,101.956171,0.783499,43.16290,75.0,0.150221,108.031057,0.528965,0.0000,51.48633,...,100.553289,110.028605,0.812500,48.10094,0.0,0.467886,116.644360,0.560945,0.2717,40.19883
2020-09-01,100.000129,101.497542,0.785452,44.98382,75.0,0.150224,107.919379,0.529923,0.0006,54.04531,...,100.589511,110.620376,0.813951,47.55924,0.0,0.467101,116.634847,0.562415,0.4032,39.50237
2020-09-02,100.000387,101.139241,0.787405,44.30129,75.0,0.150232,107.818228,0.530881,0.0012,53.22101,...,100.622048,111.154853,0.815402,46.92650,0.0,0.466332,116.621343,0.563884,0.5347,40.91314
2020-09-03,100.000774,100.881269,0.789357,48.09989,70.0,0.150245,107.727603,0.531840,0.0018,50.27144,...,100.650898,111.632037,0.816853,45.82597,0.0,0.465578,116.603849,0.565354,0.6662,43.88251


## Correlations

### Nature indicators

In [7]:
select = df.columns.get_level_values(1).isin(["NDVI", "NDVI Anomaly", "Rainfall (mm)", "1 Month Anomaly Rainfalls (%)", "3 Months Anomaly Rainfalls (%)"])
df_nature = df.loc[:, select]
df_nature.head()

AdminStrata,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Aleppo,Aleppo,Aleppo,Aleppo,Ar-Raqqa,Ar-Raqqa,...,Lattakia,Lattakia,Rural Damascus,Rural Damascus,Rural Damascus,Rural Damascus,Tartous,Tartous,Tartous,Tartous
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-29,99.920932,103.281473,0.117052,89.349382,105.993454,133.452545,0.1436,91.675148,100.049355,102.916087,...,0.515435,111.358296,101.030439,102.970369,0.112772,99.005201,113.829315,179.577637,0.445703,114.29633
2018-09-30,99.894768,102.581411,0.116985,89.32613,106.154101,130.698066,0.143704,91.706311,100.060323,102.367125,...,0.515319,111.310457,101.259426,102.765071,0.112783,99.025906,113.631431,173.128816,0.44519,114.196334
2018-10-01,99.936859,102.137474,0.116915,89.301813,106.310695,128.03035,0.143805,91.738026,100.169167,102.077634,...,0.515204,111.256994,101.513572,102.746637,0.112794,99.046382,113.462981,166.727201,0.444679,114.087915
2018-10-02,100.047045,101.810225,0.116842,89.27532,106.443678,125.559649,0.1439,91.76878,100.37589,101.924694,...,0.515084,111.19752,101.792879,102.780557,0.112806,99.066249,113.241732,160.712897,0.44417,113.971913
2018-10-03,100.225326,101.599662,0.116765,89.246651,106.55305,123.285963,0.143991,91.798573,100.680489,101.908306,...,0.514961,111.132034,102.097346,102.86683,0.112819,99.085507,112.967684,155.085903,0.443665,113.848328


In [8]:
# Compute the correlation between the nature time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices_nature = list()

def correlation_matrices_nature(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman")#.values
    corr_matrices_nature.append(mtrx)

df_nature.groupby(level = 0, axis = 1).apply(correlation_matrices_nature);

In [9]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_nature_mean = pd.DataFrame(np.mean(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
corr = CORR_nature_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/nature_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.75601,0.190659,0.144123
3 Months Anomaly Rainfalls (%),0.75601,1.0,0.215173,0.210666
NDVI,0.190659,0.215173,1.0,0.691345
NDVI Anomaly,0.144123,0.210666,0.691345,1.0


In [10]:
# We take the median of the various correlation matrices (administrative regions).
CORR_nature_median = pd.DataFrame(np.median(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),1.0,0.767088,0.200285,0.213015
3 Months Anomaly Rainfalls (%),0.767088,1.0,0.247094,0.325149
NDVI,0.200285,0.247094,1.0,0.825174
NDVI Anomaly,0.213015,0.325149,0.825174,1.0


In [11]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_nature_variance = pd.DataFrame(np.var(np.stack(corr_matrices_nature, axis = 0), axis = 0), index = df_nature.columns.droplevel().unique(), columns = df_nature.columns.droplevel().unique())
CORR_nature_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),NDVI,NDVI Anomaly
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Month Anomaly Rainfalls (%),0.0,0.002679,0.013437,0.054271
3 Months Anomaly Rainfalls (%),0.002679,0.0,0.0283,0.095595
NDVI,0.013437,0.0283,0.0,0.137471
NDVI Anomaly,0.054271,0.095595,0.137471,0.0


### All indicators

In [12]:
# Compute the correlation between all the time-series for the various administrative regions: we then obtain a list of correlation matrices equal to the number of administrative regions.
corr_matrices = list()

def correlation_matrices(group):
    group.columns = group.columns.droplevel()
    mtrx = group.corr(method = "spearman").values
    corr_matrices.append(mtrx)

df.groupby(level = 0, axis = 1).apply(correlation_matrices);

In [13]:
# We take the mean of the various correlation matrices (administrative regions).
CORR_mean = pd.DataFrame(np.mean(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
corr = CORR_mean.style.background_gradient(cmap = "coolwarm")
corr.export_png("./output_images/all_indicators_%s.png" % COUNTRY)
corr

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.75601,-0.616058,-0.170544,-0.001892,0.190659,0.144123,-0.190142,0.50867,-0.12998
3 Months Anomaly Rainfalls (%),0.75601,1.0,-0.730165,-0.350434,-0.032792,0.215173,0.210666,-0.294662,0.382799,-0.229949
Exchange rate,-0.616058,-0.730165,1.0,0.486792,-0.034793,0.070309,-0.004868,0.656326,-0.408378,0.360272
FCG <= 2,-0.170544,-0.350434,0.486792,1.0,-0.089088,-0.013077,-0.159828,0.482574,0.043979,0.431848
Fatalities,-0.001892,-0.032792,-0.034793,-0.089088,1.0,0.036721,0.109677,-0.003165,0.000393,-0.153029
NDVI,0.190659,0.215173,0.070309,-0.013077,0.036721,1.0,0.691345,0.437165,0.444881,0.133819
NDVI Anomaly,0.144123,0.210666,-0.004868,-0.159828,0.109677,0.691345,1.0,0.222248,0.107918,0.07128
Price cereals and tubers,-0.190142,-0.294662,0.656326,0.482574,-0.003165,0.437165,0.222248,1.0,-0.074647,0.41245
Rainfalls (mm),0.50867,0.382799,-0.408378,0.043979,0.000393,0.444881,0.107918,-0.074647,1.0,-0.108734
rCSI >= 19,-0.12998,-0.229949,0.360272,0.431848,-0.153029,0.133819,0.07128,0.41245,-0.108734,1.0


In [14]:
# We take the median of the various correlation matrices (administrative regions).
CORR_median = pd.DataFrame(np.median(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_median.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),1.0,0.767088,-0.614447,-0.168846,0.098099,0.200285,0.213015,-0.205009,0.500959,-0.166268
3 Months Anomaly Rainfalls (%),0.767088,1.0,-0.781296,-0.313687,0.013115,0.247094,0.325149,-0.252221,0.404846,-0.195606
Exchange rate,-0.614447,-0.781296,1.0,0.552407,-0.055538,0.002578,-0.105129,0.599698,-0.407534,0.395789
FCG <= 2,-0.168846,-0.313687,0.552407,1.0,-0.099597,-0.028014,-0.169158,0.566977,0.003544,0.390342
Fatalities,0.098099,0.013115,-0.055538,-0.099597,1.0,0.004681,0.11941,0.036721,0.017924,-0.044018
NDVI,0.200285,0.247094,0.002578,-0.028014,0.004681,1.0,0.825174,0.455105,0.4442,0.077364
NDVI Anomaly,0.213015,0.325149,-0.105129,-0.169158,0.11941,0.825174,1.0,0.058156,0.263506,0.202142
Price cereals and tubers,-0.205009,-0.252221,0.599698,0.566977,0.036721,0.455105,0.058156,1.0,-0.006641,0.468931
Rainfalls (mm),0.500959,0.404846,-0.407534,0.003544,0.017924,0.4442,0.263506,-0.006641,1.0,-0.164523
rCSI >= 19,-0.166268,-0.195606,0.395789,0.390342,-0.044018,0.077364,0.202142,0.468931,-0.164523,1.0


In [15]:
# We take the variance of the various correlation matrices (administrative regions).
CORR_variance = pd.DataFrame(np.var(np.stack(corr_matrices, axis = 0), axis = 0), index = df.columns.droplevel().unique(), columns = df.columns.droplevel().unique())
CORR_variance.style.background_gradient(cmap = "coolwarm")

Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG <= 2,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI >= 19
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Month Anomaly Rainfalls (%),0.0,0.002679,0.007295,0.02312,0.104255,0.013437,0.054271,0.031484,0.002567,0.026558
3 Months Anomaly Rainfalls (%),0.002679,0.0,0.008591,0.02072,0.13319,0.0283,0.095595,0.028089,0.006051,0.039388
Exchange rate,0.007295,0.008591,0.0,0.03431,0.179055,0.026391,0.099949,0.0329,0.00041,0.079027
FCG <= 2,0.02312,0.02072,0.03431,0.0,0.118204,0.046336,0.08714,0.032961,0.0213,0.02577
Fatalities,0.104255,0.13319,0.179055,0.118204,0.0,0.030677,0.054642,0.102218,0.047411,0.03075
NDVI,0.013437,0.0283,0.026391,0.046336,0.030677,0.0,0.137471,0.065623,0.019485,0.042365
NDVI Anomaly,0.054271,0.095595,0.099949,0.08714,0.054642,0.137471,0.0,0.112672,0.167924,0.063761
Price cereals and tubers,0.031484,0.028089,0.0329,0.032961,0.102218,0.065623,0.112672,0.0,0.062475,0.06786
Rainfalls (mm),0.002567,0.006051,0.00041,0.0213,0.047411,0.019485,0.167924,0.062475,0.0,0.025524
rCSI >= 19,0.026558,0.039388,0.079027,0.02577,0.03075,0.042365,0.063761,0.06786,0.025524,0.0
