well structured data determining the “friendship” between people living in two geographic areas. The data repository is here. 

The task is to build a network for each country in the world, where the nodes are GADM/NUTS3 areas. Only countries with sufficient data coverage should be considered, and the task can be limited to the top 100 only (excluding the USA). 

The output consists of two files: one for the nodes *(nodeID,nodeLabel,latitude,longitude)* and one for the edge list *(nodeID_from,nodeID_to,country_name,country_ISO3)*. 

A simplified network analysis to compare the networks is expected.

In [1]:
library(terra)
library(geodata)
library(dplyr)
library(eurostat)
library(sf)
library(here)

terra 1.7.78


Attaching package: ‘dplyr’


The following objects are masked from ‘package:terra’:

    intersect, union


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE

here() starts at /home/lapo/Desktop/Uni



In [2]:
# Define some hyperparameters of the analysis
# Both flags are read inside the per-country export loop further down.
weight <- TRUE # If TRUE, the edge list gets a `weight` column holding scaled_sci
dropNodeCode <- FALSE # If TRUE, drop the nodeCode column from the node table

In [2]:
# Load the Meta (Facebook) Social Connectedness Index table: a tab-separated
# file with a header row.
# NOTE(review): this path sits under data/meta/task44/, whereas the other
# paths visible in this notebook sit under data/task_44/ - confirm it is
# the intended location.
path_data_fb <- "../../data/meta/task44/gadm1_nuts3_counties-gadm1_nuts3_counties - FB Social Connectedness Index - October 2021.tsv"
df <- read.table(path_data_fb, header = TRUE, sep = "\t")

In [3]:
# Preview the last rows of the raw SCI table
tail(df)

Unnamed: 0_level_0,user_loc,fr_loc,scaled_sci
Unnamed: 0_level_1,<chr>,<chr>,<int>
63824116,ZWE9,ZWE4,318504
63824117,ZWE9,ZWE5,305709
63824118,ZWE9,ZWE6,365478
63824119,ZWE9,ZWE7,715880
63824120,ZWE9,ZWE8,722214
63824121,ZWE9,ZWE9,15475362


In [5]:
# Drop every pair that involves a "USA" location on either side, to keep the
# table small enough for memory
touches_usa <- grepl("USA", df$user_loc) | grepl("USA", df$fr_loc)
df <- df[!touches_usa, ]

sprintf("Filtered out USA counties, df has %d rows now", nrow(df))

In [6]:
# Preview the first rows after the USA filter
head(df)

Unnamed: 0_level_0,user_loc,fr_loc,scaled_sci
Unnamed: 0_level_1,<chr>,<chr>,<int>
1,ABW,ABW,11264841
2,ABW,AGO1,38
3,ABW,AGO10,34
4,ABW,AGO11,32
5,ABW,AGO12,23
6,ABW,AGO13,37


In [5]:
# Load the lookup table that gives, for each area key, the level it is
# described at (comma-separated file with a header row)
levels <- read.table(
  "../../data/task_44/meta/gadm1_nuts3_counties_levels.csv",
  header = TRUE,
  sep = ","
)

tail(levels)

Unnamed: 0_level_0,key,level
Unnamed: 0_level_1,<chr>,<chr>
8003,LKA24_318,gadm2
8004,LKA24_319,gadm2
8005,LKA25_320,gadm2
8006,LKA25_321,gadm2
8007,LKA25_322,gadm2
8008,LKA25_323,gadm2


In [8]:
# Look up the ISO3 country code of a NUTS3 region.
#
# Args:
#   nuts3_code: a single NUTS3 region id, compared against nuts3_shapefile$id.
#   nuts3_shapefile: table of NUTS3 regions with `id` and `CNTR_CODE` columns
#     (in this notebook, the object returned by
#     eurostat::get_eurostat_geospatial()).
#
# Returns:
#   A length-one character vector: the ISO3 code converted from the region's
#   Eurostat country code, or NA_character_ when the id is not present in the
#   shapefile. Always returning exactly one value keeps sapply() over many
#   codes a plain character vector instead of silently becoming a list.
get_nuts3_country <- function(nuts3_code, nuts3_shapefile) {
  # which() ignores NA comparisons, so missing ids in the shapefile are skipped
  hit <- which(nuts3_shapefile$id == nuts3_code)
  if (length(hit) == 0L) {
    return(NA_character_)
  }

  # Use the first match only, so the result stays scalar
  countrycode::countrycode(
    nuts3_shapefile$CNTR_CODE[hit[1L]],
    origin = "eurostat",
    destination = "iso3c"
  )
}

In [9]:
# Default country code: the leading run of capital letters of each key
levels$country <- sub("^([A-Z]+).*", "\\1", levels$key)

# For NUTS3 rows, replace that default with the ISO3 code obtained through
# get_nuts3_country() and the Eurostat NUTS3 shapefile
nuts3_shapefile <- eurostat::get_eurostat_geospatial(output_class = "sf", nuts_level = 3, year = 2016)
is_nuts3 <- levels$level == "nuts3"
levels$country[is_nuts3] <- sapply(
  levels$key[is_nuts3],
  get_nuts3_country,
  nuts3_shapefile = nuts3_shapefile
)

# Free the temporaries (the shapefile is large)
rm(is_nuts3, nuts3_shapefile)

Extracting data from eurostat::eurostat_geodata_60_2016



In [10]:
# Annotate every pair with the level and country of the user side and the
# country of the friend side. Only `country` is taken from the second join,
# so `level` keeps its plain name and only the two country columns receive
# the _user / _fr suffixes.
df <- df |>
  left_join(levels, by = c("user_loc" = "key")) |>
  left_join(
    select(levels, key, country),
    by = c("fr_loc" = "key"),
    suffix = c("_user", "_fr")
  )

In [11]:
# Keep only within-country pairs whose (user-side) level is one of the three
# supported ones, then collapse the two country columns into a single one
df <- df |>
  filter(
    country_user == country_fr,
    level %in% c("gadm1", "gadm2", "nuts3")
  ) |>
  rename(country = country_fr) |>
  select(-c("country_user"))

In [12]:
# Build the lookup of the level(s) each country is described at, one row per
# (country, level) pair, ordered by country. distinct() is used instead of
# summarise(unique(level)) because summaries returning several rows per group
# are deprecated since dplyr 1.1.0.
country_levels <- df |>
  distinct(country, level) |>
  arrange(country) |>
  as_tibble()

# The export loop writes one pair of files per row of country_levels, named
# after the country only, so a country listed at several levels would have
# its files overwritten. Make that visible instead of letting it pass silently.
mixed_level <- unique(country_levels$country[duplicated(country_levels$country)])
if (length(mixed_level) > 0) {
  warning(
    "Countries described at more than one level: ",
    paste(mixed_level, collapse = ", "),
    call. = FALSE
  )
}
rm(mixed_level)

# Drop the level column from the df table
df <- df |>
  select(-c("level"))

tail(country_levels)

country,level
<chr>,<chr>
UZB,gadm1
VNM,gadm1
XKO,gadm1
ZAF,gadm1
ZMB,gadm1
ZWE,gadm1


In [13]:
# Preview the within-country pairs that remain after filtering
head(df)

Unnamed: 0_level_0,user_loc,fr_loc,scaled_sci,country
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>
1,AGO1,AGO1,13506847,AGO
2,AGO1,AGO10,333587,AGO
3,AGO1,AGO11,599759,AGO
4,AGO1,AGO12,304830,AGO
5,AGO1,AGO13,315700,AGO
6,AGO1,AGO14,376355,AGO


In [14]:
# Custom sorting functions
# Order GADM level-1 style codes (letters followed by a number, e.g. "AGO10")
# by their letters first and then by the number read as an integer, so that
# "AGO2" comes before "AGO10".
#
# Args:
#   vec: character vector of codes.
#
# Returns:
#   The same codes, reordered.
sort_gadm1 <- function(vec) {
  letters_only <- gsub("\\d", "", vec)
  digits_only <- gsub("\\D", "", vec)
  vec[order(letters_only, as.integer(digits_only))]
}

# Order GADM level-2 style codes (e.g. "LKA24_318") by the one-to-three digit
# number that follows the underscore, read as an integer.
#
# Args:
#   vec: character vector of codes.
#
# Returns:
#   The same codes, reordered.
sort_gadm2 <- function(vec) {
  suffix <- sub(".*_(\\d{1,3})$", "\\1", vec)
  vec[order(as.integer(suffix))]
}

In [15]:
# Preview the first rows of the country -> level lookup
head(country_levels)

country,level
<chr>,<chr>
AGO,gadm1
ALB,nuts3
ARE,gadm1
ARG,gadm1
ARM,gadm1
AUS,gadm1


In [18]:
# Get the shapefile for the NUTS3 regions
nuts3_shapefile <- eurostat::get_eurostat_geospatial(nuts_level = 3, year = 2016, output_class = "sf")

# Build and export one node table and one edge table per row of
# country_levels.
#   nodes: nodeCode (unless dropNodeCode), nodeID, name, longitude, latitude
#   edges: nodeID_from, nodeID_to, weight (if `weight`), country_name,
#          country_ISO3
# Files are written to ../../data/task_44/networks/<ISO3>_nodes.csv and
# <ISO3>_edges.csv.
for (i in seq_len(nrow(country_levels))) {
  # Get the country and the level and subset the df
  country <- country_levels$country[i]
  level <- country_levels$level[i]
  df_country <- df[df$country == country, ]

  # Every area that appears on either side of a pair is a node
  nodeCode <- unique(c(df_country$user_loc, df_country$fr_loc))

  ## GADM ##
  if (level %in% c("gadm1", "gadm2")) {
    # Sort the codes, fetch the GADM polygons and read the numeric id that is
    # embedded in each code
    if (level == "gadm1") {
      nodeCode <- sort_gadm1(nodeCode)
      gadm_data <- gadm(country, level = 1, path = "../../data/task_44/", version = "3.6")
      region_names <- gadm_data$NAME_1
      nodeID <- as.integer(gsub("\\D", "", nodeCode))
    } else {
      nodeCode <- sort_gadm2(nodeCode)
      gadm_data <- gadm(country, level = 2, path = "../../data/task_44/", version = "3.6")
      region_names <- gadm_data$NAME_2
      nodeID <- as.integer(sub(".*_(\\d{1,3})$", "\\1", nodeCode))
    }

    nodes_df <- data.frame(nodeCode = nodeCode)
    nodes_df$nodeID <- nodeID

    # NOTE(review): nodeID is used below as a row position into gadm_data,
    # which assumes the GADM 3.6 rows are ordered like the numeric part of
    # the SCI codes - confirm. Positions out of range yield NA.
    nodes_df$name <- region_names[nodeID]

    # Centroid coordinates of each region (column 1 = x, column 2 = y)
    coords <- terra::crds(terra::centroids(gadm_data))
    nodes_df$longitude <- coords[, 1][nodeID]
    nodes_df$latitude <- coords[, 2][nodeID]

    # A single country name for the whole edge table. Indexing NAME_0 by
    # nodeID produced a per-node vector that only fitted the edge table
    # through recycling and could contain NA.
    country_name <- gadm_data$NAME_0[1]
  }

  ## NUTS3 ##
  else if (level == "nuts3") {
    nodes_df <- data.frame(nodeCode = nodeCode)

    # NUTS3 codes are alphanumeric, so stripping the non-digits can give the
    # same number for different regions (e.g. "DE21A" and "DE21B" both give
    # 21). Number the nodes 1..n by row instead, so every region keeps its
    # own id.
    nodes_df$nodeID <- seq_len(nrow(nodes_df))

    # Extract the regions of the country, in node order
    # NOTE(review): codes missing from the 2016 shapefile give NA matches
    # here - confirm that every SCI NUTS3 code exists in that release.
    nuts3_regions <- nuts3_shapefile[match(nodeCode, nuts3_shapefile$id), ]

    # Attach the name
    nodes_df$name <- nuts3_regions$NAME_LATN

    # Extract and attach the centroid coordinates
    coords <- st_coordinates(st_centroid(nuts3_regions$geometry))
    nodes_df$longitude <- coords[, 1]
    nodes_df$latitude <- coords[, 2]

    country_name <- countrycode::countrycode(country, "iso3c", "country.name.en")
  }

  ## Unsupported level ##
  else {
    # Never export tables left over from a previous iteration
    warning(
      sprintf("Skipping %s: unsupported level '%s'", country, level),
      call. = FALSE
    )
    next
  }

  # Translate both endpoints of every pair to node ids with one vectorised
  # lookup each (every endpoint is in nodeCode by construction)
  edges_df <- data.frame(
    nodeID_from = nodes_df$nodeID[match(df_country$user_loc, nodes_df$nodeCode)],
    nodeID_to = nodes_df$nodeID[match(df_country$fr_loc, nodes_df$nodeCode)]
  )

  # Attach the scaled_sci, the country and the country ISO3
  if (weight) edges_df$weight <- df_country$scaled_sci
  edges_df$country_name <- country_name
  edges_df$country_ISO3 <- country

  # Drop nodeCode (assuming label = name)
  if (dropNodeCode) nodes_df$nodeCode <- NULL

  # Save the data
  write.csv(nodes_df, file = sprintf("../../data/task_44/networks/%s_nodes.csv", country), row.names = FALSE)
  write.csv(edges_df, file = sprintf("../../data/task_44/networks/%s_edges.csv", country), row.names = FALSE)
}

Extracting data from eurostat::eurostat_geodata_60_2016



AGOALBAREARGARMAUSAUTAZEBDIBELBENBFABGDBGRBHRBIHBLRBOLBRABWACAFCANCHECHLCIVCMRCODCOGCOLCRICYPCZEDEUDNKDOMDZAECUEGYERIESPESTETHFINFRAGABGBRGEOGHAGINGMBGNBGNQGRCGTMHKGHNDHRVHTIHUNIDNINDIRLISLITAJAMJORJPNKAZKENKGZKHMKORKWTLAOLBNLBRLBYLIELKALSOLTULUXLVAMARMDAMDGMEXMKDMLIMLTMMRMNEMNGMOZMRTMUSMWIMYSNAMNERNGANICNLDNORNPLNZLOMNPAKPANPERPHLPNGPOLPRTPRYQATROURWASAUSENSGPSLESLVSRBSVKSVNSWESWZTCDTGOTHATJKTLSTTOTUNTURTWNTZAUGAUKRURYUZBVNMXKOZAFZMBZWE