# Data preparation

This code transforms the data into an appropriate format for the Stan model. First, it aggregates and standardises the values of environmental variables and data on buckwheat production for each county in China, and saves them to a csv file. It also merges the environmental data with the spatial polygons for counties, to show the values on the plots. Then it calculates and saves the adjacency matrix for counties in China, to be used in the iCAR model.

In [None]:
# Load libraries:
library(rgdal) # working with spatial data
library(raster) # working with raster layers
library(data.table) # working with data frames
library(spdep) # calculating the adjacency matrix
require(geosphere) # calculating polygon area
library(here) # setting paths
library(viridis) # for colour schemes visualisation
library(RColorBrewer) # for colours on the plots
library(tmap) # for thematic mapping

In [2]:
### Input locations (all resolved relative to the project root via here())

# Masked crop production rasters for China (one sub-folder per crop)
path2prd <- here("data", "production", "masked")

# Present-day environmental variables (masked)
path2env <- here("data", "environmental", "present", "masked")

# Past environmental variables
path2past <- here("data", "environmental", "past")

# Administrative boundary shapefiles for China
path2counties <- here("raw_data", "CHN_adm")

# Table with the full names of the environmental variables
path2env_table <- here("outputs", "01_01_Predictor_variables.csv")

### Output locations

# Production tables by county (folder) and the combined production table
path2prd_table <- here("data", "production", "by_county")
path2prd_counties <- here("data", "Prd_by_county.csv")

# Environmental values by county and their standard deviations
path2env_counties <- here("data", "Env_by_county.csv")
path2env_sd <- here("data", "Sd_Env_by_county.csv")

# Maps of the environmental variables
path2envMaps <- here("outputs", "ESM_12a.tiff")
path2envMapsAE <- here("outputs", "ESM_12")

# Mean and SD of the environmental variables
path2stats <- here("outputs", "03_01_Env_Mean_and_SD.csv")

# Adjacency matrix of the counties
path2adjacency <- here("data", "adjacency.csv")

In [3]:
### Bring the shared tmap plotting theme into the session
source(file = here("R", "theme_tmaps.R"))

In [4]:
### Read the vector layers from the CHN_adm shapefile folder

# Layer "CHN_adm3": polygons used as counties in the rest of the analysis
china <- readOGR(
    dsn = path2counties,
    layer = "CHN_adm3"
)

# Layer "CHN_adm0": the outer border of China
outer_border <- readOGR(
    dsn = path2counties,
    layer = "CHN_adm0"
)

OGR data source with driver: ESRI Shapefile 
Source: "G:\My Drive\my_repositories\PhD\raw_data\CHN_adm", layer: "CHN_adm3"
with 2409 features
It has 13 fields
Integer64 fields read as strings:  ID_0 ID_1 ID_2 ID_3 
OGR data source with driver: ESRI Shapefile 
Source: "G:\My Drive\my_repositories\PhD\raw_data\CHN_adm", layer: "CHN_adm0"
with 1 features
It has 70 fields
Integer64 fields read as strings:  ID_0 OBJECTID_1 


In [5]:
# Load the production rasters of every crop into a list of RasterStacks.
# The list is in the same order as `crops`, so crop_data[[i]] holds the
# layers of crops[i].
crops <- c("wheat", "barley")
crop_data <- vector("list", length(crops))
for (i in seq_along(crops)) {
    # Each crop has its own sub-folder of GeoTIFF layers under path2prd
    path2prod <- file.path(path2prd, crops[i])

    # Get production dataset for China:
    prd_layers <- list.files(path = path2prod, pattern = "tif$", full.names = TRUE)

    # Fail early with a readable message rather than an obscure stack() error
    if (length(prd_layers) == 0) {
        stop("No files matching 'tif$' found in ", path2prod, call. = FALSE)
    }

    prd <- stack(prd_layers)
    crop_data[[i]] <- prd
}

### Get production data by county in the table

In [7]:
# Summarise the production rasters by county and write one CSV per crop into
# path2prd_table. Each output row corresponds to one polygon of `china`.

# Get county areas (in square metres, see the unit conversion below):
area <- areaPolygon(china)

# Make sure the output folder exists before starting the slow extraction
dir.create(path2prd_table, recursive = TRUE, showWarnings = FALSE)

# NOTE(review): only the first crop in `crops` is processed here; widen the
# range to seq_along(crops) if the remaining crops are needed as well.
for (i in 1:1) {
    print(crops[i])
    prd <- crop_data[[i]]

    # Data quality per county: the largest non-missing value of each
    # DataQuality layer (sort descending with NAs last, keep the first element)
    r.mean.dq <- extract(prd[[grep("DataQuality", names(prd))]], china,
                         fun = function(x, na.rm) {
                             return(sort(x, decreasing = TRUE, na.last = na.rm)[1])
                         },
                         na.rm = TRUE, df = TRUE)

    # Weighted mean of yield per hectare and harvested area fraction per county.
    # With weights = TRUE raster always computes a weighted mean; passing the
    # function `mean` rather than the string "mean" is meant to avoid the
    # '"fun" was changed to "mean"' warning without changing the result.
    r.mean.pr <- extract(prd[[paste(crops[i], c("YieldPerHectare", "HarvestedAreaFraction"), sep = "_")]],
                         china,
                         fun = mean, na.rm = TRUE, weights = TRUE,
                         normalizeWeights = TRUE, df = TRUE)

    # Cell values and (non-normalised) weights for harvested area and
    # production, returned as one element per county
    r.sum.pr <- extract(prd[[paste(crops[i], c("HarvestedAreaHectares", "Production"), sep = "_")]],
                        china, weights = TRUE, normalizeWeights = FALSE, na.rm = TRUE)

    # Sum the weighted totals of harvested area (hectares) and production
    # (tonnes) per county. `[[` returns NULL for a missing column, so a county
    # without any extracted cells yields totals of 0 instead of an
    # "undefined columns selected" error.
    area_col <- paste(crops[i], "HarvestedAreaHectares", sep = "_")
    prod_col <- paste(crops[i], "Production", sep = "_")
    r.sum.pr <- lapply(r.sum.pr, function(x) {
        x <- as.data.frame(x)
        data.frame(
            HarvestedAreaHectares = sum(x[["weight"]] * x[[area_col]], na.rm = TRUE),
            Production = sum(x[["weight"]] * x[[prod_col]], na.rm = TRUE)
        )
    })
    r.sum.pr <- rbindlist(r.sum.pr)

    # Bind all the extracted data into one table (the ID column of r.mean.pr
    # is dropped because r.mean.dq already provides it)
    r.values <- cbind(r.mean.dq, r.mean.pr[, -1], r.sum.pr, area)

    # Strip the crop prefix (e.g. "wheat_") from the column names
    colnames(r.values) <- gsub(paste0(crops[i], "_"), "", colnames(r.values))

    # Proportion of the county area which is cultivated: harvested area
    # (hectares) divided by county area (square metres), multiplied by 10 000
    # because one hectare is 10 000 square metres.
    r.values$ProportionAreaCultivated <- (r.values$HarvestedAreaHectares / r.values$area) * 10000

    # Average production in tonnes per hectare of county area
    r.values$ProductionPerHectare <- (r.values$Production / r.values$area) * 10000

    # Log-transform; the small offset keeps zero values finite
    r.values$logProportionAreaCultivated <- log(r.values$ProportionAreaCultivated + 1e-12)
    r.values$logProductionPerHectare <- log(r.values$ProductionPerHectare + 1e-12)

    # Write data to a file:
    write.csv(r.values,
              file.path(path2prd_table, paste0(crops[i], ".csv")),
              row.names = FALSE)
}

[1] "wheat"


""fun" was changed to "mean"; other functions cannot be used when "weights=TRUE""
