/
data prep.r
477 lines (385 loc) · 19.5 KB
/
data prep.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
##
## Initialisation variables, functions, etc.
##
library(tidyverse)
library(readxl)
library(data.table)
library(igraph)
library(futile.logger)
flog.info("Preparing data...")
##
## Directories used for loading inputs and saving outputs
##
data.dir     = "../../data/2016"  # where the input spreadsheets are read from
data.out.dir = "./data"           # where the processed CSV files are written
plots.dir    = "./plots"
results.dir  = "./results"
models.dir   = "./models"
# Make sure each output directory exists before anything tries to write to it.
# (The input directory `data.dir` is never created here.)
invisible(lapply(
  c(data.out.dir, plots.dir, results.dir, models.dir),
  function(out.path) {
    if (!dir.exists(out.path)) {
      dir.create(out.path)
    }
  }
))
##
## data processing parameters
##
license.owners.only = TRUE  # whether or not to output only license owners (siida shares)
# keep.30C: TRUE keeps the people in district 30C, FALSE (the default) removes them.
# A caller can override the default by defining `keep.30C = TRUE` (or the older
# name `keep.all.herders = TRUE`) *before* sourcing this file. Previously the flag
# was unconditionally reset to FALSE and the guard tested a different variable
# name, so an override could never take effect.
keep.30C = (exists("keep.30C") && isTRUE(keep.30C)) ||
  (exists("keep.all.herders") && isTRUE(keep.all.herders))
############################################################################
## Import all data
##
flog.info("Importing data files...")
herders = read_excel(file.path(data.dir, "Herders_anon.xlsx"), sheet = 1)
herd.sizes = read_excel(file.path(data.dir, "Herders_anon.xlsx"), sheet = "Reindeer - from NRK")
survey = read_excel(file.path(data.dir, "Surveys.xlsx"), sheet = "Survey") # interview data
net.maps = read_excel(file.path(data.dir, "Net-maps.xlsx"), sheet = "Net-maps - individuals")
sibs = read_excel(file.path(data.dir, "Surveys.xlsx"), sheet = "Siblings")
kids = read_excel(file.path(data.dir, "Surveys.xlsx"), sheet = "Children")
# remove person/people from district 30C (no net-maps or games data for them)
if (!keep.30C)
{
  people.to.remove = subset(herders, Distrikt=="30C")$HerderID
  herders = subset(herders, !(HerderID %in% people.to.remove))
  survey = subset(survey, !(HerderID %in% people.to.remove))
  # Drop a tie when *either* end is a removed person. The previous `|` only dropped
  # ties where both ends were removed, so ties to/from a removed person survived.
  # (this shouldn't cause any changes, given there are no net-maps for district 30C)
  net.maps = subset(net.maps, !(Ego %in% people.to.remove) & !(Alter %in% people.to.remove))
  # rows with a missing SibID are kept: `NA %in% x` is FALSE
  sibs = subset(sibs, !(SibID %in% people.to.remove))
  # NOTE(review): `kids` is not filtered here - confirm whether children linked to
  # removed herders should be dropped as well.
  rm(people.to.remove)
}
############################################################################
## Sort out variables
##
flog.info("Formatting data...")
##
## main herders table
##
# rename the two columns whose spreadsheet headers end in a question mark
setnames(herders,
         old = c("License?", "Interviewed?"),
         new = c("SiidaShareYN", "Interviewed."))
# drop columns to preserve anonymity
herders$Kode = NULL
herders$Etternavn = NULL
##
## some useful lists
##
# IDs for herders with siida shares in the district
siida.share.ids = dplyr::filter(herders, SiidaShareYN == 1)$HerderID
# people we interviewed (everyone in the survey table, including unlicensed herders
# but *not* people in district 30C when those have been removed)
interviewee_list = survey$HerderID
# people who drew net-maps
netmappers_list = unique(net.maps$Interviewee)
##
## should we keep siida shares only?
##
if (license.owners.only) {
  herders = dplyr::filter(herders, SiidaShareYN == 1)
  survey = dplyr::filter(survey, HerderID %in% siida.share.ids)
  # a tie is kept only when both ends hold a siida share
  net.maps = dplyr::filter(net.maps,
                           Ego %in% siida.share.ids,
                           Alter %in% siida.share.ids)
}
##
## interview data
##
# attach each participant's siida ID to their row of the survey table
survey = left_join(survey, dplyr::select(herders, HerderID, SiidaID), by = "HerderID")
# calculate age at the time of fieldwork
# survey$Age = 2016 - survey$BirthYear
# the share of subsistence taken from the herd becomes an ordinal scale,
# with levels running from "None" (lowest) to "All" (highest)
scale_subsistence = c("None", "Almost none", "Less than half", "About half", "More than half", "Almost all", "All")
survey$SubsistenceSpring = factor(survey$SubsistenceSpring, levels = scale_subsistence, ordered = TRUE)
survey$SubsistenceSummer = factor(survey$SubsistenceSummer, levels = scale_subsistence, ordered = TRUE)
survey$SubsistenceAutumn = factor(survey$SubsistenceAutumn, levels = scale_subsistence, ordered = TRUE)
survey$SubsistenceWinter = factor(survey$SubsistenceWinter, levels = scale_subsistence, ordered = TRUE)
# total subsistence score = sum of the four seasonal level codes
# (as.integer() on an ordered factor gives the position of the level)
survey$SubsistenceTotal = with(
  survey,
  as.integer(SubsistenceSpring) + as.integer(SubsistenceSummer) +
    as.integer(SubsistenceAutumn) + as.integer(SubsistenceWinter)
)
# carry the total subsistence score over to the herders table
herders = left_join(herders, dplyr::select(survey, HerderID, SubsistenceTotal), by = "HerderID")
##
## net-maps
##
# influence ties are discarded because they weren't a useful measure in the end
net.maps = net.maps %>% filter(Type != "Influence")
# gifts and kin ties each get their own dataframe
gifts = net.maps %>%
  filter(Type == "Gift") %>%
  transmute(Ego, Alter, GiftSize = as.numeric(as.character(Value)))  # gift value as a number
kin = net.maps %>%
  filter(Type == "Kin") %>%
  dplyr::select(Ego, Alter, r) %>%
  na.omit() %>%   # remove rows with NAs...
  distinct()      # ...and de-dupe
# NB: `kin` based on the net-maps doesn't include all kin we know about - they'll be added when we start accounting for sibs and kids
#     and this dataframe will get overwritten by a more complete edge list when we construct the social networks
# from here on `net.maps` holds only advice, help and shared items
net.maps = net.maps %>% filter(Type %in% c("Advice", "Help", "Items"))
# outbound connections only (the alter is not the interviewee);
# `Type` is re-made as a factor so that it only carries the levels actually present
## in this case, in-degree will be a measure of help received, as reported by others
## and out-degree will be the amount of self-reported help given to others
net.maps.o = net.maps %>%
  filter(Alter != Interviewee) %>%
  dplyr::select(Ego, Alter, Type) %>%
  mutate(Type = factor(Type))
# inbound connections only (the alter is the interviewee)
## in this case, in-degree will be a self-reported measure of help received
## and out-degree will be the amount of help given, as reported by others
net.maps.i = net.maps %>%
  filter(Alter == Interviewee) %>%
  dplyr::select(Ego, Alter, Type) %>%
  mutate(Type = factor(Type))
# same factor treatment for the full table
net.maps = net.maps %>% mutate(Type = factor(Type))
##
## clean up
##
rm(scale_subsistence)
############################################################################
## Get herd size data
##
# Standardise herd size in 2012 (subtract the mean, divide by the standard deviation)
# and rename the raw variable. scale() returns a one-column matrix carrying
# "scaled:center"/"scaled:scale" attributes; as.numeric() turns it into a plain
# vector so that the data frame does not end up with a matrix column.
herd.sizes$num.reindeer.z = as.numeric(scale(herd.sizes$`Rein 2012`, center = TRUE, scale = TRUE))
setnames(herd.sizes, "Rein 2012", "num.reindeer")
# all.x = TRUE keeps every herder, including those without a herd size record
herders = merge(herders,
                subset(herd.sizes, select=c(HerderID, num.reindeer, num.reindeer.z)),
                by="HerderID", all.x=TRUE)
############################################################################
## Calculate complete set of siblings for each person (if they have them)
##
# assign ID numbers to sibs (continuing on from participant IDs)
## first, swap 'HerderID' and 'SibID' column names
## (afterwards 'SibID' is the person we interviewed and 'HerderID' is their sibling)
names(sibs)[names(sibs)=="HerderID"] = "SibID2"
names(sibs)[names(sibs)=="SibID"] = "HerderID"
names(sibs)[names(sibs)=="SibID2"] = "SibID"
sibs = subset(sibs, SibName != "(deceased)" | is.na(SibName)) # remove anyone deceased (but explicitly keep people we don't know names for)
## assign a new ID number to each sib based on their 'tmpName' (a unique string based on sex, birth year and no. kids)
## (this is the best we can do to not generate duplicate IDs where sibs without ID numbers are related to more than one interviewee -- e.g. in the case of herders 23 and 24)
# Start numbering above every ID already in use. `herders` may have been cut down to
# license owners by now, so its maximum alone could sit below an ID that a sibling
# is already linked to; including the IDs in `sibs` avoids handing out a duplicate.
largest_id = max(c(herders$HerderID, sibs$HerderID, sibs$SibID), na.rm=TRUE) + 1
tmpNames = unique( subset(sibs, nchar(tmpName) > 5)$tmpName ) # make sure these names have more than five characters (most information)
# seq(length.out=) yields an empty sequence when there are no names
# (seq(from, to = from - 1) would count backwards and return two values)
sib.ids = data.frame(tmpName = tmpNames, tmpID = seq(from=largest_id, length.out=length(tmpNames)))
## assign a new ID number to each sib based on their tmpNames...
sibs = merge(sibs, sib.ids, by="tmpName", all.x=TRUE)
#... if we didn't assign an ID based on 'tmpName', give them a new, sequential ID
largest_id = largest_id + length(tmpNames) # first ID not used by `sib.ids` (also correct when `sib.ids` is empty)
sibs$tmpID[ is.na(sibs$tmpID) ] = seq(from=largest_id, length.out=sum(is.na(sibs$tmpID)))
##... and only use the generated ID if we haven't already linked the sib to someone in our original list of herders
sibs$HerderID = ifelse(is.na(sibs$HerderID), sibs$tmpID, sibs$HerderID)
sibs = as.data.table( subset(sibs, select=c(SibID, HerderID, Sex, BirthYear, NumSons, NumDaughters)) )
# each sibling 'HerderID' related to the person we interviewed 'SibID' is also sib with the others related to the interviewee
# get all combinations of sib relationships
# code adapted from: http://stackoverflow.com/a/30312324
# (self-join on the interviewee, then drop the pairs of a sibling with themselves)
sibs2 = left_join(
  dplyr::select(sibs, SibID, HerderID1 = HerderID),
  dplyr::select(sibs, SibID, HerderID2 = HerderID),
  by = "SibID"
) %>%
  filter(HerderID1 != HerderID2) %>%
  dplyr::select(SibID=HerderID1, HerderID=HerderID2)
sibs2 = as.data.table(sibs2)
setkey(sibs2, SibID, HerderID)
sibs2 = unique(sibs2)
# merge in covariates for the alter in the sib relationship (in this case, 'HerderID')
sibs.s = unique( subset(sibs, HerderID %in% sibs2$HerderID, select=-c(SibID)) ) # covariates to merge
setkey(sibs.s, HerderID)
setkey(sibs2, HerderID)
sibs2 = sibs.s[sibs2] # keyed join: one row per row of `sibs2`, with the covariates from `sibs.s` attached
# de-dupe
setkey(sibs2, SibID, HerderID)
sibs2 = unique(sibs2)
setcolorder(sibs2, c("SibID", "HerderID", "Sex", "BirthYear", "NumSons", "NumDaughters")) # make sure columns in right order before binding
# now need to add the people we interviewed (currently: sibs$SibID) as the alter in the sib relationship
## first, swap ego/alter columns
sibs3 = copy(sibs)
sibs3 = sibs3 %>% dplyr::select(SibID=HerderID, HerderID=SibID)
## get covars from survey table
sibs.s = as.data.table( subset(survey, select=c("HerderID", "Sex", "BirthYear", "NumSons", "NumDaughters")) )
## merge
setkey(sibs3, HerderID)
setkey(sibs.s, HerderID)
sibs3 = sibs.s[sibs3]
setcolorder(sibs3, c("SibID", "HerderID", "Sex", "BirthYear", "NumSons", "NumDaughters")) # make sure columns in right order before binding
# merge all sib combinations
sibs = rbindlist(list(sibs, sibs2, sibs3)) # append newly expanded list of sibs to main list of sibs
# de-dupe
setkey(sibs, SibID, HerderID)
sibs = unique(sibs)
rm(sibs.s, sibs2, sibs3, sib.ids, largest_id, tmpNames)
############################################################################
## Assign new ID numbers to kids if they don't already have them
##
# assign ID numbers to kids (continuing on from participant IDs)
# set participant as parent for their kids and assign ID numbers to kids
names(kids)[names(kids)=="HerderID"] = "ParentID"
names(kids)[names(kids)=="ChildID"] = "HerderID"
# Start numbering above every ID already in use (herders, sibs, and any parent/child
# IDs already present in `kids`), ignoring missing values, so that a generated ID can
# never duplicate an existing one.
largest_id = max(c(sibs$HerderID, sibs$SibID, herders$HerderID, kids$HerderID, kids$ParentID), na.rm=TRUE) + 1
# one candidate ID per row; seq(length.out=) is empty when `kids` has no rows
# (seq(from, to = from - 1) would count backwards and return two values)
kids$tmpID = seq(from=largest_id, length.out=nrow(kids))
##... but only use new ID if we haven't already linked the kid to someone in our original list of herders
kids$HerderID = ifelse(is.na(kids$HerderID), kids$tmpID, kids$HerderID)
############################################################################
## Make relatedness matrix (as edge list)
##
flog.info("Making relatedness matrix...")
sibs_sub = subset(sibs, select=c("HerderID", "SibID"))
kids_sub = subset(kids, select=c("HerderID", "ParentID"))
names(sibs_sub) = c("Ego", "Alter")
names(kids_sub) = c("Ego", "Alter")
herders.r = rbind(sibs_sub, kids_sub)
herders.r$r = 0.5 # relatedness of sibs and kids
# add kin from netmaps
herders.r = rbind(herders.r, kin)
# convert to a data table and de-dupe
herders.r = as.data.table(herders.r)
setkey(herders.r, Ego, Alter)
herders.r = na.omit(herders.r) # remove incomplete
herders.r = unique(herders.r) # remove duplicates
herders.r = subset(herders.r, Ego!=Alter) # (just in case)
##
## some dyads only have one entry - but each ego-alter pair should also appear as alter-ego - should be symmetric - so add missing dyads
##
# first, assign a dyad ID to each entry: "<smaller ID>_<larger ID>", the same for both directions.
# The separator matters: gluing the IDs together without one makes different dyads
# indistinguishable (1 & 23 and 12 & 3 would both become "123"), and converting the
# glued string to an integer can overflow to NA. The ID is therefore kept as a string.
herders.r[, DyadID := paste(pmin(Ego, Alter), pmax(Ego, Alter), sep="_")]
# count no. times each dyad appears
herders.r.sum = herders.r %>% group_by(DyadID) %>% summarise(n_appearances=length(DyadID))
single.dyads = herders.r.sum$DyadID[ herders.r.sum$n_appearances<2 ] # list of ego-alter pairs without corresponding alter-ego (i.e. appears only once)
herders.r.new = herders.r[DyadID %in% single.dyads] # get entries that only appear once
setnames(herders.r.new, c("Ego", "Alter"), c("Alter", "Ego")) # swap ego and alter column names
setcolorder(herders.r.new, names(herders.r)) # put the columns back in the same order as `herders.r` before binding
herders.r = rbind(herders.r, herders.r.new) # append new dyads onto end of relatedness table
#### DEBUG - are any dyads repeated? ##
#herders.r.sum = ddply(herders.r, .(DyadID), summarise, n_appearances=length(DyadID))
#herders.r.sum[ herders.r.sum$n_appearances>2, ] # do any dyads appear more than twice (too many entries)
####
# remove DyadID - no longer needed
herders.r$DyadID = NULL
# clean up
rm(sibs_sub, kids_sub, largest_id, herders.r.sum, herders.r.new, single.dyads)
############################################################################
## Load dyadic data (or create if file doesn't exist)
##
flog.info("Creating dyadic data...")
# NOTE(review): `herders.wide`, which is used further down, is not defined in this
# file - presumably it is created by the script sourced here; confirm.
source("create dyadic data.r")
############################################################################
## Create social networks
##
flog.info("Creating social networks...")
##
## First, a bit of data wrangling
##
# `Type` becomes `LinkType` in all three net-map tables, for compatibility with Gephi
net.maps   = dplyr::rename(net.maps,   LinkType = Type)
net.maps.o = dplyr::rename(net.maps.o, LinkType = Type)
net.maps.i = dplyr::rename(net.maps.i, LinkType = Type)
# dummy variable (1/0) in `herders` flagging people who appear in the net-maps as ego or alter
# (these are the subset we should be doing SNA with)
herders$NamedInNetmap = as.numeric(herders$HerderID %in% union(net.maps$Ego, net.maps$Alter))
# one row per Ego-Alter pair, rather than one row per pair and help type
# (this would have the same effect as using igraph::simplify())
coop.net   = distinct(net.maps,   Ego, Alter)
coop.net.o = distinct(net.maps.o, Ego, Alter)
coop.net.i = distinct(net.maps.i, Ego, Alter)
# anyone whose siida membership is unknown is put into one arbitrary extra siida
# (numbered one above the largest known siida ID)
# this has no effect if we're only working with siida shares, since we know every license owner's siida
herders$SiidaID = replace(herders$SiidaID,
                          is.na(herders$SiidaID),
                          max(herders$SiidaID, na.rm=T) + 1)
##
## create networks
##
# All graphs are directed and share the same vertex table (`herders`), so every herder
# is a vertex in every graph even if they have no ties of that kind.
# graph_from_data_frame() is the current name of the deprecated graph.data.frame().
# siida membership
siida.mem = subset( herders.wide, SameSiida==1, select=c(Ego, Alter) )
g.siida = graph_from_data_frame(siida.mem, vertices=herders, directed=TRUE)
rm(siida.mem)
# relatives
kin = subset( herders.wide, r>0, select=c(Ego, Alter, r) ) # this overwrites the previous `kin` df that was created from the net maps (and, thus, was incomplete)
kin$Weight = kin$r
g.kin = graph_from_data_frame(kin, vertices=herders, directed=TRUE)
# gifts (the `gifts` dataframe was created directly from the netmap, towards the beginning of this file)
gifts$Weight = gifts$GiftSize
g.gifts = graph_from_data_frame(gifts, vertices=herders, directed=TRUE)
##
## net-maps with interviewees' outbound connections only
##
# cooperation network
g.netmap.o = graph_from_data_frame(coop.net.o, vertices=herders, directed=TRUE)
# make a cooperation network containing only people we interviewed and people they named
# g.netmap.o.sub = graph_from_data_frame(coop.net.o,
#                                        vertices = subset(herders, NamedInNetmap==1),
#                                        directed = TRUE)
##
## net-maps with interviewees' inbound connections only
##
# cooperation network
g.netmap.i = graph_from_data_frame(coop.net.i, vertices=herders, directed=TRUE)
# make a cooperation network containing only people we interviewed and people they named
# g.netmap.i.sub = graph_from_data_frame(coop.net.i,
#                                        vertices = subset(herders, NamedInNetmap==1),
#                                        directed = TRUE)
##
## net-maps with everything
##
g.all = graph_from_data_frame(coop.net, vertices = herders, directed = TRUE)
# g.all.sub = graph_from_data_frame(coop.net, vertices = subset(herders, NamedInNetmap==1), directed = TRUE)
##
## make graphs of each type of net-map (self-reported outbound connections only)
##
net.maps.advice = subset(net.maps.o, LinkType=="Advice", select=c(Ego, Alter))
net.maps.help = subset(net.maps.o, LinkType=="Help", select=c(Ego, Alter))
net.maps.sharing = subset(net.maps.o, LinkType=="Items", select=c(Ego, Alter))
# simplify() is namespace-qualified because purrr (attached via tidyverse) also exports
# a function called `simplify`; which one a bare call reaches depends on attach order
g.advice = igraph::simplify( graph_from_data_frame( net.maps.advice, vertices=herders, directed=TRUE) )
g.help = igraph::simplify( graph_from_data_frame( net.maps.help, vertices=herders, directed=TRUE) )
g.sharing = igraph::simplify( graph_from_data_frame( net.maps.sharing, vertices=herders, directed=TRUE) )
# don't need these anymore
rm(coop.net.i, coop.net.o)
rm(net.maps.i, net.maps.o)
#######################################################################################
## Calculate centrality measures (in/out degrees)
##
flog.info("Calculating network statistics...")
## there are four measures of degree in the cooperative network:
##
## - cooperation given from ego to alter, as reported by ego (`herders$CoopGiven.SelfReport` which is out-degree in `g.netmap.o`)
## - cooperation given from ego to alter, as reported by alter (`herders$CoopGiven.OtherReport` which is out-degree in `g.netmap.i`)
## - cooperation received by ego from alter, as reported by ego (`herders$CoopReceived.SelfReport` which is in-degree in `g.netmap.i`)
## - cooperation received by ego from alter, as reported by alter (`herders$CoopReceived.OthersReport` which is in-degree in `g.netmap.o`)
##
# degree() returns a vector named by vertex, so each herder's value is looked up
# by their ID written as a string
herder.keys = as.character(herders$HerderID)
# outbound links: in-degree = cooperation received (reported by others); out-degree = cooperation given (reported by self)
herders$CoopReceived.OthersReport = degree(g.netmap.o, mode="in")[ herder.keys ]
herders$CoopGiven.SelfReport      = degree(g.netmap.o, mode="out")[ herder.keys ]
# inbound links: in-degree = cooperation received (reported by self); out-degree = cooperation given (reported by others)
herders$CoopReceived.SelfReport = degree(g.netmap.i, mode="in")[ herder.keys ]
herders$CoopGiven.OtherReport   = degree(g.netmap.i, mode="out")[ herder.keys ]
# complete cooperation network
herders$coop.deg.in  = degree(g.all, mode="in")[ herder.keys ]
herders$coop.deg.out = degree(g.all, mode="out")[ herder.keys ]
# no. gifts received, plus a binary variable for whether/not they received any gift
herders$NumGifts  = degree(g.gifts, mode="in")[ herder.keys ]
herders$gifts.bin = ifelse(herders$NumGifts > 0, 1, 0)
# no. gifts given
herders$NumGiftsGiven = degree(g.gifts, mode="out")[ herder.keys ]
rm(herder.keys)
#######################################################################################
## tidy up and save data
##
# `herders` is cut down to the variables used in these analyses
herders = dplyr::select(
  herders,
  HerderID, SiidaID, Interviewed., SiidaShareYN, NamedInNetmap, num.reindeer, SubsistenceTotal,
  NumGifts, CoopGiven.OtherReport, CoopGiven.SelfReport, coop.deg.out
)
# `net.maps` keeps the run of columns from `Interviewee` through `LinkType`
net.maps = dplyr::select(net.maps, Interviewee:LinkType)
# remove parameters and intermediate tables from the environment
rm(license.owners.only, keep.30C,
   sibs, kids, herd.sizes, herders.r, kin,
   survey)
# write the processed tables to the output directory
write_csv(herders,      file.path(data.out.dir, "herders.csv"))
write_csv(herders.wide, file.path(data.out.dir, "herders-dyadic-wide.csv"))
write_csv(gifts,        file.path(data.out.dir, "gifts.csv"))
write_csv(net.maps,     file.path(data.out.dir, "net-maps.csv"))
flog.info("Finished preparing data")