Skip to content

Commit

Permalink
better organization
Browse files Browse the repository at this point in the history
prep for #47
  • Loading branch information
wibeasley committed Jun 21, 2018
1 parent db52f1d commit 4490893
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 35 deletions.
Binary file modified inst/doc/Faq.pdf
Binary file not shown.
Binary file modified inst/doc/NlsInvestigator.pdf
Binary file not shown.
Binary file modified inst/doc/NlsyAce.pdf
Binary file not shown.
93 changes: 58 additions & 35 deletions utility/incorporate-outside-datasets.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,66 @@
# This script isn't part of the build process. It should be executed infrequently, not for every build.
# Run it when there's a chance the extract data is different, or there's been a new version from NlsyLinksDetermination.
rm(list=ls(all=TRUE))

# knitr::stitch_rmd(script="./manipulation/te-ellis.R", output="./stitched-output/manipulation/te-ellis.md") # dir.create("./stitched-output/manipulation/", recursive=T)
# For a brief description of this file see the presentation at
# - slides: https://rawgit.com/wibeasley/RAnalysisSkeleton/master/documentation/time-and-effort-synthesis.html#/
# - code: https://github.com/wibeasley/RAnalysisSkeleton/blob/master/documentation/time-and-effort-synthesis.Rpres
rm(list=ls(all=TRUE)) #Clear the variables from previous runs.
if( any(search()=="package:NlsyLinks") ) detach("package:NlsyLinks") #So the lazy-loaded datasets aren't available

# ---- load-sources ------------------------------------------------------------
# if( any(.packages(all.available=TRUE) == "NlsyLinks") ) remove.packages("NlsyLinks") #system("R CMD REMOVE NlsyLinks") #This shouldn't be necessary.
library(magrittr)
library(RODBC)
# require(NlsyLinks) #Don't load it; the lazy-loaded datasets shouldn't be accessible.

###############################################################
### Declare Paths
###############################################################
requireNamespace("readr" )
requireNamespace("tidyr" )
requireNamespace("dplyr" )

# ---- declare-globals ---------------------------------------------------------
directoryDatasetsCsv <- "./outside-data" #These CSVs are in the repository, but not in the build.
directoryDatasetsRda <- "./data" #These RDAs are derived from the CSV, and included in the build as compressed binaries.
algorithmVersion <- 85L
algorithmVersion <- 85L

pathInputLinks <- file.path(directoryDatasetsCsv, paste0("links-2011-v" , algorithmVersion, ".csv"))
pathInputSubjectDetails <- file.path(directoryDatasetsCsv, paste0("subject-details-v", algorithmVersion, ".csv"))
pathInputSurveyDate <- file.path(directoryDatasetsCsv, paste0("survey-time.csv"))
pathInputExtraOutcomes79 <- file.path(directoryDatasetsCsv, "extra-outcomes-79.csv")

pathOutputExtraOutcomes <- file.path(directoryDatasetsRda, "ExtraOutcomes79.rda")
pathOutputLinkTrim <- file.path(directoryDatasetsRda, "Links79Pair.rda")
pathOutputLinkExpanded <- file.path(directoryDatasetsRda, "Links79PairExpanded.rda")
pathOutputSubjectDetails <- file.path(directoryDatasetsRda, "SubjectDetails79.rda")
pathOutputSurveyDate <- file.path(directoryDatasetsRda, "SurveyDate.rda")

# ---- ExtraOutcomes79 ---------------------------------------------------------
ExtraOutcomes79 <- read.csv(file.path(directoryDatasetsCsv, "extra-outcomes-79.csv"))
# ---- load-data ---------------------------------------------------------------
dsLinks79PairWithoutOutcomes <- read.csv(pathInputLinks , stringsAsFactors=FALSE)
ExtraOutcomes79 <- read.csv(pathInputExtraOutcomes79 , stringsAsFactors=TRUE )
SubjectDetails79 <- read.csv(pathInputSubjectDetails , stringsAsFactors=TRUE )
SurveyDate <- read.csv(pathInputSurveyDate , stringsAsFactors=FALSE)

# ---- tweak-data --------------------------------------------------------------

save(ExtraOutcomes79, file=pathOutputExtraOutcomes, compress="xz")
# ---- Groom ExtraOutcomes79 ---------------------------------------------------------
ExtraOutcomes79 <- ExtraOutcomes79 %>%
as.data.frame()

# ---- Links79PairExpanded and Links79Pair -------------------------------------
dsLinks79PairWithoutOutcomes <- pathInputLinks %>%
read.csv(stringsAsFactors=FALSE) %>%
# ---- Groom Links79PairExpanded and Links79Pair -------------------------------------
dsLinks79PairWithoutOutcomes <- dsLinks79PairWithoutOutcomes %>%
dplyr::select(-MultipleBirthIfSameSex, -RImplicitSubject, -RImplicitMother)

ExtraOutcomes79$SubjectTag <- NlsyLinks::CreateSubjectTag(subjectID=ExtraOutcomes79$SubjectID, generation=ExtraOutcomes79$Generation)
# colnames(dsLinks79PairWithoutOutcomes)
ExtraOutcomes79WithTags <- ExtraOutcomes79 %>%
dplyr::mutate(
SubjectTag = NlsyLinks::CreateSubjectTag(subjectID=SubjectID, generation=Generation)
)

remaining <- setdiff(colnames(dsLinks79PairWithoutOutcomes), c("SubjectTag_S1", "SubjectTag_S2"))
relationshipLabels <- c("Gen1Housemates","Gen2Siblings","Gen2Cousins","ParentChild", "AuntNiece")

Links79PairExpanded <- c("MathStandardized", "HeightZGenderAge") %>%
NlsyLinks::CreatePairLinksSingleEntered(
outcomeNames = .,
outcomeDataset = ExtraOutcomes79,
outcomeDataset = ExtraOutcomes79WithTags,
linksPairDataset = dsLinks79PairWithoutOutcomes,
linksNames = remaining
) %>%
Expand All @@ -51,25 +69,19 @@ Links79PairExpanded <- c("MathStandardized", "HeightZGenderAge") %>%
RelationshipPath = factor(RelationshipPath, levels=seq_along(relationshipLabels), labels=relationshipLabels),
IsMz = factor(IsMz , levels=c(0, 1, 255), labels=c("No", "Yes", "DoNotKnow")),
EverSharedHouse = as.logical(EverSharedHouse)

) %>%
dplyr::select(-RImplicitDifference) %>%
dplyr::arrange(ExtendedID, SubjectTag_S1, SubjectTag_S2)

dplyr::arrange(ExtendedID, SubjectTag_S1, SubjectTag_S2) %>%
as.data.frame()

### Prepare for rda
# multipleBirthLabels <- c("No", "Twin", "Triplet", "DoNotKnow")
# Links79PairExpanded$MultipleBirth <- factor(Links79PairExpanded$MultipleBirth, levels=c(0, 2, 3, 255), labels=multipleBirthLabels)

Links79Pair <- Links79PairExpanded %>%
dplyr::select(ExtendedID, SubjectTag_S1, SubjectTag_S2, R, RelationshipPath)

save(Links79Pair , file=pathOutputLinkTrim , compress="xz")
save(Links79PairExpanded, file=pathOutputLinkExpanded, compress="xz")

# ---- SubjectDetails ----------------------------------------------------------
SubjectDetails79 <- read.csv(pathInputSubjectDetails, stringsAsFactors=TRUE)
dplyr::select(ExtendedID, SubjectTag_S1, SubjectTag_S2, R, RelationshipPath) %>%
as.data.frame()

# ---- Groom SubjectDetails ----------------------------------------------------------
vectorOfTwins <- sort(unique(unlist(Links79PairExpanded[Links79PairExpanded$IsMz=="Yes", c("SubjectTag_S1", "SubjectTag_S2")])))

SubjectDetails79 <- SubjectDetails79 %>%
Expand All @@ -82,19 +94,30 @@ SubjectDetails79 <- SubjectDetails79 %>%
dplyr::select(
-IsDead, #This isn't finished yet.
-DeathDate #This isn't finished yet.
)

save(SubjectDetails79, file=pathOutputSubjectDetails, compress="xz")

# ---- SurveyDate --------------------------------------------------------------
SurveyDate <- read.csv(pathInputSurveyDate, stringsAsFactors=FALSE)
) %>%
as.data.frame()

# ---- Groom SurveyDate --------------------------------------------------------------
SurveyDate <- SurveyDate %>%
dplyr::mutate(
SurveySource = factor(SurveySource, levels=0:3, labels=c("NoInterview", "Gen1", "Gen2C", "Gen2YA")),
SurveyDate = as.Date(SurveyDate),
Age = ifelse(!is.na(AgeCalculateYears), AgeCalculateYears, AgeSelfReportYears)
) %>%
dplyr::arrange(SubjectTag, SurveySource, SurveyYear)

save(SurveyDate, file=pathOutputSurveyDate, compress="xz")
dplyr::arrange(SubjectTag, SurveySource, SurveyYear) %>%
as.data.frame()

# ---- verify-values -----------------------------------------------------------
# Sanity-check every dataset before anything is written to disk: each object
# must be a data.frame with at least 100 rows.  When the script is run top to
# bottom, a failed assertion raises an error, so the save calls below are not
# reached with an empty or malformed dataset.

checkmate::assert_data_frame(ExtraOutcomes79 , min.rows=100)
checkmate::assert_data_frame(Links79Pair , min.rows=100)
checkmate::assert_data_frame(Links79PairExpanded , min.rows=100)
checkmate::assert_data_frame(SubjectDetails79 , min.rows=100)
checkmate::assert_data_frame(SurveyDate , min.rows=100)

# ---- save-to-disk ------------------------------------------------------------
# Write each dataset to its own .rda file with xz compression.  The output paths
# are the pathOutput* variables built from directoryDatasetsRda in the
# declare-globals chunk.  Each file stores the object under its current variable
# name, which is the name it will have when the file is loaded.
save(ExtraOutcomes79 , file=pathOutputExtraOutcomes , compress="xz")
save(Links79Pair , file=pathOutputLinkTrim , compress="xz")
save(Links79PairExpanded , file=pathOutputLinkExpanded , compress="xz")
save(SubjectDetails79 , file=pathOutputSubjectDetails , compress="xz")
save(SurveyDate , file=pathOutputSurveyDate , compress="xz")

0 comments on commit 4490893

Please sign in to comment.