Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions aggregate-raw-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ debug <- FALSE
data_dir <- file.path("raw-data")
output_dir <- file.path("data")
glotto_path <- file.path("mappings", "InventoryID-LanguageCodes.csv")
output_path <- file.path(output_dir, "phoible-by-phoneme.csv")
output_path_rdata <- file.path(output_dir, "phoible-by-phoneme.RData")
output_path <- file.path(output_dir, "phoible-nofeats.csv")
output_path_rdata <- file.path(output_dir, "phoible-nofeats.RData")
if (!dir.exists(output_dir)) dir.create(output_dir, mode="0755")

## LOAD EXTERNAL FUNCTIONS
Expand Down Expand Up @@ -232,30 +232,30 @@ if (!debug) rm(saphon_raw, saphon_ipa)
## ## ## ## ## ## ## ## ##

## combine into one data frame
data_sources_list <- list(ph_data, aa_data, spa_data, upsid_data,
ra_data, gm_data, saphon_data, uz_data, ea_data, er_data)
data_sources_list <- list(ph_data, aa_data, spa_data, upsid_data, ra_data,
gm_data, saphon_data, uz_data, ea_data, er_data)
all_data <- do.call(rbind, data_sources_list)
all_data <- all_data[with(all_data, order(LanguageCode, Source, InventoryID)),]
all_data <- all_data[order(all_data$InventoryID),]

## MERGE IN GLOTTOLOG CODES
glotto_mapping <- read.csv(glotto_path)
glotto_mapping <- glotto_mapping[c("InventoryID", "Glottocode")]
glotto_mapping <- glotto_mapping[c("InventoryID", "Glottocode", "ISO6393")]
all_data <- merge(all_data, glotto_mapping, all.x=TRUE)

## ADD GLYPH IDs
all_data$GlyphID <- get_codepoints(all_data$Phoneme)

## CONVERT INVENTORY ID TO INTEGER
all_data$InventoryID <- as.numeric(all_data$InventoryID)
all_data$InventoryID <- as.integer(all_data$InventoryID)

## SAVE
output_fields <- c("InventoryID", "Glottocode", "LanguageCode", "LanguageName",
output_fields <- c("InventoryID", "Glottocode", "ISO6393", "LanguageName",
"SpecificDialect", "GlyphID", "Phoneme", "Allophones",
"Marginal", "Source")
phoible <- all_data[output_fields]
write.csv(phoible, file=output_path, row.names=FALSE, quote=TRUE, eol="\n",
fileEncoding="UTF-8")
save(phoible, file=output_path_rdata)
"Marginal", "SegmentClass", "Source")
phoible_nofeats <- all_data[output_fields]
write.csv(phoible_nofeats, file=output_path, row.names=FALSE, quote=TRUE,
eol="\n", fileEncoding="UTF-8")
save(phoible_nofeats, file=output_path_rdata)
## WRITE LOG FILE
if(exists("unfamiliar_glyphs")) {
log_path <- file.path(output_dir, "unfamiliar-glyphs.csv")
Expand Down
Binary file removed data/phoible-by-phoneme.RData
Binary file not shown.
105,482 changes: 0 additions & 105,482 deletions data/phoible-by-phoneme.csv

This file was deleted.

Binary file added data/phoible.RData
Binary file not shown.
105,482 changes: 105,482 additions & 0 deletions data/phoible.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion mappings/InventoryID-LanguageCodes.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"InventoryID","LanguageCode","Glottocode","LanguageName","Source"
"InventoryID","ISO6393","Glottocode","LanguageName","Source"
1,"kor","kore1280","Korean","spa"
2,"ket","kett1243","Ket","spa"
3,"lbe","lakk1252","Lak","spa"
Expand Down
124 changes: 62 additions & 62 deletions raw-data/EA/EA_IPA_correspondences.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ aɪ aɪ NA TRUE
aɪ̃ aɪ̃ NA FALSE
a̰ɪ̰ a̰ɪ̰ NA FALSE
ãɪ̯ː ãɪ̯ː NA FALSE
aj aj NA TRUE
ãj ãj NA FALSE
aj ai̯ conventionalized TRUE
ãj ãi̯ conventionalized FALSE
ao ao NA TRUE
ãõ ãõ NA FALSE
aɵ̯ aɵ̯ NA FALSE
Expand All @@ -97,9 +97,9 @@ a̰ɯ̰ a̰ɯ̰ NA FALSE
aɰ aɯ guess TRUE
aʊ aʊ NA TRUE
aʊ̃ aʊ̃ NA FALSE
aw aw NA TRUE
ãw ãw NA FALSE
ãw̃ ãw̃ NA FALSE
aw au̯ conventionalized TRUE
ãw ãu̯ conventionalized FALSE
ãw̃ ãũ̯ conventionalized FALSE
aˤ aˤ NA TRUE
aˤː aːˤ NA TRUE
ɐ ɐ NA TRUE
Expand Down Expand Up @@ -280,8 +280,8 @@ eɪ eɪ NA TRUE
eɪ̃ eɪ̃ NA FALSE
ẽɪ̃ ẽɪ̃ NA FALSE
ḛɪ̰ ḛɪ̰ NA FALSE
ej ej NA TRUE
ẽj ẽj NA FALSE
ej ei̯ conventionalized TRUE
ẽj ẽi̯ conventionalized FALSE
eo eo NA TRUE
eu eu NA TRUE
eu̯ eu̯ NA TRUE
Expand All @@ -305,7 +305,7 @@ ew ue guess TRUE
əi̯ əi̯ NA FALSE
ə̃ĩ ə̃ĩ NA FALSE
əɨ əɨ NA FALSE
əj əj NA FALSE
əj əi̯ conventionalized FALSE
əo əo NA FALSE
əu əu NA TRUE
əũ əũ NA FALSE
Expand All @@ -314,7 +314,7 @@ ew ue guess TRUE
əɯ̃ əɯ̃ NA FALSE
ə̰ɯ̰ ə̰ɯ̰ NA FALSE
əʊ əʊ NA FALSE
əw əw NA FALSE
əw əu̯ conventionalized FALSE
əy əy NA TRUE
əˤ əˤ new segment type added FALSE
ɛ ɛ NA TRUE
Expand Down Expand Up @@ -358,7 +358,7 @@ ew ue guess TRUE
ɜː ɜː NA TRUE
ɜ̹ː ɜ̹ː NA FALSE
ɜi ɜi NA TRUE
ɜj ɜj NA FALSE
ɜj ɜi̯ conventionalized FALSE
ɞ ɞ NA TRUE
ɞ̜ ɞ̜ NA FALSE
ɞ̠ ɞ̠ NA FALSE
Expand Down Expand Up @@ -525,7 +525,7 @@ iɜ iɜ NA FALSE
ii ii NA FALSE
ii̯ ii̯ NA FALSE
iɪ̯ iɪ̯ NA FALSE
ij ij NA FALSE
ij ii̯ conventionalized FALSE
io io NA TRUE
iø iø NA FALSE
ĩõ ĩõ NA FALSE
Expand Down Expand Up @@ -577,33 +577,33 @@ iˤæ iˤæ NA FALSE
ɨɛ ɨɛ NA FALSE
ɨi ɨi NA TRUE
ɨi̯ ɨi̯ NA FALSE
ɨj ɨj NA FALSE
ɨj ɨi̯ conventionalized FALSE
j j NA TRUE
j̊ j̊ NA FALSE
j̃ j̃ NA TRUE
jː jː NA TRUE
ja ja NA TRUE
jaː jaː NA FALSE
jæ NA FALSE
jæi jæi NA FALSE
jai jai NA FALSE
jau jau NA FALSE
jau̯ jau̯ NA TRUE
je je NA TRUE
jẽ jẽ NA FALSE
jei̯ jei̯ NA FALSE
jɛ NA TRUE
jɛ̃ jɛ̃ NA FALSE
jɛi jɛi NA FALSE
jɛi̯ jɛi̯ NA FALSE
jo jo NA TRUE
joː joː NA FALSE
joi joi NA FALSE
jɔ NA TRUE
ju ju NA TRUE
juː juː NA FALSE
juaː juaː NA FALSE
jui jui NA FALSE
ja i̯a conventionalized TRUE
jaː i̯aː conventionalized FALSE
i̯æ conventionalized FALSE
jæi i̯æi conventionalized FALSE
jai i̯ai conventionalized FALSE
jau i̯au conventionalized FALSE
jau̯ i̯au̯ conventionalized TRUE
je i̯e conventionalized TRUE
jẽ i̯ẽ conventionalized FALSE
jei̯ i̯ei̯ conventionalized FALSE
i̯ɛ conventionalized TRUE
jɛ̃ i̯ɛ̃ conventionalized FALSE
jɛi i̯ɛi conventionalized FALSE
jɛi̯ i̯ɛi̯ conventionalized FALSE
jo i̯o conventionalized TRUE
joː i̯oː conventionalized FALSE
joi i̯oi conventionalized FALSE
i̯ɔ conventionalized TRUE
ju i̯u conventionalized TRUE
juː i̯uː conventionalized FALSE
juaː i̯uaː conventionalized FALSE
jui i̯ui conventionalized FALSE
ʝ ʝ NA TRUE
ʝː ʝː NA FALSE
ɟ ɟ NA TRUE
Expand Down Expand Up @@ -833,23 +833,23 @@ oe oe NA TRUE
œ̃ɛ̃ œ̃ɛ̃ NA FALSE
œ̞ɛ̞ œ̞ɛ̞ new segment type added; TODO: investigate FALSE
œi œi NA TRUE
œj œj NA FALSE
œj œi̯ conventionalized FALSE
œy œy NA TRUE
oə oə NA FALSE
øə øə NA FALSE
oi oi NA TRUE
oi̯ oi̯ NA TRUE
o̞i̯ o̞i̯ NA FALSE
øɪ̯ øɪ̯ NA FALSE
oj oj NA TRUE
õj õj NA FALSE
oj oi̯ conventionalized TRUE
õj õi̯ conventionalized FALSE
oɔ oɔ NA FALSE
ou ou NA TRUE
ou̯ ou̯ NA TRUE
øu øu NA FALSE
oʊ oʊ NA TRUE
oʊ̃ oʊ̃ NA FALSE
ow ow NA TRUE
ow ou̯ conventionalized TRUE
øy øy NA TRUE
øy̯ øy̯ NA FALSE
oˤ oˤ NA TRUE
Expand Down Expand Up @@ -889,7 +889,7 @@ oˤː oːˤ NA FALSE
ɔʊ ɔʊ NA FALSE
ɔʊ̯ ɔʊ̯ NA FALSE
ɔ̃ʊ̯̃ ɔ̃ʊ̯̃ NA FALSE
ɔw ɔw NA FALSE
ɔw ɔu̯ conventionalized FALSE
ɔy̯ː ɔy̯ː NA FALSE
ɔʏ ɔʏ NA FALSE
ɔˤː ɔːˤ NA FALSE
Expand Down Expand Up @@ -1238,7 +1238,7 @@ uɜ uɜ NA FALSE
ui ui NA TRUE
ui̯ ui̯ NA TRUE
ũĩ ũĩ NA TRUE
uj uj NA TRUE
uj ui̯ conventionalized TRUE
uo uo NA TRUE
uo̞ uo̞ NA FALSE
ũo ũo NA FALSE
Expand Down Expand Up @@ -1333,35 +1333,35 @@ vˤ vˤ NA FALSE
w w NA TRUE
w̜ w̜ NA FALSE
w̥ w̥ NA TRUE
wa wa NA TRUE
wã wã NA FALSE
waː waː NA FALSE
wæi wæi NA FALSE
wai wai NA FALSE
wa u̯a conventionalized TRUE
wã u̯ã conventionalized FALSE
waː u̯aː conventionalized FALSE
wæi u̯æi conventionalized FALSE
wai u̯ai conventionalized FALSE
ʷd ʷd NA FALSE
we we NA TRUE
wei̯ wei̯ NA TRUE
weu̯ weu̯ NA FALSE
wə NA TRUE
wə̃ wə̃ NA FALSE
NA FALSE
wɛ̃ wɛ̃ NA FALSE
wɛi wɛi NA FALSE
wɛi̯ wɛi̯ NA FALSE
we u̯e conventionalized TRUE
wei̯ u̯ei̯ conventionalized TRUE
weu̯ u̯eu̯ conventionalized FALSE
u̯ə conventionalized TRUE
wə̃ u̯ə̃ conventionalized FALSE
u̯ɛ NA FALSE
wɛ̃ u̯ɛ̃ conventionalized FALSE
wɛi u̯ɛi conventionalized FALSE
wɛi̯ u̯ɛi̯ conventionalized FALSE
wʰ wʰ NA FALSE
wi wi NA TRUE
wi u̯i NA TRUE
wʲ wʲ NA TRUE
w̜ʲ w̜ʲ NA FALSE
ʷɟ ʷɟ NA FALSE
ʷl ʷl NA FALSE
ʷm ʷm NA FALSE
wo wo NA TRUE
wõ wõ NA FALSE
woː woː NA FALSE
wõː wõː NA FALSE
wɔ NA FALSE
wɔ̃ wɔ̃ NA FALSE
wɔː wɔː NA FALSE
wo u̯o conventionalized TRUE
wõ u̯õ conventionalized FALSE
woː u̯oː conventionalized FALSE
wõː u̯õː conventionalized FALSE
u̯ɔ conventionalized FALSE
wɔ̃ u̯ɔ̃ conventionalized FALSE
wɔː u̯ɔː conventionalized FALSE
ʷr ʷr NA FALSE
ʷz ʷz NA FALSE
ʷʐ ʷʐ NA FALSE
Expand Down
Loading