# Editing the AAChange column
Author: Vanessa Pitz

# Overview
0. Load input
1. Change layout
2. Add helper columns
3. Merge back together
4. Checks and fixes
5. Write files

Layout input file:

| MarkerName     | Gene   | AAChange                                                                                                                                                                                                                                                                                                                                                                                                 |
|----------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| chr1:20638104  | PINK1  | PINK1:NM_032409:exon2:c.C650C:p.A217A                                                                                                                                                                                                                                                                                                                                                                    |
| chr22:38112571 | PLA2G6 | PLA2G6:NM_001004426:exon15:c.G2047G:p.D683D,PLA2G6:NM_001199562:exon15:c.G2047G:p.D683D,PLA2G6:NM_001349865:exon15:c.G2047G:p.D683D,PLA2G6:NM_001349866:exon15:c.G2047G:p.D683D,PLA2G6:NM_001349868:exon15:c.G1531G:p.D511D,PLA2G6:NM_001349864:exon16:c.G2209G:p.D737D,PLA2G6:NM_001349869:exon16:c.G1513G:p.D505D,PLA2G6:NM_003560:exon16:c.G2209G:p.D737D,PLA2G6:NM_001349867:exon17:c.G1675G:p.D559D |


And yes, this script exists to clean up the mess in the AAChange column, which is generated when annotating variants. We only want to focus on the protein level changes, i.e. “p.xx”

# 0. Start by reading in the file

In [None]:
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(stringr)

# Read the annotated rare-variant table; the first line of the file holds the column names.
data <- fread("834_23andMe_rare_variants_annotated_fullfile.txt", header = TRUE)
# Build the MarkerName key ("<Chr>:<Start>") by merging the Chr and Start columns.
data1 <- data %>% unite("MarkerName", c("Chr", "Start"), sep = ":")

In [8]:
# Column renaming is currently switched off (kept for reference):
#data1 = data1 %>% rename("Gene" = Gene.refGene, "AAChange" =AAChange.refGene)

# Keep only the three columns this clean-up works on and, in the same step,
# collapse the wrong gene label "FBXO7;FBXO7" to "FBXO7".
variants <- data1 %>%
  dplyr::select(MarkerName, Gene, AAChange) %>%
  dplyr::mutate(Gene = replace(Gene, Gene %in% "FBXO7;FBXO7", "FBXO7"))

MarkerName,Gene,AAChange
<chr>,<chr>,<chr>
chr1:155235024,GBA,"GBA:NM_001171811:exon10:c.A1321G:p.I441V,GBA:NM_001171812:exon10:c.A1435G:p.I479V,GBA:NM_000157:exon11:c.A1582G:p.I528V,GBA:NM_001005741:exon12:c.A1582G:p.I528V,GBA:NM_001005742:exon12:c.A1582G:p.I528V"
chr1:155235057,GBA,"GBA:NM_001171811:exon10:c.G1288A:p.G430S,GBA:NM_001171812:exon10:c.G1402A:p.G468S,GBA:NM_000157:exon11:c.G1549A:p.G517S,GBA:NM_001005741:exon12:c.G1549A:p.G517S,GBA:NM_001005742:exon12:c.G1549A:p.G517S"
chr1:155235195,GBA,"GBA:NM_001171811:exon9:c.G1244A:p.R415H,GBA:NM_001171812:exon9:c.G1358A:p.R453H,GBA:NM_000157:exon10:c.G1505A:p.R502H,GBA:NM_001005741:exon11:c.G1505A:p.R502H,GBA:NM_001005742:exon11:c.G1505A:p.R502H"
chr1:155235196,GBA,"GBA:NM_001171811:exon9:c.C1243T:p.R415C,GBA:NM_001171812:exon9:c.C1357T:p.R453C,GBA:NM_000157:exon10:c.C1504T:p.R502C,GBA:NM_001005741:exon11:c.C1504T:p.R502C,GBA:NM_001005742:exon11:c.C1504T:p.R502C"
chr1:155235197,GBA,"GBA:NM_001171811:exon9:c.C1242G:p.N414K,GBA:NM_001171812:exon9:c.C1356G:p.N452K,GBA:NM_000157:exon10:c.C1503G:p.N501K,GBA:NM_001005741:exon11:c.C1503G:p.N501K,GBA:NM_001005742:exon11:c.C1503G:p.N501K"
chr1:155235231,GBA,"GBA:NM_001171811:exon9:c.A1208G:p.H403R,GBA:NM_001171812:exon9:c.A1322G:p.H441R,GBA:NM_000157:exon10:c.A1469G:p.H490R,GBA:NM_001005741:exon11:c.A1469G:p.H490R,GBA:NM_001005742:exon11:c.A1469G:p.H490R"


## 1. Change AAChange layout - wide to long

### 1.1 Add a gene count, so we know how many records of that gene we have

In [9]:
# Attach n = number of rows (records) sharing the same Gene to every row.
variants <- variants %>%
  group_by(Gene) %>%
  mutate(n = dplyr::n()) %>%
  ungroup()

### 1.2 Split the very long AAChange contents at each “,” and reshape from wide to long format: now we have multiple rows per MarkerName

In [10]:
# One row per comma-separated AAChange entry; the other columns are repeated.
variants_sep <- separate_rows(variants, AAChange, sep = ",")
head(variants_sep)

MarkerName,Gene,AAChange,n
<chr>,<chr>,<chr>,<int>
chr1:155235024,GBA,GBA:NM_001171811:exon10:c.A1321G:p.I441V,103
chr1:155235024,GBA,GBA:NM_001171812:exon10:c.A1435G:p.I479V,103
chr1:155235024,GBA,GBA:NM_000157:exon11:c.A1582G:p.I528V,103
chr1:155235024,GBA,GBA:NM_001005741:exon12:c.A1582G:p.I528V,103
chr1:155235024,GBA,GBA:NM_001005742:exon12:c.A1582G:p.I528V,103
chr1:155235057,GBA,GBA:NM_001171811:exon10:c.G1288A:p.G430S,103


### 1.3 Separate AAChange and only keep columns you need

In [11]:
# Split every entry into its ":"-separated fields. Entries with fewer than five
# fields raise an "Expected 5 pieces. Missing pieces filled with `NA`" warning
# and get NA in the missing columns.
variants_sep <- tidyr::separate(
  variants_sep,
  AAChange,
  into = c("Gene2", "NM", "exon", "c", "p"),
  sep = ":"
)

# From here on only the transcript ID (NM), the protein change (p) and the
# per-gene record count (n) are needed next to MarkerName and Gene.
variants_sep <- dplyr::select(variants_sep, MarkerName, Gene, NM, p, n)

head(variants_sep)

“Expected 5 pieces. Missing pieces filled with `NA` in 19 rows [434, 435, 470, 534, 617, 662, 699, 708, 737, 758, 795, 908, 927, 954, 961, 1109, 1724, 1914, 2256].”


MarkerName,Gene,NM,p,n
<chr>,<chr>,<chr>,<chr>,<int>
chr1:155235024,GBA,NM_001171811,p.I441V,103
chr1:155235024,GBA,NM_001171812,p.I479V,103
chr1:155235024,GBA,NM_000157,p.I528V,103
chr1:155235024,GBA,NM_001005741,p.I528V,103
chr1:155235024,GBA,NM_001005742,p.I528V,103
chr1:155235057,GBA,NM_001171811,p.G430S,103


In [12]:
# Little beauty correction: strip the literal "p." prefix ("p.I441V" -> "I441V").
# The pattern is anchored to the start and the dot is escaped, because an
# unanchored "p." is a regular expression meaning "p followed by ANY character"
# and would also delete such a pair elsewhere in the string. NA stays NA.
variants_sep$p <- sub("^p\\.", "", variants_sep$p)

## 2. Add helper columns

### 2.1 Identify NM with the highest count per gene

In [13]:
# Number of rows per Gene/NM combination, ordered by gene.
countNM <- variants_sep %>%
  group_by(Gene, NM) %>%
  tally(name = "Gene_NM_count") %>%
  arrange(Gene)

head(countNM)

Gene,NM,Gene_NM_count
<chr>,<chr>,<int>
ATP13A2,NM_001141973,35
ATP13A2,NM_001141974,39
ATP13A2,NM_022089,35
ATP13A2,,1
DNAJC13,NM_001329126,41
DNAJC13,NM_015268,41


Identify the highest count
Simply slicing the top row per gene would keep only one transcript, but several NMs can share the highest count and we don’t want to drop them at this point. So it’s a bit more complicated: we first extract only the highest count value per gene, and in the next steps join it back onto the per-NM counts to keep every NM that reaches it.

In [14]:
# Highest Gene/NM row count per gene (one row per gene: Gene, n).
topNM <- variants_sep %>%
  group_by(Gene, NM) %>%
  tally() %>%
  summarise(n = max(n), .groups = "drop")

head(topNM)

Gene,n
<chr>,<int>
ATP13A2,39
DNAJC13,41
DNAJC6,16
EIF4G1,41
FBXO7,11
GBA,101


In [15]:
# Add the highest count:
# attach each gene's highest transcript count (n, from topNM) to every Gene/NM
# row of countNM, so the next step can filter for Gene_NM_count == n.
# The join key is spelled out so the join cannot silently switch to other
# shared column names.
fulljoin <- full_join(countNM, topNM, by = "Gene")
head(fulljoin)

[1m[22mJoining, by = "Gene"


Gene,NM,Gene_NM_count,n
<chr>,<chr>,<int>,<int>
ATP13A2,NM_001141973,35,39
ATP13A2,NM_001141974,39,39
ATP13A2,NM_022089,35,39
ATP13A2,,1,39
DNAJC13,NM_001329126,41,41
DNAJC13,NM_015268,41,41


In [16]:
# Flag the NM_x to keep: every transcript whose count equals the gene's highest
# count. Ties are all retained, so a gene can have several rows here.
fulljoin <- fulljoin %>%
  filter(Gene_NM_count == n) %>%
  mutate(Keep = "Yes") %>%
  select(Gene, NM, Keep)

head(fulljoin)

Gene,NM,Keep
<chr>,<chr>,<chr>
ATP13A2,NM_001141974,Yes
DNAJC13,NM_001329126,Yes
DNAJC13,NM_015268,Yes
DNAJC6,NM_001256864,Yes
DNAJC6,NM_001256865,Yes
DNAJC6,NM_014787,Yes


In [17]:
# Reduce the flagged transcripts to a single one per gene: split the accession
# at "_" into prefix and number, keep the row that which.min() selects on the
# number part, then glue the accession back together.
fulljoin <- fulljoin %>%
  separate(NM, c("NM", "NM_number"), sep = "_") %>%
  group_by(Gene) %>%
  dplyr::slice(which.min(NM_number)) %>%
  arrange(NM_number) %>%
  ungroup() %>%
  unite(NM, c("NM", "NM_number"), sep = "_")

## 3. Join those helper files with original file

In [18]:
# Keep only the rows of variants_sep that belong to the selected transcript of
# their gene, carrying the Keep flag along. An inner join does this directly:
# there are no NA rows to filter out afterwards, and no unmatched helper rows
# can sneak in with an NA MarkerName. The keys are stated explicitly.
fulljoin2 <- inner_join(variants_sep, fulljoin, by = c("Gene", "NM"))

head(fulljoin2)

[1m[22mJoining, by = c("Gene", "NM")


MarkerName,Gene,NM,p,n,Keep
<chr>,<chr>,<chr>,<chr>,<int>,<chr>
chr1:155235024,GBA,NM_000157,I528V,103,Yes
chr1:155235057,GBA,NM_000157,G517S,103,Yes
chr1:155235195,GBA,NM_000157,R502H,103,Yes
chr1:155235196,GBA,NM_000157,R502C,103,Yes
chr1:155235197,GBA,NM_000157,N501K,103,Yes
chr1:155235231,GBA,NM_000157,H490R,103,Yes


In [19]:
# Write this into a file, in case we end up with multiple AAchanges and need to trace back.
# TRUE/FALSE are spelled out because T/F are ordinary variables that can be reassigned.
write.table(
  fulljoin2,
  "NM_transcript_list.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [20]:
# Build VariantName as "<Gene>_<p>" while keeping the Gene column itself.
# Identical MarkerName/Gene/VariantName rows are collapsed at the end.
fulljoin2 <- fulljoin2 %>%
  tidyr::unite("VariantName", c("Gene", "p"), sep = "_", remove = FALSE) %>%
  select(MarkerName, Gene, VariantName) %>%
  distinct()

Concatenate p.s when there are multiple per MarkerName

In [22]:
# Collapse to one row per MarkerName; several VariantNames are joined with ", ".
# The summary already has exactly one row per MarkerName, so no distinct() is
# needed on it.
fulljoin2 <- fulljoin2 %>%
  group_by(MarkerName) %>%
  dplyr::summarise(VariantName = paste(VariantName, collapse = ", "))

# Add VariantName to the full annotation table. The key is given explicitly so
# the join cannot pick up other shared column names by accident; markers
# without a match get NA as VariantName.
toadd <- fulljoin2 %>% dplyr::select(MarkerName, VariantName)
leftjoin <- left_join(data1, toadd, by = "MarkerName") %>%
  distinct()

head(leftjoin)

[1m[22mJoining, by = "MarkerName"


MarkerName,End,Ref,Alt,Func.refGene,Gene,GeneDetail.refGene,ExonicFunc.refGene,AAChange,avsnp150,⋯,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,VariantName
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr1:155235024,155235024,T,C,exonic,GBA,.,nonsynonymous SNV,"GBA:NM_001171811:exon10:c.A1321G:p.I441V,GBA:NM_001171812:exon10:c.A1435G:p.I479V,GBA:NM_000157:exon11:c.A1582G:p.I528V,GBA:NM_001005741:exon12:c.A1582G:p.I528V,GBA:NM_001005742:exon12:c.A1582G:p.I528V",rs536425950,⋯,.,.,.,.,.,.,.,.,.,GBA_I528V
chr1:155235057,155235057,C,T,exonic,GBA,.,nonsynonymous SNV,"GBA:NM_001171811:exon10:c.G1288A:p.G430S,GBA:NM_001171812:exon10:c.G1402A:p.G468S,GBA:NM_000157:exon11:c.G1549A:p.G517S,GBA:NM_001005741:exon12:c.G1549A:p.G517S,GBA:NM_001005742:exon12:c.G1549A:p.G517S",rs121908301,⋯,.,.,.,.,.,.,.,.,.,GBA_G517S
chr1:155235195,155235195,C,T,exonic,GBA,.,nonsynonymous SNV,"GBA:NM_001171811:exon9:c.G1244A:p.R415H,GBA:NM_001171812:exon9:c.G1358A:p.R453H,GBA:NM_000157:exon10:c.G1505A:p.R502H,GBA:NM_001005741:exon11:c.G1505A:p.R502H,GBA:NM_001005742:exon11:c.G1505A:p.R502H",rs80356772,⋯,.,.,.,.,.,.,.,.,.,GBA_R502H
chr1:155235196,155235196,G,A,exonic,GBA,.,nonsynonymous SNV,"GBA:NM_001171811:exon9:c.C1243T:p.R415C,GBA:NM_001171812:exon9:c.C1357T:p.R453C,GBA:NM_000157:exon10:c.C1504T:p.R502C,GBA:NM_001005741:exon11:c.C1504T:p.R502C,GBA:NM_001005742:exon11:c.C1504T:p.R502C",rs80356771,⋯,0,0,0,0,0,0.0001,.,.,.,GBA_R502C
chr1:155235197,155235197,G,C,exonic,GBA,.,nonsynonymous SNV,"GBA:NM_001171811:exon9:c.C1242G:p.N414K,GBA:NM_001171812:exon9:c.C1356G:p.N452K,GBA:NM_000157:exon10:c.C1503G:p.N501K,GBA:NM_001005741:exon11:c.C1503G:p.N501K,GBA:NM_001005742:exon11:c.C1503G:p.N501K",.,⋯,.,.,.,.,.,.,.,.,.,GBA_N501K
chr1:155235231,155235231,T,C,exonic,GBA,.,nonsynonymous SNV,"GBA:NM_001171811:exon9:c.A1208G:p.H403R,GBA:NM_001171812:exon9:c.A1322G:p.H441R,GBA:NM_000157:exon10:c.A1469G:p.H490R,GBA:NM_001005741:exon11:c.A1469G:p.H490R,GBA:NM_001005742:exon11:c.A1469G:p.H490R",rs76071730,⋯,.,.,.,.,.,.,.,.,.,GBA_H490R


Some edits still required, where there's more than just one variant name, e.g. here with DNAJC6

## 4. Checks

In [23]:
# MarkerNames that did not receive a VariantName, and how many there are.
empty <- leftjoin %>%
  filter(is.na(VariantName)) %>%
  select(MarkerName)
dim(empty)

In [26]:
# Pull the full annotation rows for the markers without a VariantName.
merge_empty <- merge(x = empty, y = data1)
dim(merge_empty)

In [27]:
# Show the first rows of the markers that have no VariantName.
head(merge_empty)

MarkerName,End,Ref,Alt,Func.refGene,Gene,GeneDetail.refGene,ExonicFunc.refGene,AAChange,avsnp150,⋯,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr12:40232385,40232385,T,C,splicing,LRRK2,NM_198578:exon3:c.347+2T>C,.,.,rs141221000,⋯,0.0012,0,0,0,0,0,0.0035,0.0030,.,0.0031
chr15:61875847,61875847,T,A,splicing,VPS13C,NM_020821:exon76:c.10225-2A>T;NM_017684:exon74:c.10096-2A>T;NM_018080:exon74:c.10096-2A>T;NM_001018088:exon76:c.10225-2A>T,.,.,rs780867411,⋯,.,.,.,.,.,.,.,.,.,.
chr15:61922396,61922396,C,T,splicing,VPS13C,NM_020821:exon54:c.6975+1G>A;NM_017684:exon52:c.6846+1G>A;NM_018080:exon52:c.6846+1G>A;NM_001018088:exon54:c.6975+1G>A,.,.,rs781120650,⋯,0,0,0.0001,0,0,0,0.0002,0.0001,.,0.0004
chr15:61969298,61969298,C,A,splicing,VPS13C,NM_020821:exon28:c.2911+1G>T;NM_017684:exon26:c.2782+1G>T;NM_018080:exon26:c.2782+1G>T;NM_001018088:exon28:c.2911+1G>T,.,.,rs766792785,⋯,.,.,.,.,.,.,.,.,.,.
chr15:61983818,61983818,A,G,splicing,VPS13C,NM_020821:exon20:c.1914+2T>C;NM_017684:exon18:c.1785+2T>C;NM_018080:exon18:c.1785+2T>C;NM_001018088:exon20:c.1914+2T>C,.,.,rs775829630,⋯,.,.,.,.,.,.,.,.,.,.
chr15:61991096,61991096,T,C,splicing,VPS13C,NM_020821:exon18:c.1484-2A>G;NM_017684:exon16:c.1355-2A>G;NM_018080:exon16:c.1355-2A>G;NM_001018088:exon18:c.1484-2A>G,.,.,rs143639809,⋯,0,0,0.0021,0.0119,0,0.0019,0.0026,0.0021,.,0.0040


In [30]:
# Markers without a VariantName although AAChange is not "." : list them, then count them.
filter(merge_empty, AAChange != ".")
tally(filter(merge_empty, AAChange != "."))
# seems like some were missed out but that's okay, just do those manually

MarkerName,End,Ref,Alt,Func.refGene,Gene,GeneDetail.refGene,ExonicFunc.refGene,AAChange,avsnp150,⋯,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr1:16986246,16986246,G,A,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon29:c.C3503T:p.P1168L,ATP13A2:NM_022089:exon29:c.C3518T:p.P1173L",rs372995036,⋯,0.0,0.0,0.0001,0.0003,0.0,0.0,0.0002,7.369e-05,.,.
chr1:16986291,16986291,C,T,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon29:c.G3458A:p.R1153H,ATP13A2:NM_022089:exon29:c.G3473A:p.R1158H",rs544885605,⋯,0.0024,0.0,6.497e-05,0.0,0.0,0.0,0.0025,0.0036,.,.
chr1:16986321,16986321,C,T,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon29:c.G3428A:p.R1143H,ATP13A2:NM_022089:exon29:c.G3443A:p.R1148H",rs533548757,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0005,0.0003,.,.
chr1:16986554,16986554,G,A,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon28:c.C3299T:p.P1100L,ATP13A2:NM_022089:exon28:c.C3314T:p.P1105L",rs201756175,⋯,0.0,0.0,6.502e-05,0.0,0.0103,0.0018,0.0001,.,.,0.0004
chr20:5100832,5100832,G,A,exonic,TMEM230,.,nonsynonymous SNV,"TMEM230:NM_001009925:exon4:c.C322T:p.R108C,TMEM230:NM_001330985:exon4:c.C322T:p.R108C,TMEM230:NM_001330986:exon4:c.C322T:p.R108C,TMEM230:NM_014145:exon4:c.C322T:p.R108C,TMEM230:NM_001009923:exon5:c.C511T:p.R171C,TMEM230:NM_001009924:exon5:c.C322T:p.R108C,TMEM230:NM_001330984:exon5:c.C322T:p.R108C",rs143571424,⋯,0.0,0.0,0.0031,0.0092,0.0,0.0055,0.0033,0.0034,.,0.0049
chr20:5109435,5109435,C,T,exonic,TMEM230,.,nonsynonymous SNV,TMEM230:NM_001009923:exon3:c.G185A:p.R62H,rs149904653,⋯,0.0,0.0,6.48e-05,0.0,0.0,0.0,9.014e-05,7.341e-05,.,.
chr20:5111588,5111588,G,A,exonic,TMEM230,.,nonsynonymous SNV,TMEM230:NM_001009923:exon2:c.C86T:p.S29L,rs191127079,⋯,0.0,0.0,0.0,0.0,0.0,0.0027,0.007,0.0070,.,0.0074
chr22:32475369,32475369,G,A,exonic,FBXO7,.,nonsynonymous SNV,FBXO7:NM_001033024:exon1:c.G8A:p.R3Q,rs199636063,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0074,0.0044,.,0.0066
chr22:32475378,32475378,G,A,exonic,FBXO7,.,nonsynonymous SNV,FBXO7:NM_001033024:exon1:c.G17A:p.G6E,rs9621461,⋯,0.1038,0.0418,0.1036,0.1094,0.0862,0.0919,0.1049,0.1065,.,0.1027
chr22:38143150,38143150,G,A,exonic,PLA2G6,.,nonsynonymous SNV,PLA2G6:NM_001349868:exon4:c.C74T:p.P25L,rs185396488,⋯,0.0035,0.0,0.0001,0.0,0.0,0.0,0.0025,0.0054,.,0.0081


n
<int>
11


In [31]:
# Manual VariantNames for the markers the automatic assignment left empty,
# written as a MarkerName -> VariantName lookup table.
manual_names <- c(
  "chr1:16986246"  = "ATP13A2_P1173L",
  "chr1:16986291"  = "ATP13A2_R1158H",
  "chr1:16986321"  = "ATP13A2_R1148H",
  "chr1:16986554"  = "ATP13A2_P1105L",
  "chr20:5100832"  = "TMEM230_R108C",
  "chr20:5109435"  = "TMEM230_R62H",
  "chr20:5111588"  = "TMEM230_S29L",
  "chr22:32475369" = "FBXO7_R3Q",
  "chr22:32475378" = "FBXO7_G6E",
  "chr22:38143150" = "PLA2G6_P25L",
  "chr22:38143219" = "PLA2G6_G2A"
)

# Position of each row's MarkerName in the lookup table (NA = not listed);
# only listed rows are overwritten.
manual_hit <- match(leftjoin$MarkerName, names(manual_names))
is_listed <- !is.na(manual_hit)
leftjoin$VariantName[is_listed] <- unname(manual_names[manual_hit[is_listed]])

## 5. Write files

In [36]:
# Final table: this has all the VariantNames automatically edited to only one name.
# TRUE/FALSE are spelled out because T/F are ordinary variables that can be reassigned.
write.table(
  leftjoin,
  "Edited_AAChange_23andMe_834_clean.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)