-
Notifications
You must be signed in to change notification settings - Fork 0
/
TE_data_pileup.R
3351 lines (2980 loc) · 169 KB
/
TE_data_pileup.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
## PopTEvo R code for pan-genome TE analyses
## Shujun Ou (shujun.ou.1@gmail.com)
## 09/01/2022
#update.packages()
library(tidyr)
library(ggplot2)
library(wesanderson)
library(scales)
library(reshape2)
library(plyr)
library(grid)
library(gridExtra)
library(lattice)
#install.packages("devtools")
#devtools::install_github("thomasp85/patchwork")
library(patchwork)
library(devtools)
#install_github("vqv/ggbiplot")
library(ggbiplot)
library(RColorBrewer)
library(vioplot)
library(ggridges)
library("stringr")
library(dplyr) # require dplyr 1.0.5 or lower by ggtree
library(ggsignif)
library(MKinfer)
#devtools::install_github("solatar/ggbrace")
library(ggbrace)
library(ggtext)
library(ggforce)
if (!require(coin)) {
install.packages("coin")
}
if (!require(multcompView)) {
install.packages("multcompView")
}
if (!require(devtools)) install.packages("devtools")
devtools::install_github("gaospecial/ggVennDiagram")
require(rcompanion)
library("ggVennDiagram")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
library(rvcheck)
#if(!require(ggtree)){BiocManager::install("ggtree", force = TRUE)}
datadir = "/Users/oushujun/My\ Drive/study/Maize\ research/NAM/TE"
setwd(datadir)
load("./TE_data_pileup_v2.RData")
########################
#### basic settings ####
########################
NAM_ID = c("B73", "B97", "Ky21", "M162W", "Ms71", "Oh43", "Oh7B", "M37W", "Mo18W", "Tx303",
"HP301", "P39", "Il14H", "CML52", "CML69", "CML103", "CML228", "CML247", "CML277",
"CML322", "CML333", "Ki3", "Ki11", "NC350", "NC358", "Tzi8")
NAM_temp = c("B73", "B97", "Ky21", "M162W", "Ms71", "Oh43", "Oh7B")
NAM_flnt = c("HP301", "P39", "Il14H")
NAM_adx = c("M37W", "Mo18W", "Tx303")
NAM_nontrop = c(NAM_temp, NAM_flnt)
NAM_trop = c("CML52", "CML69", "CML103", "CML228", "CML247", "CML277", "CML322", "CML333",
"Ki3", "Ki11", "NC350", "NC358", "Tzi8")
NAM = data.frame(genome = NAM_ID, group = c(rep('temp', 7), rep('admx', 3), rep('temp', 3), rep('trop', 13)))
NAM_colors = c("B73"="goldenrod1", "B97"="royalblue", "CML103"="limegreen",
"CML228"="limegreen", "CML247"="limegreen", "CML277"="limegreen",
"CML322"="limegreen", "CML333"="limegreen", "CML52"="limegreen",
"CML69"="limegreen", "HP301"="orchid", "Il14H"="orangered",
"Ki11"="limegreen", "Ki3"="limegreen", "Ky21"="royalblue",
"M162W"="royalblue", "M37W"="gray47", "Ms71"="royalblue",
"Mo18W"="gray47", "NC350"="limegreen", "NC358"="limegreen",
"Oh43"="royalblue", "Oh7B"="royalblue", "P39"="orangered",
"Tx303"="gray47", "Tzi8"="limegreen")
#reorder NAM_colors based on NAM_ID
NAM_colors = NAM_colors[order(match(names(NAM_colors), NAM_ID))]
TE_colors = c('gray', rev(brewer.pal(11,'RdYlBu')[1:5]),
brewer.pal(11,'RdYlBu')[6],
rev(brewer.pal(11,'RdBu')[7:10]))
show_col(TE_colors)
TE_colors2 = c("LTR/Copia"="#D1E5F0", "LTR/Ty3"="#92C5DE", "LTR/unknown"="#4393C3",
"LINE/L1"="#2166AC", "LINE/RTE"="#2166AC", "LINE/unknown"="#2166AC",
"DNA/Helitron"="#FFFFBF", "TIR/Tc1_Mariner"="#A50026", "TIR/Mutator"="#D73027",
"TIR/PIF_Harbinger"="#F46D43", "TIR/CACTA"="#FDAE61", "TIR/hAT"="#FEE090",
"centromeric_repeat"="gray", "rDNA_spacer"="gray", "subtelomere"="gray",
"knob"="gray", "low_complexity"="gray")
show_col(TE_colors2)
TE_colors3 = c("LTR/Copia"="#92C5DE", "LTR/Ty3"="#92C5DE", "LTR/unknown"="#92C5DE",
"LINE/L1"="#2166AC", "LINE/RTE"="#2166AC", "LINE/unknown"="#2166AC", "LINE"="#2166AC",
"DNA/Helitron"="#D73027", "TIR/Tc1_Mariner"="#FDAE61", "TIR/Mutator"="#FDAE61",
"TIR/PIF_Harbinger"="#FDAE61", "TIR/CACTA"="#FDAE61", "TIR/hAT"="#FDAE61",
"centromeric_repeat"="gray", "rDNA_spacer"="gray", "subtelomere"="gray",
"knob"="gray", "low_complexity"="gray")
show_col(TE_colors3)
rename_supfam = list("Gypsy_LTR_retrotransposon"="LTR/Ty3",
"LTR/Gypsy"="LTR/Ty3",
"LTR/CRM"="LTR/Ty3",
'Gypsy'="LTR/Ty3",
"Copia_LTR_retrotransposon"="LTR/Copia",
'Copia'="LTR/Copia",
"LTR_retrotransposon"="LTR/unknown",
"LTR/Retrovirus"="LTR/unknown",
"LTR/mixture"="LTR/unknown",
"LTR/Bel-Pao"="LTR/unknown",
"LTR_unknown"="LTR/unknown",
"helitron"="DNA/Helitron",
"Helitron/unknown"="DNA/Helitron",
"PIF_Harbinger_TIR_transposon"="TIR/PIF_Harbinger",
"MITE/DTH"="TIR/PIF_Harbinger",
"DNA/DTH"="TIR/PIF_Harbinger",
"PIF_Harbinger"="TIR/PIF_Harbinger",
"hAT_TIR_transposon"="TIR/hAT",
"DNA/DTA"="TIR/hAT",
"MITE/DTA"="TIR/hAT",
'hAT'="TIR/hAT",
"Mutator_TIR_transposon"="TIR/Mutator",
"TIR/MuDR_Mutator"="TIR/Mutator",
"DNA/DTM"="TIR/Mutator",
"MITE/DTM"="TIR/Mutator",
"Mutator"="TIR/Mutator",
"CACTA_TIR_transposon"="TIR/CACTA",
"TIR/EnSpm_CACTA"="TIR/CACTA",
"CACTA"="TIR/CACTA",
"DNA/DTC"="TIR/CACTA",
"MITE/DTC"="TIR/CACTA",
"Tc1_Mariner_TIR_transposon"="TIR/Tc1_Mariner",
"MITE/DTT"="TIR/Tc1_Mariner",
"DNA/DTT"="TIR/Tc1_Mariner",
"Tc1_Mariner"="TIR/Tc1_Mariner",
"TIR/Sola1"="TIR/unknown",
"TIR/Sola2"="TIR/unknown",
"TIR/Merlin"="TIR/unknown",
"TIR/Kolobok"="TIR/unknown",
"TIR/Novosib"="TIR/unknown",
"L1_LINE_retrotransposon"="LINE/L1",
"RTE_LINE_retrotransposon"="LINE/RTE",
"LINE_element"="LINE/unknown",
"Maverick/unknown"='unknown',
"Penelope/unknown"='unknown',
"mixture/mixture"='unknown',
"DIRS/unknown"='unknown',
"pararetrovirus/unknown"='unknown',
"CentC"="centromeric_repeat",
"rDNA_intergenic_spacer_element"="rDNA/spacer",
"subtelomere"="subtelomere",
"knob"="knob",
"low_complexity"="low_complexity")
####################################
#### Genome-level TE size stats ####
####################################
## percent TE in assembly
repeats = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.TE.v1.anno.pcnt.txt', header=T)
repeats$Ty3 = repeats$Gypsy #convert supfam name
# remove % and convert to numeric; remove _pcnt string in genome names
repeats[,-1] = apply(repeats[,-1], 2, function(x){as.numeric(sub("%", "", x, fixed=TRUE))})
repeats$Genome = sub("_pcnt", "", repeats$Genome, fixed=TRUE)
str(repeats)
# add more categories
repeats$DNA = repeats$helitron + repeats$CACTA + repeats$Mutator + repeats$PIF_Harbinger +
repeats$Tc1_Mariner + repeats$hAT
repeats$nonLTR = repeats$L1_LINE + repeats$LINE_element + repeats$RTE_LINE
repeats$LTR = repeats$Copia + repeats$Ty3 + repeats$LTR_unknown
repeats$nonTE = repeats$centromeric_repeat + repeats$knob + repeats$low_complexity +
repeats$rDNA_spacer + repeats$subtelomere
# fix genome names
repeats$Genome = sub("Oh7b", "Oh7B", repeats$Genome) %>%
sub("IL14H", "Il14H", .) %>% sub("MS71", "Ms71", .) %>% sub("Mo18.*", "Mo18W", .)
repeats$Genome
# change the order of Genomes
repeats$Genome = factor(repeats$Genome, levels=NAM_ID)
# gather plotting variables and change the variable plotting order
repeats_final = repeats %>%
gather(variable, value, hAT, CACTA, PIF_Harbinger, Mutator, Tc1_Mariner,
helitron, nonLTR, Copia, Ty3, LTR_unknown, nonTE)
repeats_final$variable = as.factor(repeats_final$variable) #convert to character to factor
repeats_final$variable = factor(repeats_final$variable,
levels = c("nonTE", "hAT", "CACTA", "PIF_Harbinger", "Mutator",
"Tc1_Mariner", "helitron", "nonLTR",
"LTR_unknown", "Ty3", "Copia"))
repeats_pcnt_plot = repeats_final %>%
ggplot(aes(x = Genome, y = value, fill = variable)) +
geom_bar(stat = "identity",colour="black") +
scale_fill_manual(values=TE_colors) +
labs(x =" ", y = "Repeat length (%)",size=12, face="bold") +
theme(axis.title.y = element_text(size=12, face="bold"),
axis.text.y = element_text(size=12),
axis.text.x = element_text(color = NAM_colors, size=8, angle=35, vjust=1, hjust=0.95),
legend.text = element_text(size=12),
legend.title=element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
theme(legend.position="top")
repeats_pcnt_plot
## total TE bp in assembly
repeats_bp = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.TE.v1.anno.bp.txt', header=T)
repeats_bp$Genome = sub("_bp", "", repeats_bp$Genome, fixed=TRUE)
repeats_bp = repeats_bp[!names(repeats_bp) %in% c('centromeric_repeat', 'knob', 'low_complexity', 'rDNA_spacer', 'subtelomere', 'Total')]
repeats_bp$TE = rowSums(repeats_bp[,2:13])
## total intact TE bp in assembly
intact = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.intact.sum.txt', header=T)
intact$intact = intact$LTR + intact$TIR + intact$Helitron
intact$fragmented = repeats_bp$TE - intact$intact
intact$total = intact$intact + intact$fragmented
str(intact)
intact$Genome
rm(repeats_bp)
#read in genome size data
genome_size = read.table('Genome_size.txt', header=T)
genome_size = genome_size[order(match(genome_size$Genome, repeats$Genome)),]
# fix genome names
intact$Genome = sub("Oh7b", "Oh7B", intact$Genome)
genome_size$Genome = sub("Oh7b", "Oh7B", genome_size$Genome) %>%
sub("IL14H", "Il14H", .) %>% sub("MS71", "Ms71", .) %>% sub("Mo18.*", "Mo18W", .)
unique(intact$Genome)
unique(genome_size$Genome)
#summarize intact and fragmented percentage
intact_pcntTE = data.frame(intact[,-1]*100/(intact$total), Genome=intact$Genome) #intact/TE, frag/TE
summary(intact_pcntTE$intact)
summary(intact_pcntTE$fragmented)
std <- function(x) sd(x)/sqrt(length(x)) #standard error
sd(intact_pcntTE$intact)
sd(intact_pcntTE$fragmented)
std(intact_pcntTE$intact)
std(intact_pcntTE$fragmented)
# plot the %intact and %homo in %TE
intact_pcntTE_final = intact_pcntTE %>% gather(variable, value, intact, fragmented)
intact_pcntTE_final$Genome = factor(intact_pcntTE_final$Genome, levels=NAM_ID)
intact_pcntTE_final$variable = as.factor(intact_pcntTE_final$variable) #convert to character to factor
intact_pcntTE_final$variable = factor(intact_pcntTE_final$variable,
levels = c("intact", "fragmented"))
# plot the %intact and %frag to the total TE size
intact_pcntTE_plot = intact_pcntTE_final %>%
ggplot(aes(x = Genome, y = value, fill = variable)) +
geom_bar(stat = "identity", color="black") +
scale_fill_manual(values=c("#92C5DE", "gray")) +
labs(x =" ", y = "TE length (%)",size=14, face="bold") +
theme(axis.title.y = element_text(size=20, face="bold"),
axis.text.y = element_text(size=12),
axis.text.x = element_text(color = NAM_colors, size=10, angle=35, vjust=1, hjust=0.95),
legend.text = element_text(size=12),
legend.title=element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
theme(legend.position="top")
intact_pcntTE_plot
######################################
#### Read and process family data ####
######################################
fam = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.TE.v1.1.anno.sum.fam.bp', header=T)
fam_int = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.TE.v1.intact.sum.fam.bp', header=T)
fam_homo = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.TE.v1.1.homo.sum.fam.bp', header=T)
class = read.table('NAM.200_v2/NAM.EDTA2.0.0.MTEC02052020.TElib.fa.list', header=F)
colnames(class) = c('TE', "Supfam") #add column header
# rename classes
table(class$Supfam)
class$Supfam = dplyr::recode(class$Supfam, !!!rename_supfam)
table(class$Supfam)
# check if TEs are classified uniquely
length(class$TE)
length(unique(class$TE))
# fix genome names
fam$TE_fam = sub("Oh7b", "Oh7B", fam$TE_fam)
names(fam)[names(fam) == 'Oh7b'] = 'Oh7B'
fam_int$TE_fam = sub("Oh7b", "Oh7B", fam_int$TE_fam)
names(fam_int)[names(fam_int) == 'Oh7b'] = 'Oh7B'
fam_homo$TE_fam = sub("Oh7b", "Oh7B", fam_homo$TE_fam)
names(fam_homo)[names(fam_homo) == 'Oh7b'] = 'Oh7B'
str(fam)
#remove AB10
drops <- c("B73Ab10","B73_AB10")
fam = fam[ , !(names(fam) %in% drops)]
fam_int = fam_int[ , !(names(fam_int) %in% drops)]
fam_homo = fam_homo[ , !(names(fam_homo) %in% drops)]
colnames(fam) #check if Ab10 is removed
colnames(fam_int)
sum(str_detect(fam_int$TE_fam, ":", negate = T))
# transpose, and add back col and row names
library(data.table)
tr_fam <- transpose(fam[-1]) # remove the first row, $TE_fam
rownames(tr_fam) = colnames(fam[-1])
colnames(tr_fam) = as.matrix(transpose(fam[1]))[1,]
tr_fam_int <- transpose(fam_int[-1]) # remove the first row, $TE_fam
rownames(tr_fam_int) = colnames(fam_int[-1])
colnames(tr_fam_int) = as.matrix(transpose(fam_int[1]))[1,]
tr_fam_homo <- transpose(fam_homo[-1]) # remove the first row, $TE_fam
rownames(tr_fam_homo) = colnames(fam_homo[-1])
colnames(tr_fam_homo) = as.matrix(transpose(fam_homo[1]))[1,]
#str(tr_fam)
fam = tr_fam
fam = fam[,apply(fam, 2, mean) != 0] #remove all 0 rows
rm(tr_fam)
fam_int = tr_fam_int
fam_int = fam_int[,apply(fam_int, 2, mean) != 0] #remove all 0 rows
rm(tr_fam_int)
fam_homo = tr_fam_homo
fam_homo = fam_homo[,apply(fam_homo, 2, mean) != 0] #remove all 0 rows
rm(tr_fam_homo)
# convert bp to mb
fam = fam/1000000
old_fam = fam # backup
fam_int = fam_int/1000000
fam_homo = fam_homo/1000000
# checked fam_homo, fam_int, no split families
length(str_subset(names(fam_homo), "TE_"))
length(unique(sub("_LTR|_INT", '', str_subset(names(fam_homo), "TE_"))))
names(fam_homo) = sub("_LTR|_INT", '', names(fam_homo))
length(str_subset(names(fam_int), "TE_"))
length(unique(sub("_LTR|_INT", '', str_subset(names(fam_int), "TE_"))))
names(fam_int) = sub("_LTR|_INT", '', names(fam_int))
# combine LTR families split in two columns, this step take like 20 mins..
library(stringr)
length(str_subset(names(fam), "TE_"))
length(unique(sub("_LTR|_INT", '', str_subset(names(fam), "TE_"))))
fam_names = str_subset(names(fam), "TE_")
fam_all = sub("_LTR|_INT", '', str_subset(names(fam), "TE_"))
fam_dup = names(table(fam_all)[table(fam_all)>1])
length(fam_dup)
remove_list = c()
for (i in 1:length(fam_dup)){
this_id = str_subset(fam_names, fam_dup[i])
remove_list = c(remove_list, this_id)
fam$this_fam = rowSums(fam[this_id]) #combine cols
fam = fam[, !names(fam) %in% this_id] #remove cols
names(fam)[names(fam) == 'this_fam'] = fam_dup[i] #rename new col
}
names(fam) = sub("_LTR|_INT", '', names(fam)) #convert the rest of names
# convert bp to TE space percentage
fam_pcnt = fam/rowSums(fam)
############################
#### Pan TE family stat ####
############################
# remove non-TE
drops = c('GA-rich', 'G-rich',
subset(class, Supfam %in% c("Cent/CentC", "knob/knob180",
"rDNA/spacer", "knob/TR-1",
"subtelomere/4-12-1"))$TE)
fam = fam[, !(names(fam) %in% drops)]
fam_homo = fam_homo[, !(names(fam_homo) %in% drops)]
## Remove entries that are not in the pan-TE library
fam_rare = colSums(fam[str_detect(colnames(fam), ":")])
length(fam_rare) #269847
sum(fam_rare)/nrow(fam) #rare TE avg NAM, 46.25434 Mb
sum(colSums(fam))/length(rownames(fam)) #total TE avg NAM #1787.338
fam = fam[-grep(":", colnames(fam),)] # remove rare TEs
sum(fam)/26 #common TE avg NAM #1787.338
rowSums(fam)
dim(fam) # 17482 fams, 17473 are TEs and 9 are non-TEs
## count family occurrance
str(fam)
levels=c("0")
non_0_count <- 26 - sapply(levels, function(x)colSums(fam=="0")) #count occurrences of 0 in each column
non_0_count_int <- 26 - sapply(levels, function(x)colSums(fam_int=="0")) #count occurrences of 0 in each column
non_0_count_homo <- 26 - sapply(levels, function(x)colSums(fam_homo=="0")) #count occurrences of 0 in each column
# count and size stats
dim(non_0_count)
sum(non_0_count==26)/length(non_0_count) #0.7228006
length(str_subset(colnames(fam), ":", negate = T))
length(str_subset(colnames(fam), ":", negate = F))
length(colnames(fam))
total_fam = 17473
sum(colSums(fam)/non_0_count<0.1)/total_fam #a list of TE fam <100kb on avg, 0.9611973
sum(colSums(fam)/non_0_count>=0.1) #a list of TE fam >100kb on avg, 687
sum(colSums(fam)/non_0_count>=0.1)/total_fam #0.0393178
sum(colSums(fam))/26 # total size of common fams on avg
total_TE = 1833.592 #rare + common avg NAM = 1787.338+46.25434 = 1833.592
sum(rowSums(fam[,colSums(fam)/non_0_count<0.1]))/26 #total size of fams < 0.1M, 120.8612 Mb
sum(colSums(fam)[colSums(fam)/non_0_count>=0.1]/26) #NAM avg size of TE fam >100kb, 1666.477 Mb
rowSums(fam[,colSums(fam)/non_0_count>=0.1])/total_TE
mean(rowSums(fam[,colSums(fam)/non_0_count<0.1]))/total_TE #0.06591497
sd(rowSums(fam[,colSums(fam)/non_0_count<0.1])/total_TE) #0.001118202
mean(rowSums(fam[,colSums(fam)/non_0_count>=0.1]))/total_TE #0.9088591
sd(rowSums(fam[,colSums(fam)/non_0_count>=0.1])/total_TE) #0.01220491
mean(sum(tail(sort(colSums(fam)), 50))/26)/total_TE #top 50 fams, 0.5203749
mean(sum(tail(sort(colSums(fam)), 100))/26)/total_TE #top 100 fams, 0.6936001
length(intersect(unique(sub('_LTR|_INT', '', class[str_detect(class$Supfam, 'LTR'),]$TE)), names(fam))) #total number of LTR fams, 10838
sum(colSums(fam[intersect(unique(sub('_LTR|_INT', '', class[str_detect(class$Supfam, 'LTR'),]$TE)), names(fam))]))/26 #total size of LTR fams, 1599.559
sum(fam_homo)/26 #total number of fragmented fams, 1286.315 mb
## add family and superfamily info and mean family size info
Fam_count = data.frame(count=non_0_count, TE=colnames(fam),
Supfam=class[match(colnames(fam), sub('_INT|_LTR', '', class$TE)),]$Supfam,
Size=colSums(fam)/non_0_count)
colnames(Fam_count) = c('count', 'TE', 'Supfam', 'Size')
dim(Fam_count)
Fam_count_int = data.frame(count=non_0_count_int, TE=colnames(fam_int),
Supfam=class[match(colnames(fam_int), sub('_INT|_LTR', '', class$TE)),]$Supfam,
Size=colSums(fam_int)/non_0_count_int)
colnames(Fam_count_int) = c('count', 'TE', 'Supfam', 'Size')
Fam_count_int$all_count = Fam_count$count[match(Fam_count_int$TE, Fam_count$TE)]
Fam_count_int$all_count = ifelse(is.na(Fam_count_int$all_count), Fam_count_int$count, Fam_count_int$all_count)
Fam_count_int$all_size = (Fam_count_int$count*Fam_count_int$Size)/Fam_count_int$all_count
dim(Fam_count_int)
Fam_count_homo = data.frame(count=non_0_count_homo, TE=colnames(fam_homo),
Supfam=class[match(colnames(fam_homo), sub('_INT|_LTR', '', class$TE)),]$Supfam,
Size=colSums(fam_homo)/non_0_count_homo)
colnames(Fam_count_homo) = c('count', 'TE', 'Supfam', 'Size')
Fam_count_homo$all_count = Fam_count$count[match(Fam_count_homo$TE, Fam_count$TE)]
Fam_count_homo$all_count = ifelse(is.na(Fam_count_homo$all_count), Fam_count_homo$count, Fam_count_homo$all_count)
Fam_count_homo$all_size = (Fam_count_homo$count*Fam_count_homo$Size)/Fam_count_homo$all_count
dim(Fam_count_homo)
## filter out nonTE categories
Fam_count = Fam_count[which(Fam_count$Supfam!="centromeric_repeat" & Fam_count$Supfam!="knob" &
Fam_count$Supfam!="low_complexity" & Fam_count$Supfam!="rDNA_intergenic_spacer_element" &
Fam_count$Supfam!="subtelomere"),]
Fam_count$Supfam = factor(Fam_count$Supfam) #remove unused factor levels
levels(Fam_count$Supfam)
str(Fam_count)
dim(Fam_count)
## replace LINEs to nonLTR
nonLTR = c("LINE/L1", "LINE/RTE", "LINE/unknown")
Fam_count$Supfam <- as.character(Fam_count$Supfam)
Fam_count$Supfam <- factor(with(Fam_count, replace(Supfam, Supfam %in% nonLTR, "nonLTR")))
levels(Fam_count$Supfam)
# add family length info
fam_len = read.table('NAM.EDTA2.0.0.MTEC02052020.TElib.fa.info', header = T, sep = '\t')
Fam_count$fam_len = fam_len$fam_len[match(Fam_count$TE, fam_len$id)]
rm(fam_len)
# calculate fam size
std <- function(x) sd(x)/sqrt(length(x)) #standard error
fam_size = data.frame(TE = colnames(fam),
sd = apply(fam, 2, sd, na.rm=TRUE),
se = apply(fam, 2, std),
mean = apply(fam, 2, mean, na.rm=TRUE),
Min = apply(fam, 2, min, na.rm=TRUE),
Max = apply(fam, 2, max, na.rm=TRUE))
str(fam_size)
# make groups for fam size
cutoff = 0.1
fam_size = fam_size %>%
mutate(size_group = case_when(mean <= cutoff ~ paste('[0, ', cutoff, ']', sep = ''),
mean > cutoff ~ paste('(', cutoff, ', inf)', sep = '')))
fam_size$size_group = as.factor(fam_size$size_group)
fam_size$size_group <- factor(fam_size$size_group,
levels = c(paste('[0, ', cutoff, ']', sep = ''),
paste('(', cutoff, ', inf)', sep = '')))
###################################
##### plot rare LTR count data#####
###################################
rare_LTR2 = read.table('NAM.EDTA1.9.0.MTEC02052020.TE.v1.anno.LTR.rare.count2', header = F)
names(rare_LTR2) = c('genome', 'uniq', 'count', 'mean', 'sd')
rare_LTR2$genome = sub('\\..*', '', rare_LTR2$genome)
rare_LTR2 = rare_LTR2[rare_LTR2$genome != 'B73Ab10' & rare_LTR2$genome != 'B73_AB10', ] #rm Ab10
rare_LTR2$genome = sub("Oh7b", "Oh7B", rare_LTR2$genome) %>%
sub("IL14H", "Il14H", .) %>% sub("MS71", "Ms71", .) %>% sub("Mo18.*", "Mo18W", .)
rare_LTR2$genome = factor(rare_LTR2$genome, NAM$genome)
unique(rare_LTR2$genome)
rare_LTR_plot2 = ggplot(rare_LTR2, aes(x=genome, y=mean)) +
geom_bar(stat = 'identity', alpha = 0.5) +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.1, position=position_dodge(.9)) +
labs(x ="", y="Copy number of<br>unclassified LTR-RTs")+
theme(axis.title.y = element_markdown(size=12, face="bold"),
# axis.title.y = element_text(),
axis.text.y = element_text(size=10),
axis.text.x = element_text(size=10, angle=50,vjust=1,hjust=0.95)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
rare_LTR_plot2
############################
##### bootstrap pan-TE #####
############################
pan_TE_bs = read.table('NAM.200_v2/pan_TE_bootstrap1000.summary26.txt', header=F)
pan_TE_bs = data.frame(count=as.vector(t(pan_TE_bs)),Genome=rep(1:26, 1000))
pan_TE_bs$Genome = as.factor(pan_TE_bs$Genome)
length(pan_TE_bs)
Total_flTE_fam = 14801
mean(pan_TE_bs$count[pan_TE_bs$Genome==2])/Total_flTE_fam
mean(pan_TE_bs$count[pan_TE_bs$Genome==3])/Total_flTE_fam
mean(pan_TE_bs$count[pan_TE_bs$Genome==4])/Total_flTE_fam #0.9115198
mean(pan_TE_bs$count[pan_TE_bs$Genome==5])/Total_flTE_fam
mean(pan_TE_bs$count[pan_TE_bs$Genome==10])/Total_flTE_fam
Total_flTE_fam - mean(pan_TE_bs$count[pan_TE_bs$Genome==10])
# add percentage to the second axis and change background colors
show_col(brewer.pal(8,'YlGn'))
color_levels = brewer.pal(8,'YlGn')
pan_TE_bs_plot_pcnt = ggplot(pan_TE_bs, aes(x=Genome, y=count)) +
annotation_raster(alpha(color_levels[2], 1), xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Total_flTE_fam*0.8) +
annotation_raster(alpha(color_levels[3], 1), xmin = -Inf, xmax = Inf,
ymin = Total_flTE_fam*0.8, ymax = Total_flTE_fam*0.9) +
annotation_raster(alpha(color_levels[4], 1), xmin = -Inf, xmax = Inf,
ymin = Total_flTE_fam*0.9, ymax = Total_flTE_fam*0.95) +
annotation_raster(alpha(color_levels[5], 1), xmin = -Inf, xmax = Inf,
ymin = Total_flTE_fam*0.95, ymax = Inf) +
geom_violin() + #this layer at the top
scale_x_discrete(breaks = c(1, 5, 10, 15, 20, 25)) +
scale_y_continuous(sec.axis = sec_axis(~ . *100/Total_flTE_fam, breaks = c(75,80,85,90,95,100),name="% of TE families")) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
labs(x ="Number of Genomes", y = "# of TE families") +
theme(axis.title.y = element_text(size=12),
axis.title.x = element_text(size=12),
axis.text.y = element_text(size=10),
axis.text.x = element_text(size=10),
legend.text = element_text(size=12),
legend.title=element_blank())
pan_TE_bs_plot_pcnt
## plot size of TE families
Fam_count$count = as.factor(Fam_count$count)
Fam_size_freq_int_plot_log = ggplot(Fam_count_int, aes(x=factor(all_count), y=all_size)) +
geom_boxplot(outlier.size=0.8, width=0.7, fill="white") +
geom_hline(yintercept = 0.1, linetype = "dashed") +
labs(x ="Frequency (n = 26)", y = "TE family size (Mb)<br>Intact elements ")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
theme(axis.title.y = element_markdown(size=14, face="bold"),
axis.title.x = element_text(size=14, face="bold"),
axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
legend.text = element_text(size=12),
legend.title=element_blank())
Fam_size_freq_int_plot_log
Fam_size_freq_homo_plot_log = ggplot(Fam_count_homo, aes(x=factor(all_count), y=all_size)) +
geom_boxplot(outlier.size=0.8, width=0.7, fill="white") +
geom_hline(yintercept = 0.1, linetype = "dashed") +
labs(x ="Frequency (n = 26)", y = "TE family size (Mb)<br>Fragmented elements")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
theme(axis.title.y = element_markdown(size=14, face="bold"),
axis.title.x = element_text(size=14, face="bold"),
axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
legend.text = element_text(size=12),
legend.title=element_blank())
Fam_size_freq_homo_plot_log
#number of families less than x Mb in total size
length(Fam_count[Fam_count$Size<0.01,]$Size)/total_fam #10k, 0.7493275
length(Fam_count[Fam_count$Size<0.1,]$Size)/total_fam #100k, 16786, 0.9606822
length(Fam_count[Fam_count$Size<1,]$Size)/total_fam #1M, 0.9879815
sum(Fam_count[Fam_count$Size<0.01,]$Size)/sum(Fam_count$Size) #0.0193377
sum(Fam_count[Fam_count$Size<0.1,]$Size)/sum(Fam_count$Size) # 0.07216198
sum(Fam_count[Fam_count$Size<1,]$Size)/sum(Fam_count$Size) # 0.1441918
#rare and large TE families
Fam_count$count = as.numeric(Fam_count$count)
Fam_count[Fam_count$Size>20 & Fam_count$count<=3,]
Fam_count[Fam_count$Size>0.02 & Fam_count$count<26,]
length(Fam_count[Fam_count$Size>0.1,]$count)
## Number of families that are present in all 26 NAM parents
Fam_count$TE = as.factor(Fam_count$TE)
table(Fam_count$count)["26"] #12636
table(Fam_count$count)["26"]/total_fam #0.7231729
## more than 20 times
sum(table(Fam_count$count)[20:26]) #14438
sum(table(Fam_count$count)[20:26])/total_fam #0.8263034
## Other summaries
table(Fam_count$count)["1"]
table(Fam_count$count)["1"]/total_fam
sum(table(Fam_count$count)[2:23])/total_fam
sum(table(Fam_count$count)[24:25])/total_fam
sum(table(Fam_count$count)[24:26])/total_fam
## histogram of freq
plotorder <- c("Gypsy_LTR_retrotransposon", "Copia_LTR_retrotransposon", "LTR_retrotransposon",
"nonLTR", "helitron", "CACTA_TIR_transposon", "hAT_TIR_transposon", "Mutator_TIR_transposon",
"PIF_Harbinger_TIR_transposon", "Tc1_Mariner_TIR_transposon")
plotorder = dplyr::recode(plotorder, !!!rename_supfam)
Fam_count <- arrange(transform(Fam_count,
Supfam=factor(Supfam,levels=plotorder)),Supfam)
Fam_count$count = as.numeric(Fam_count$count)
Fam_count_freq_plot = ggplot(Fam_count, aes(count)) +
geom_histogram(binwidth=1, center=0.5) +
facet_wrap(~Supfam, scales = "free_y", ncol = 5) +
labs(x ="Frequency (n = 26)", y = "Number of TE families") +
theme(axis.title.y = element_text(size=14, face="bold"),
axis.title.x = element_text(size=14, face="bold"),
axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
legend.text = element_text(size=12),
legend.title=element_blank())
Fam_count_freq_plot
###################################
#### Top 50 largest TE families #####
###################################
# Plot out largest TE families and variation in NAMs
str(fam)
rank = 50
rownames(fam) #genome name
topSize = fam_size[order(-fam_size$mean),][1:rank,]
topSize$TE = factor(topSize$TE, levels = topSize$TE) #control the plotting order
head(topSize)
topSize_100 = fam_size[order(-fam_size$mean),][1:100,]
sum(topSize$mean) #top 50, 954.1553
sum(topSize$mean)/total_TE #0.5203749
sum(topSize_100$mean) # top 100, 1271.78
sum(fam_size$mean) #size of all TEs, 1787.338
# top size plot, error bar SD
topSize_plot_SD = ggplot(topSize, aes(x=TE, y=mean)) +
geom_bar(stat="identity", fill = rgb(27,168,241, max=255)) +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.2, position=position_dodge(.9)) +
labs(x ="TE family", y = "Family size (Mb)", size=12, face="bold")+
theme(axis.title.y = element_text(size=12),
axis.text.y = element_text(size=10),
axis.text.x = element_text(size=8, angle=50,vjust=1,hjust=0.95),
plot.margin = margin(0.2, 0.2, 0.2, 0.7, "cm")
) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
topSize_plot_SD
############################
#### PCA ###################
############################
## PCA
names(fam)
rownames(fam)
dim(fam) #17482
pca1 = prcomp(fam, scale. = F) #use unnormalized value to count in the effect of TE fam size
pca1$sdev # sqrt of eigenvalues
pca1$rotation # loadings
pca1$x # PCs (aka scores)
scores = as.data.frame(pca1$x)
sum = summary(pca1)
str(sum)
PCvar_num = pca1$sdev^2/sum(pca1$sdev^2) #sum$importance, pcnt variations explained of PCs
PCvar = sprintf("%1.1f%%", 100*PCvar_num)
PCvar
# SNP PCA
NAM.SNP25k = as.data.frame(read.table('B73v5.NAM-illumina_filtered-pass-only-two-round-gatk-snps.homo.chr.gt.pres.25k.h',
header = T) %>% select(all_of(NAM_ID)) %>% t(.))
snp_pca1 = prcomp(NAM.SNP25k, scale. = F)
screeplot(snp_pca1)
snp_scores = as.data.frame(snp_pca1$x)
snp_sum = summary(snp_pca1)
snp_PCvar_num = snp_pca1$sdev^2/sum(snp_pca1$sdev^2) #sum$importance, pcnt variations explained of PCs
barplot(snp_PCvar_num, ylim=c(0,0.1), xlab='Principal components', ylab='Variations explained')
snp_PCvar = sprintf("%1.1f%%", 100*snp_PCvar_num)
snp_PCvar
#reorder columns based on NAM_colors
scores = scores[names(NAM_colors),]
rownames(scores)
snp_scores = snp_scores[names(NAM_colors),]
rownames(snp_scores)
# set ellipse group and plot
scores$group = rownames(scores)
scores[scores$group %in% NAM_trop,]$group = 'trop'
scores[scores$group %in% NAM_nontrop,]$group = 'temp'
scores[scores$group %in% NAM_adx,]$group = NA
PCA_biplot_group = ggplot(data=scores, aes(x=PC1, y=PC2)) +
geom_hline(yintercept=0, colour="gray65") +
geom_vline(xintercept=0, colour="gray65") +
geom_mark_ellipse(data=subset(scores, !is.na(group)), expand = 0, aes(color=group), alpha = 0,
linetype = 'dashed', size = 0.4, show.legend = F) +
geom_text(aes(label=rownames(scores)),
colour=NAM_colors, alpha=0.8, size=3) +
scale_color_manual(values=c('#53CACE', '#F5958F')) +
xlab(paste("PC1 (", PCvar[1], ")", sep="")) + xlim(-9, 9) +
ylab(paste("PC2 (", PCvar[2], ")", sep="")) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
PCA_biplot_group
snp_scores$group = rownames(snp_scores)
snp_scores[snp_scores$group %in% NAM_trop,]$group = 'trop'
snp_scores[snp_scores$group %in% NAM_nontrop,]$group = 'temp'
snp_scores[snp_scores$group %in% NAM_adx,]$group = NA
snp_PCA_biplot_group = ggplot(data=snp_scores, aes(x=PC1, y=PC2)) +
geom_hline(yintercept=0, colour="gray65") +
geom_vline(xintercept=0, colour="gray65") +
geom_mark_ellipse(data=subset(snp_scores, !is.na(group)), expand = 0.01, aes(color=group), alpha = 0,
linetype = 'dashed', size = 0.4, show.legend = F) +
geom_text(aes(label=rownames(snp_scores)),
colour=NAM_colors, alpha=0.8, size=5) +
scale_color_manual(values=c('#53CACE', '#F5958F')) +
xlab(paste("PC1 (", snp_PCvar[1], ")", sep="")) +
ylab(paste("PC2 (", snp_PCvar[2], ")", sep="")) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.title.x = element_text(size=16),
axis.title.y = element_text(size=16),
axis.text.y = element_text(size=14),
axis.text.x = element_text(size=14),
legend.title=element_blank())
snp_PCA_biplot_group
###################################
##### BUSCO tree #####
###################################
library('ggtree') # require dplyr 1.0.5
busco_tree <- read.tree(file = 'busco-tree.nwk')
busco_tree$edge.length[is.na(busco_tree$edge.length)] = 0 #convert NAs to 0
busco_tree$tip.label = busco_tree$tip.label %>% sub("Oh7b", "Oh7B", .) %>%
sub("IL14H", "Il14H", .) %>% sub("MS71", "Ms71", .) %>% sub("Mo18*", "Mo18W", .)
tree_cols = c(NAM_colors, PARVIGLUMIS='black')
names(tree_cols) = c(NAM_colors, PARVIGLUMIS='black')
busco_tree = phytools::reroot(busco_tree, node.number=13) #use busco_tree$tip.label to find the node.number, read help
busco_tree = full_join(busco_tree,
data.frame(label = names(c(NAM_colors, PARVIGLUMIS='black')),
stat = c(NAM_colors, PARVIGLUMIS='black')), by = 'label')
busco_tree_p <- ggtree(busco_tree, layout = 'rectangular',
branch.length = 'none') +
geom_tiplab(aes(color = stat), size = 5, hjust = 0) + geom_rootpoint() +
# geom_tippoint() +
scale_color_manual(values = tree_cols) +
theme(legend.position = "none") + xlim(NA, 15)
busco_tree_p
###################################
#### 10 Most varying TE families #####
###################################
## calculate family variations
require(goeveg)
Var = apply(fam, 2, var, na.rm=TRUE)
class(Var)
str(Var)
## get top 10 varying TE families
rank = 10
var_rank = Var[order(-Var)][1:rank]
fam[,names(var_rank)]
topVar = data.frame(fam[names(var_rank)], Genome = rownames(fam))
str(topVar)
names(topVar)
# change the order of Genomes
new_order = c(NAM_ID[11:13], NAM_ID[1:10], NAM_ID[14:26])
topVar$Genome = factor(topVar$Genome, levels=new_order)
varlist = colnames(topVar)[1:10]
# gather plotting data
topVar_gather = topVar %>% gather(key=variable, value=value, names(topVar)[1:10])
topVar_gather$x=as.numeric(factor(topVar_gather$Genome)) #get the actual order of genome for plotting
str(topVar_gather)
# normalize the size
normalize <- function(x) {x / sqrt(sum(x^2))}
topVar_nor = data.frame(apply(topVar[,1:10], 2, scale), Genome = topVar$Genome) #standardization to mean=0, var=1
str(topVar_nor)
# gather plotting data
topVar_nor_gather = topVar_nor %>% gather(variable, value, names(topVar_nor)[1:10])
topVar_nor_gather$x = as.numeric(factor(topVar_nor_gather$Genome)) #get the actual order of genome for plotting
str(topVar_nor_gather)
# plot normalized family size with pvalue
boot.t.test(topVar_nor_gather[topVar_nor_gather$Genome %in% NAM_trop, ]$value,
topVar_nor_gather[topVar_nor_gather$Genome %in% NAM_nontrop, ]$value)
2*pt(-abs(12.387),df=216.33) #actual p, 5.495085e-27
topVar_nor_p = ggplot(topVar_nor_gather, aes(x = Genome, y = value)) +
geom_boxplot() +
geom_point(aes(color=variable), position = position_jitter(width = 0.15), size = 1) +
geom_signif(comparisons=list(c("HP301", "Oh7B")), annotations="", y_position = 1.7,
tip_length = 0, color = '#53CACE', size = 0.5) +
geom_signif(comparisons=list(c("CML52", "Tzi8")), annotations="", y_position = 2.5,
tip_length = 0, color = '#F5958F', size = 0.5) +
geom_signif(annotations="P < 1.0e-10", y_position = 3.1, xmin=5.5, xmax=20, size = 0.5,
vjust = -0.5, tip_length = c(0.16, 0.03)) +
labs(x =" ", y = "Standardized family size",size=14, face="bold") +
ylim(-4.4, 3.5) +
theme(axis.title.y = element_text(size=12, face="bold"),
axis.text.y = element_text(size=10),
axis.text.x = element_text(color = NAM_colors[new_order], size=10, angle=35, vjust=1, hjust=0.95),
legend.text = element_text(size=8),
legend.title=element_blank()) +
theme(legend.position = c(0.5, 0.1), legend.direction = "horizontal",
legend.text = element_text(size=5.5),
legend.key.size = unit(0.3, "cm"),
legend.key.width = unit(0.3,"cm")) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
topVar_nor_p
##################################
#### Trop/Temp LTRs variation ####
##################################
# find LTR fams with size sig. diff in trop and temp
str(fam)
fam_nontrop = fam[rownames(fam) %in% NAM_nontrop,]
fam_trop = fam[rownames(fam) %in% NAM_trop,]
test_trop_temp <- vector("list", 1)
for (j in seq(ncol(fam))){
temp = fam_nontrop[,j]
trop = fam_trop[,j]
if (sd(temp-trop) == 0){
temp = jitter(temp, amount = 0.0000001)
}
test_trop_temp[[j]] = t.test(temp, trop)
}
test_trop_temp[[1]]
test_trop_temp = data.frame(matrix(unlist(test_trop_temp), nrow=length(test_trop_temp), byrow=T))
test_trop_temp[,1:9] = lapply(test_trop_temp[,1:9], function(x) {as.numeric(x)})
names(test_trop_temp)[names(test_trop_temp) == 'X3'] = 'pval'
names(test_trop_temp)[names(test_trop_temp) == 'X6'] = 'temp_mean'
names(test_trop_temp)[names(test_trop_temp) == 'X7'] = 'trop_mean'
test_trop_temp$diff = test_trop_temp$trop_mean - test_trop_temp$temp_mean
test_trop_temp$mean = (test_trop_temp$trop_mean*13 + test_trop_temp$temp_mean*10)/23
rownames(test_trop_temp) = sub('_INT|_LTR', '', colnames(fam))
test_trop_temp$id = rownames(test_trop_temp)
test_trop_temp$class = class$Supfam[match(test_trop_temp$id, sub('_INT|_LTR', '', class$TE))]
test_trop_temp = subset(test_trop_temp, !is.na(class)) # keep classified
str(test_trop_temp)
# stats trop or temp unique fams
dim(subset(test_trop_temp, temp_mean==0))
dim(subset(test_trop_temp, trop_mean==0))
sum(subset(test_trop_temp, temp_mean==0)$diff) #0.2673689
sum(subset(test_trop_temp, trop_mean==0)$diff) #-0.1140743 # this value varies slightly due to jittering
# find LTR fams with intact size sig. diff b/t trop and temp
str(fam_int)
rowSums(fam_int[,colnames(fam_int) %in% Fam_count$TE])
sum(str_detect(names(fam_int), ':', negate = T), na.rm = T)
fam_int_pan = fam_int[, names(fam_int) %in% Fam_count$TE]
fam_int_nontrop = fam_int_pan[rownames(fam_int_pan) %in% NAM_nontrop,]
fam_int_trop = fam_int_pan[rownames(fam_int_pan) %in% NAM_trop,]
test_trop_temp_int <- vector("list", 1)
for (j in seq(ncol(fam_int_pan))){
temp = fam_int_nontrop[,j]
trop = fam_int_trop[,j]
if (sd(temp-trop) == 0){
temp = jitter(temp, amount = 0.000000001)
}
test_trop_temp_int[[j]] = t.test(temp, trop)
}
test_trop_temp_int[[1]]
test_trop_temp_int = data.frame(matrix(unlist(test_trop_temp_int), nrow=length(test_trop_temp_int), byrow=T))
test_trop_temp_int[,1:9] = lapply(test_trop_temp_int[,1:9], function(x) {as.numeric(x)})
names(test_trop_temp_int)[names(test_trop_temp_int) == 'X3'] = 'pval'
names(test_trop_temp_int)[names(test_trop_temp_int) == 'X6'] = 'temp_mean'
names(test_trop_temp_int)[names(test_trop_temp_int) == 'X7'] = 'trop_mean'
test_trop_temp_int$diff = test_trop_temp_int$trop_mean - test_trop_temp_int$temp_mean
test_trop_temp_int$mean = (test_trop_temp_int$trop_mean*13 + test_trop_temp_int$temp_mean*10)/23
rownames(test_trop_temp_int) = sub('_INT|_LTR', '', colnames(fam_int_pan))
test_trop_temp_int$id = rownames(test_trop_temp_int)
#test_trop_temp_int$category = Fam_count$category[match(test_trop_temp_int$id, Fam_count$TE)]
str(test_trop_temp_int)
# stats
dim(subset(test_trop_temp_int, pval<0.05)) #584, this number fluctuates slightly due to jittering
Fam_count$int_diff_pval = test_trop_temp_int$pval[match(Fam_count$TE, test_trop_temp_int$id)]
Fam_count$trop_temp_diff_int = test_trop_temp_int$diff[match(Fam_count$TE, test_trop_temp_int$id)]
test_trop_temp_int$class = Fam_count$Supfam[match(test_trop_temp_int$id, Fam_count$TE)]
test_trop_temp_int$class = sub('LINE.*', 'LINE', test_trop_temp_int$class)
table(test_trop_temp_int$class)
rowSums(fam_int)
# stat LTR fams (more stats in the solo-intact section)
sum(test_trop_temp$trop_mean) - sum(test_trop_temp$temp_mean) #35.06666
sum(test_trop_temp[str_detect(test_trop_temp$class, 'LTR'),]$diff) #34.4353/35.06666 = 0.981485
sum(subset(test_trop_temp, class=='LTR/Ty3')$diff) #21.54911/35.06666 = 0.6145185
sum(test_trop_temp[str_detect(test_trop_temp$class, 'TIR'),]$diff) #1.54422/35.06666=0.04448964
sum(test_trop_temp[str_detect(test_trop_temp$class, 'CACTA'),]$diff) #1.136115/35.06666=0.03290385
sum(test_trop_temp[str_detect(test_trop_temp$class, 'Helitron'),]$diff) #-0.9063066/35.06666=-0.02578943
#DNA TE (TIR + Hel): 0.04448964-0.02578943 = 0.01870021
sum(test_trop_temp[str_detect(test_trop_temp$class, 'LINE'),]$diff) #-0.006489931/35.06666=-0.0001850741
dim(subset(test_trop_temp, abs(diff)>0.025)) #216
sum(subset(test_trop_temp, diff>0)$diff) #trop larger: 51.72713
sum(subset(test_trop_temp, diff<0)$diff) #temp larger: -16.66041
sum(test_trop_temp$diff) #net diff: 35.06666
sum(test_trop_temp_int$diff) #net diff of intact: 17.80152
sum(test_trop_temp_int$diff)/sum(test_trop_temp$diff) #17.80152/35.06666 = 0.5076481
sum(fam_int)/26 #563.3439
563.3439/total_TE #0.3072351
fisher.test(rbind(c(17.80152, 35.06666 - 18.01183), c(563.3439, total_TE - 563.3439))) #p-value = 0.01503
test_trop_temp[test_trop_temp$diff>1,]
trop_temp_difflist = colnames(fam)[test_trop_temp$diff>1]
fam[colnames(fam)[test_trop_temp$diff>1]]
# make groups
cutoff = 0.025
test_trop_temp = test_trop_temp %>%
mutate(diff_group = case_when(diff < -cutoff ~ paste('(-inf, ', -cutoff, ']', sep = ''),
diff >= -cutoff & diff <= cutoff ~ paste('(', -cutoff, ', ', cutoff, ')', sep = ''),
diff > cutoff ~ paste('[', cutoff, ', inf)', sep = '')))
test_trop_temp$diff_group = as.factor(test_trop_temp$diff_group)
test_trop_temp$diff_group <- factor(test_trop_temp$diff_group,
levels = c(paste('(-inf, ', -cutoff, ']', sep = ''),
paste('(', -cutoff, ', ', cutoff, ')', sep = ''),
paste('[', cutoff, ', inf)', sep = '')))
test_trop_temp$class = Fam_count$Supfam[match(test_trop_temp$id, Fam_count$TE)]
test_trop_temp$class = sub('LINE.*', 'LINE', test_trop_temp$class)
test_trop_temp$class = sub('nonLTR', 'LINE', test_trop_temp$class)
test_trop_temp = subset(test_trop_temp, !is.na(class)) # remove non-TE fams
# make plots
trop_temp_diff_rank_plot4 = ggplot() +
geom_violin(data = test_trop_temp, aes(diff_group, diff), fill = 'grey', scale = "width") +
geom_text(aes(label = paste('n = ', format(dim(subset(test_trop_temp, diff < -cutoff))[1], big.mark=","), sep = '')), x = 1, y = -0.9, size = 5) +
geom_text(aes(label = paste('n = ', format(dim(subset(test_trop_temp, diff >= -cutoff & diff <= cutoff))[1], big.mark=","), sep = '')), x = 2, y = -0.9, size = 5) +
geom_text(aes(label = paste('n = ', format(dim(subset(test_trop_temp, diff > cutoff))[1], big.mark=","), sep = '')), x = 3, y = -0.9, size = 5) +
labs(y = 'Trop - Temp family size (Mb)', x = 'Temp. larger No diff. Trop. larger') +
ylim(-1.1, 3.5) +
theme(axis.title.x = element_text(face="bold", size=12),
axis.title.y = element_text(face="bold", size=12),
axis.text.x = element_text(size=10, vjust = -2),
axis.text.y = element_text(size=10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))
trop_temp_diff_rank_plot4
fam_size_rank_plot = ggplot() +
geom_violin(data = subset(fam_size, TE %in% test_trop_temp$id), aes(size_group, mean), scale = "width") +
geom_text(aes(label = paste('n = ', total_fam - dim(subset(fam_size, mean > cutoff))[1], sep = '')), x = 1.25, y = 10, size = 2.8) +
geom_text(aes(label = paste('n = ', dim(subset(fam_size, mean > cutoff))[1], sep = '')), x = 2.25, y = 10, size = 2.8) +
labs(y = 'Family size (Mb)', x = '') +
theme(axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10, vjust = -1),
axis.text.y = element_text(size=10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))
fam_size_rank_plot
trop_temp_size_cmb_plot = ggplot(test_trop_temp, aes(x = class, y = diff, fill = class)) +
geom_bar(position = 'stack', stat = 'identity', size = 0) +
geom_hline(yintercept=0, size = 0.3, color = 'black') +
scale_fill_manual(values=TE_colors3) +
scale_color_manual(values=TE_colors3) +
ylab("Trop - Temp TE size (Mb)") +
scale_y_continuous(breaks = c(-5, 0, 5, 10, 15), limits = c(-8, 15)) +
theme(legend.position="none") +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size=12, face="bold"),
axis.text.y = element_text(size=10),
axis.text.x = element_text(size=12, angle=35, vjust=1, hjust=0.95)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),