# Preparing the environment

In [1]:
library(karyoploteR)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ggplot2)
library(dplyr)
library(circlize)


Loading required package: regioneR

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading

# Importing files for use with KaryoploteR

Data import

In [3]:
# File path
original_file <- "../samples/gp_global/merged/STRs_annotated_complete.tsv"

# Read cleaned file with read.delim
df_annot <- read.delim(original_file, header=TRUE, sep="\t", stringsAsFactors = FALSE)

# Show the first lines of the updated data frame
head(df_annot)


Unnamed: 0_level_0,STRs_ID,region,gene_id,priority,gene_name,gene_chrom,gene_start,gene_end,annotation,sample_id,chrom,start,end,repeat_unit,allele1_est,allele2_est,depth
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>
1,chr16:12744297:AAGC:12,promoter,ENSG00000260378,4,AC109597.1,chr16,12745873,12757835,promoter:AC109597.1,BEL-1012-1,16,12744297,12744345,AAGC,-0.5,0.0,36
2,chr17:4413485:CTTG:11,intergenic,intergenic,6,.,.,-1,-1,intergenic,BEL-1012-1,17,4413485,4413530,CTTG,5.0,5.73,25
3,chr20:62334753:AGGGCG:5,intron,ENSG00000130702,5,LAMA5,chr20,62307955,62367312,intron:LAMA5,BEL-1012-1,20,62334753,62334783,AGGGCG,3.17,0.0,46
4,chr22:32780933:CCTT:0,promoter,ENSG00000229673,4,Z73495.1,chr22,32783177,32784982,promoter:Z73495.1,BEL-1012-1,22,32780933,32780934,CCTT,-1.0,0.0,30
5,chr20:41707548:AAGG:0,intergenic,intergenic,6,.,.,-1,-1,intergenic,BEL-1012-1,20,41707548,41707549,AAGG,0.0,0.0,28
6,chr7:92128476:GT:2,intron,ENSG00000001630,5,CYP51A1,chr7,92112153,92134803,intron:CYP51A1,BEL-1012-1,7,92128476,92128479,GT,0.0,0.0,40


Quantidade de STRs a serem visualizadas e cromossomos disponíveis

In [14]:
# 1. Add "chr" prefix directly to the chrom column
df_annot <- df_annot %>%
  mutate(chrom = paste0("chr", chrom))

# 2. Define chromosomes with "chr" prefix
chromosomes_to_plot <- c(paste0("chr", 1:22), "chrX", "chrY")

# 3. Filter
df_filtered <- df_annot %>%
  filter(chrom %in% chromosomes_to_plot)

# 4. Create GRanges (already with correct chromosome names)
gr_strs <- GRanges(
  seqnames = df_filtered$chrom,
  ranges = IRanges(start = df_filtered$start, end = df_filtered$end)
)

# 5. Add metadata
mcols(gr_strs)$ID <- df_filtered$STRs_ID
mcols(gr_strs)$Region <- df_filtered$region
mcols(gr_strs)$Gene_Name <- df_filtered$gene_name

# 6. Verify
cat("STRs to plot:", length(gr_strs), "\n")
print(table(gr_strs$Region))
cat("\nChromosomes available for plotting:\n")
print(unique(seqnames(gr_strs)))


STRs to plot: 14913155 

            CDS  five_prime_utr      intergenic          intron          others 
          36290           61998         5427454         8185887           82992 
       promoter three_prime_utr 
         960444          158090 

Chromosomes available for plotting:
 [1] chr16 chr17 chr20 chr22 chr7  chr14 chr6  chr4  chr3  chr11 chr1  chr19
[13] chr8  chr10 chrY  chr5  chr9  chr2  chr18 chr15 chr12 chr13 chr21 chrX 
24 Levels: chr16 chr17 chr20 chr22 chr7 chr14 chr6 chr4 chr3 chr11 ... chrX


# Plots

Plot simples

In [19]:
# Create results directory if it doesn't exist
if (!dir.exists("results")) {
  dir.create("results")
  cat("Created 'results' directory\n")
}

# Verify data
print(paste("Total STRs to plot:", length(gr_strs)))
print(table(gr_strs$region))

# ========================================
# Option 1: Simple plot
# ========================================

# Open PNG device
png(filename = "results/karyotype_STRs_distribution.png", 
    width = 3000, 
    height = 2000, 
    res = 300)

# Create plot
kp <- plotKaryotype(genome = "hg38", 
                    chromosomes = chromosomes_to_plot,
                    plot.type = 1,
                    cex = 0.7)

kpPoints(kp, data = gr_strs, 
         y = 0.5,
         col = "darkblue",
         cex = 0.3,
         pch = 16)

#kpAddMainTitle(kp, "Distribution of STRs in the human genome", cex = 1.2)

# Close device and save
dev.off()

cat("Plot saved to: results/karyotype_STRs_distribution.png\n")


[1] "Total STRs to plot: 14913155"
< table of extent 0 >


Plot saved to: results/karyotype_STRs_distribution.png


Heatmap showing STR density per 1 MB

In [22]:
# Define standard somatic and sex chromosomes
chromosomes_to_plot <- c(paste0("chr", 1:22), "chrX", "chrY")

# Bin parameters
bin_size <- 1e6
chr_lengths <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)

# Create bins for chromosomes of interest
bins_list <- list()
for (chr in chromosomes_to_plot) {
  if (chr %in% names(chr_lengths)) {
    chr_len <- chr_lengths[chr]
    bins_df <- data.frame(
      chr = chr,
      start = seq(1, chr_len, by = bin_size),
      end = pmin(seq(1, chr_len, by = bin_size) + bin_size - 1, chr_len)
    )
    bins_list[[chr]] <- bins_df
  }
}

bins <- do.call(rbind, bins_list)
bins_gr <- GRanges(seqnames = bins$chr,
                   ranges = IRanges(start = bins$start, end = bins$end))

# Calculate the density of STRs in bins using your complete GRanges
overlaps <- countOverlaps(bins_gr, gr_strs)
mcols(bins_gr)$density <- overlaps

# Set color palette for heatmap
colors <- c("white", "yellow","#d94701", "#ee070bff")

# Set intervals for quantitative legend
breaks <- quantile(mcols(bins_gr)$density, probs = seq(0, 1, length.out = length(colors) + 1))
legend_labels <- paste0(round(breaks[-length(breaks)]), " - ", round(breaks[-1]))

png("results/density_STRs.png", 
   width = 24, height = 14, units = "in", res = 600)


# Create karyotype plot
kp <- plotKaryotype(genome = "hg38",
                    chromosomes = chromosomes_to_plot,
                    plot.type = 1,
                    cex = 1.2)

# Plot density heatmap
kpHeatmap(kp, data = bins_gr, y = bins_gr$density, colors = colors)

# Add quantitative legend to chart
legend(x = "bottomright",
       legend = legend_labels,
       fill = colors,
       title = "STRs density (per 1 MB)",
       bg = "white",
       cex = 1.5)

# Close the graphics device to save the file
dev.off()

cat("Plot saved to: results/density_STRs.png\n")

Plot saved to: results/density_STRs.png
