Permalink
Cannot retrieve contributors at this time
1954 lines (1641 sloc)
53.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package org.snpeff.interval; | |
| import java.util.ArrayList; | |
| import java.util.Collections; | |
| import java.util.HashSet; | |
| import java.util.LinkedList; | |
| import java.util.List; | |
| import java.util.Map; | |
| import java.util.Set; | |
| import org.snpeff.interval.codonChange.CodonChange; | |
| import org.snpeff.serializer.MarkerSerializer; | |
| import org.snpeff.snpEffect.Config; | |
| import org.snpeff.snpEffect.EffectType; | |
| import org.snpeff.snpEffect.VariantEffect.ErrorWarningType; | |
| import org.snpeff.snpEffect.VariantEffects; | |
| import org.snpeff.stats.ObservedOverExpectedCpG; | |
| import org.snpeff.util.Gpr; | |
| import org.snpeff.util.GprSeq; | |
| /** | |
| * Codon position | |
| * @author pcingola | |
| */ | |
| class CodonPosition { | |
| public int codonNum = -1; | |
| public int codonIndex = -1; | |
| } | |
| /** | |
| * Interval for a transcript, as well as some other information: exons, utrs, cds, etc. | |
| * | |
| * @author pcingola | |
| */ | |
| public class Transcript extends IntervalAndSubIntervals<Exon> { | |
| private static final long serialVersionUID = -2665025617916107311L; | |
| boolean aaCheck; // Has this transcript been checked against a protein sequence? | |
| boolean canonical; // Is this a canonical transcript? | |
| boolean corrected; // Have coordinates been corrected? (e.g. frame correction) | |
| boolean dnaCheck; // Has this transcript been checked against a CDS/cDNA sequence? | |
| boolean proteinCoding; // Is this a protein-coding transcript? | |
| boolean ribosomalSlippage; // Ribosomal slippage causes changes in reading frames. This might be represented as negative length introns (overlapping exons). | |
| int cdsStart, cdsEnd; // CDS start and end coordinates. Note: If the transcript is in reverse strand, then cdsStart > cdsEnd | |
| int spliceSiteSize, spliceRegionExonSize, spliceRegionIntronMin, spliceRegionIntronMax; // Splice sizes | |
| int upDownLength; // Upstream and downstream size | |
| BioType bioType; // Transcript biotype | |
| String cds; // Coding sequence | |
| String mRna; // mRna sequence (includes 5'UTR and 3'UTR) | |
| String protein; // Protein sequence | |
| String version = ""; // Transcript version | |
| List<Utr> utrs; // UTRs | |
| List<Cds> cdss; // CDS information | |
| List<Intron> introns; // Intron markers | |
| Upstream upstream; // Upstream interval | |
| Downstream downstream; // Downstream interval | |
| Exon firstCodingExon; // First coding exon. I.e. where transcription start site (TSS) is. | |
| int cds2pos[], aa2pos[]; | |
| TranscriptSupportLevel transcriptSupportLevel = null; | |
| public Transcript() { | |
| super(); | |
| utrs = new ArrayList<>(); | |
| cdss = new ArrayList<>(); | |
| type = EffectType.TRANSCRIPT; | |
| } | |
| public Transcript(Gene gene, int start, int end, boolean strandMinus, String id) { | |
| super(gene, start, end, strandMinus, id); | |
| type = EffectType.TRANSCRIPT; | |
| } | |
| /** | |
| * Calculate chromosome position as function of Amino Acid number | |
| * | |
| * @returns An array mapping 'pos[aaNumber] = chromosmalPos' | |
| */ | |
| public synchronized int[] aaNumber2Pos() { | |
| if (aa2pos != null) return aa2pos; | |
| calcCdsStartEnd(); | |
| aa2pos = new int[protein().length()]; | |
| for (int i = 0; i < aa2pos.length; i++) | |
| aa2pos[i] = -1; | |
| int cdsMin = Math.min(cdsStart, cdsEnd); | |
| int cdsMax = Math.max(cdsStart, cdsEnd); | |
| // For each exon, add CDS position to array | |
| int aaNum = 0; | |
| int step = isStrandPlus() ? 1 : -1; | |
| int codonFrame = 0; | |
| for (Exon exon : sortedStrand()) { | |
| int min = isStrandPlus() ? exon.getStart() : exon.getEnd(); | |
| int aaIdxStart = -1, aaIdxEnd = -1; | |
| for (int pos = min; exon.intersects(pos) && aaNum < aa2pos.length; pos += step) { | |
| // Is this within a CDS? | |
| if ((cdsMin <= pos) && (pos <= cdsMax)) { | |
| // Update AA indexes for this exon | |
| if (aaIdxStart < 0) aaIdxStart = aaNum; | |
| aaIdxEnd = aaNum; | |
| // First codon base? Add to map | |
| if (codonFrame == 0) aa2pos[aaNum] = pos; | |
| // Last base in this codon? Increment AA number | |
| if (codonFrame == 2) aaNum++; | |
| // Update codon base | |
| codonFrame = (codonFrame + 1) % 3; | |
| } | |
| } | |
| // Update exons' AA indexes | |
| if (aaIdxStart >= 0) exon.setAaIdx(aaIdxStart, aaIdxEnd); | |
| } | |
| return aa2pos; | |
| } | |
| /** | |
| * Find a genomic position of the first base in a Amino Acid 'aaNum' | |
| */ | |
| public int aaNumber2Pos(int aaNum) { | |
| aaNumber2Pos(); | |
| return -1; | |
| } | |
| /** | |
| * Add a CDS | |
| */ | |
| public void add(Cds cdsInt) { | |
| cdss.add(cdsInt); | |
| cds = null; | |
| } | |
| /** | |
| * Add an intron | |
| */ | |
| public void add(Intron intron) { | |
| if (introns == null) introns = new ArrayList<>(); | |
| introns.add(intron); | |
| // Introns should be sorted by strand | |
| if (isStrandPlus()) Collections.sort(introns); | |
| else Collections.sort(introns, Collections.reverseOrder()); | |
| } | |
| /** | |
| * Add a SpliceSite | |
| */ | |
| public void add(SpliceSite spliceSite) { | |
| for (Exon ex : this) | |
| if (ex.intersects(spliceSite)) ex.add(spliceSite); | |
| for (Intron intr : introns()) | |
| if (intr.intersects(spliceSite)) intr.add(spliceSite); | |
| } | |
| /** | |
| * Add a UTR | |
| */ | |
| public void add(Utr utr) { | |
| utrs.add(utr); | |
| cds = null; | |
| } | |
| /** | |
| * Add missing UTRs. See utrFromCds() method. | |
| */ | |
| boolean addMissingUtrs(Markers missingUtrs, boolean verbose) { | |
| missingUtrs.sort(false, isStrandMinus()); | |
| // Get min/max CDS positions | |
| int minCds = Integer.MAX_VALUE; | |
| int maxCds = 0; | |
| for (Cds c : cdss) { | |
| minCds = Math.min(minCds, c.getStart()); | |
| maxCds = Math.max(maxCds, c.getEnd()); | |
| } | |
| if (verbose) { | |
| System.out.println("Transcript '" + id + "' has missing UTRs." // | |
| + " Strand: " + (strandMinus ? "-" : "+") // | |
| + " (minCds: " + minCds // | |
| + " , maxCds: " + maxCds + "):" // | |
| ); | |
| } | |
| // Add intervals | |
| boolean retVal = false; | |
| for (Marker mu : missingUtrs) { | |
| Exon exon = queryExon(mu); | |
| if (exon == null) throw new RuntimeException("Cannot find exon for UTR: " + mu); | |
| Utr toAdd = null; | |
| if (isStrandPlus()) { | |
| if (mu.getEnd() <= minCds) toAdd = new Utr5prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId()); | |
| else if (mu.getStart() >= maxCds) toAdd = new Utr3prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId()); | |
| } else { | |
| if (mu.getStart() >= maxCds) toAdd = new Utr5prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId()); | |
| else if (mu.getEnd() <= minCds) toAdd = new Utr3prime(exon, mu.getStart(), mu.getEnd(), strandMinus, mu.getId()); | |
| } | |
| // OK? | |
| if (toAdd != null) { | |
| add(toAdd); | |
| if (verbose) System.out.println("\tAdding " + toAdd); | |
| retVal = true; | |
| } | |
| } | |
| return retVal; | |
| } | |
| /** | |
| * Adjust transcript coordinates | |
| */ | |
| public boolean adjust() { | |
| boolean changed = false; | |
| int strandSumTr = 0; | |
| int newStart = Integer.MAX_VALUE; | |
| int newEnd = Integer.MIN_VALUE; | |
| int countStrandPlus = 0, countStrandMinus = 0; | |
| for (Exon exon : sortedStrand()) { | |
| newStart = Math.min(newStart, exon.getStart()); | |
| newEnd = Math.max(newEnd, exon.getEnd()); | |
| // Common exon strand | |
| if (exon.isStrandPlus()) countStrandPlus++; | |
| else countStrandMinus++; | |
| } | |
| // UTRs | |
| for (Utr utr : getUtrs()) { | |
| newStart = Math.min(newStart, utr.getStart()); | |
| newEnd = Math.max(newEnd, utr.getEnd()); | |
| } | |
| // Sanity check | |
| strandSumTr = countStrandPlus - countStrandMinus; // Some exons have incorrect strands, we use the strand indicated by most exons | |
| boolean newStrandMinus = strandSumTr < 0; | |
| if ((countStrandPlus > 0) && (countStrandMinus > 0)) Gpr.debug("Transcript '" + id + "' has " + countStrandPlus + " exons on the plus and " + countStrandMinus + " exons on the minus strand! This should never happen!"); | |
| // Change transcript strand? | |
| if (strandMinus != newStrandMinus) { | |
| changed = true; | |
| setStrandMinus(newStrandMinus); // Change strand | |
| } | |
| // Changed? Update values | |
| if (newStart < Integer.MAX_VALUE && newEnd > Integer.MIN_VALUE) { | |
| // Change start? | |
| if (start != newStart) { | |
| setStart(newStart); | |
| changed = true; | |
| } | |
| // Change end? | |
| if (end != newEnd) { | |
| setEnd(newEnd); | |
| changed = true; | |
| } | |
| } | |
| return changed; | |
| } | |
| /** | |
| * Create a new transcript after applying changes in variant | |
| * | |
| * Note: If this transcript is unaffected, no new transcript is created (same transcript is returned) | |
| */ | |
| @Override | |
| public Transcript apply(Variant variant) { | |
| // Variant after this marker: No effect | |
| if (!shouldApply(variant)) return this; | |
| //--- | |
| // Create new transcript | |
| //--- | |
| Transcript newTr = (Transcript) super.apply(variant); | |
| if (newTr == null) return null; | |
| //--- | |
| // Add changed UTRs | |
| //--- | |
| for (Utr utr : utrs) { | |
| Utr newUtr = (Utr) utr.apply(variant); | |
| if (newUtr != null) { | |
| Exon newExon = newTr.findExon(newUtr); | |
| if (newExon != null) { | |
| newUtr.setParent(newExon); | |
| newTr.utrs.add(newUtr); | |
| } else { | |
| // This might happen when a duplication affecting part of an exon | |
| // E.g. If the duplication affects the coding part and NOT the 3'UTR then the UTR doesn't have a | |
| if (Config.get().isDebug()) Gpr.debug("WARNING: applying variant: Could not find 'new' parent exon for 'new' UTR" // | |
| + "\n\t\tVariant : " + variant // | |
| + "\n" // | |
| + "\n\t\tUTR (ori) :" + utr // | |
| + "\n" // | |
| + "\n\t\tTranscript (ori) :" + this // | |
| + "\n" // | |
| + "\n\t\tUTR (new) :" + newUtr // | |
| + "\n" // | |
| + "\n\t\tTranscript (new) :" + newTr // | |
| ); | |
| } | |
| } | |
| } | |
| // Rank exons: if a variant deletes one or more exons, ranks will change | |
| newTr.rankExons(); | |
| // Up & Down stream | |
| newTr.createUpDownStream(upDownLength); | |
| // Introns | |
| newTr.introns(); | |
| // Splice sites | |
| newTr.createSpliceSites(spliceSiteSize, spliceRegionExonSize, spliceRegionIntronMin, spliceRegionIntronMax); | |
| return newTr; | |
| } | |
| /** | |
| * Find base at genomic coordinate 'pos' | |
| */ | |
| public String baseAt(int pos) { | |
| calcCdsStartEnd(); | |
| Exon ex = findExon(pos); | |
| if (ex == null) return null; | |
| return ex.basesAt(pos - ex.getStart(), 1); | |
| } | |
| /** | |
| * Calculate distance from transcript start to a position | |
| * mRNA is roughly the same than cDNA. Strictly speaking mRNA | |
| * has a poly-A tail and 5'cap. | |
| */ | |
| public synchronized int baseNumber2MRnaPos(int pos) { | |
| int count = 0; | |
| for (Exon eint : sortedStrand()) { | |
| if (eint.intersects(pos)) { | |
| // Intersect this exon? Calculate the number of bases from the beginning | |
| int dist = 0; | |
| if (isStrandPlus()) dist = pos - eint.getStart(); | |
| else dist = eint.getEnd() - pos; | |
| // Sanity check | |
| if (dist < 0) throw new RuntimeException("Negative distance for position " + pos + ". This should never happen!\n" + this); | |
| return count + dist; | |
| } | |
| count += eint.size(); | |
| } | |
| return -1; | |
| } | |
| /** | |
| * Calculate base number in a CDS where 'pos' maps | |
| * | |
| * @param usePrevBaseIntron: When 'pos' is intronic this method returns: | |
| * - if( usePrevBaseIntron== false) => The first base in the exon after 'pos' (i.e. first coding base after intron) | |
| * - if( usePrevBaseIntron== true) => The last base in the exon before 'pos' (i.e. last coding base before intron) | |
| * | |
| * @returns Base number or '-1' if it does not map to a coding base | |
| */ | |
| public synchronized int baseNumberCds(int pos, boolean usePrevBaseIntron) { | |
| // Doesn't hit this transcript? | |
| if (!intersects(pos)) return -1; | |
| // Is it in UTR instead of CDS? | |
| if (isUtr(pos)) return -1; | |
| // Calculate cdsStart and cdsEnd (if not already done) | |
| calcCdsStartEnd(); | |
| // All exons.. | |
| int firstCdsBaseInExon = 0; // Where the exon maps to the CDS (i.e. which CDS base number does the first base in this exon maps to). | |
| for (Exon eint : sortedStrand()) { | |
| if (eint.intersects(pos)) { | |
| int cdsBaseInExon; // cdsBaseInExon: base number relative to the beginning of the coding part of this exon (i.e. excluding 5'UTRs) | |
| if (isStrandPlus()) cdsBaseInExon = pos - Math.max(eint.getStart(), cdsStart); | |
| else cdsBaseInExon = Math.min(eint.getEnd(), cdsStart) - pos; | |
| cdsBaseInExon = Math.max(0, cdsBaseInExon); | |
| return firstCdsBaseInExon + cdsBaseInExon; | |
| } else { | |
| // Before exon begins? | |
| if ((isStrandPlus() && (pos < eint.getStart())) // Before exon begins (positive strand)? | |
| || (isStrandMinus() && (pos > eint.getEnd()))) // Before exon begins (negative strand)? | |
| return firstCdsBaseInExon - (usePrevBaseIntron ? 1 : 0); | |
| } | |
| if (isStrandPlus()) firstCdsBaseInExon += Math.max(0, eint.getEnd() - Math.max(eint.getStart(), cdsStart) + 1); | |
| else firstCdsBaseInExon += Math.max(0, Math.min(cdsStart, eint.getEnd()) - eint.getStart() + 1); | |
| } | |
| return firstCdsBaseInExon - 1; | |
| } | |
| /** | |
| * Return a codon that includes 'cdsBaseNumber' | |
| */ | |
| public String baseNumberCds2Codon(int cdsBaseNumber) { | |
| int codonNum = cdsBaseNumber / CodonChange.CODON_SIZE; | |
| int min = codonNum * CodonChange.CODON_SIZE; | |
| int max = codonNum * CodonChange.CODON_SIZE + CodonChange.CODON_SIZE; | |
| if ((min >= 0) && (max <= cds().length())) return cds().substring(min, max).toUpperCase(); | |
| return null; | |
| } | |
| /** | |
| * Calculate chromosome position as function of CDS number | |
| * | |
| * @returns An array mapping 'cds2pos[cdsBaseNumber] = chromosmalPos' | |
| */ | |
| public synchronized int[] baseNumberCds2Pos() { | |
| if (cds2pos != null) return cds2pos; | |
| calcCdsStartEnd(); | |
| cds2pos = new int[cds().length()]; | |
| for (int i = 0; i < cds2pos.length; i++) | |
| cds2pos[i] = -1; | |
| int cdsMin = Math.min(cdsStart, cdsEnd); | |
| int cdsMax = Math.max(cdsStart, cdsEnd); | |
| // For each exon, add CDS position to array | |
| int cdsBaseNum = 0; | |
| for (Exon exon : sortedStrand()) { | |
| int min = isStrandPlus() ? exon.getStart() : exon.getEnd(); | |
| int step = isStrandPlus() ? 1 : -1; | |
| for (int pos = min; exon.intersects(pos) && cdsBaseNum < cds2pos.length; pos += step) | |
| if ((cdsMin <= pos) && (pos <= cdsMax)) cds2pos[cdsBaseNum++] = pos; | |
| } | |
| return cds2pos; | |
| } | |
| /** | |
| * Calculate CDS start and CDS end | |
| */ | |
| synchronized void calcCdsStartEnd() { | |
| // Do we need to calculate these values? | |
| // Note: In circular genomes, one of cdsStart / cdsEnd might be less | |
| // than zero (we must check both) | |
| if ((cdsStart < 0) && (cdsEnd < 0)) { | |
| // Calculate coding start (after 5 prime UTR) | |
| if (utrs.isEmpty()) { | |
| // No UTRs => Use all exons | |
| cdsStart = (isStrandPlus() ? end : start); // cdsStart is the position of the first base in the CDS (i.e. the first base after all 5'UTR) | |
| cdsEnd = (isStrandPlus() ? start : end); // cdsEnd is the position of the last base in the CDS (i.e. the first base before all 3'UTR) | |
| for (Exon ex : this) { | |
| if (isStrandPlus()) { | |
| cdsStart = Math.min(cdsStart, ex.getStart()); | |
| cdsEnd = Math.max(cdsEnd, ex.getEnd()); | |
| } else { | |
| cdsStart = Math.max(cdsStart, ex.getEnd()); | |
| cdsEnd = Math.min(cdsEnd, ex.getStart()); | |
| } | |
| } | |
| } else { | |
| // We have to take into account UTRs | |
| cdsStart = (isStrandPlus() ? start : end); // cdsStart is the position of the first base in the CDS (i.e. the first base after all 5'UTR) | |
| cdsEnd = (isStrandPlus() ? end : start); // cdsEnd is the position of the last base in the CDS (i.e. the first base before all 3'UTR) | |
| int cdsStartNotExon = cdsStart; | |
| for (Utr utr : utrs) { | |
| if (utr instanceof Utr5prime) { | |
| if (isStrandPlus()) cdsStart = Math.max(cdsStart, utr.getEnd() + 1); | |
| else cdsStart = Math.min(cdsStart, utr.getStart() - 1); | |
| } else if (utr instanceof Utr3prime) { | |
| if (isStrandPlus()) cdsEnd = Math.min(cdsEnd, utr.getStart() - 1); | |
| else cdsEnd = Math.max(cdsEnd, utr.getEnd() + 1); | |
| } | |
| } | |
| // Make sure cdsStart and cdsEnd lie within an exon | |
| if (isStrandPlus()) { | |
| cdsStart = firstExonPositionAfter(cdsStart); | |
| cdsEnd = lastExonPositionBefore(cdsEnd); | |
| } else { | |
| cdsStart = lastExonPositionBefore(cdsStart); | |
| cdsEnd = firstExonPositionAfter(cdsEnd); | |
| } | |
| // We were not able to find cdsStart & cdsEnd within exon limits. | |
| // Probably there is something wrong with the database and the transcript does | |
| // not have a single coding base (e.g. all of it is UTR). | |
| if (cdsStart < 0 || cdsEnd < 0) cdsStart = cdsEnd = cdsStartNotExon; | |
| } | |
| } | |
| } | |
| /** | |
| * Retrieve coding sequence | |
| */ | |
| public synchronized String cds() { | |
| if (cds != null) return cds; | |
| // Concatenate all exons | |
| List<Exon> exons = sortedStrand(); | |
| StringBuilder sequence = new StringBuilder(); | |
| int utr5len = 0, utr3len = 0; | |
| // 5 prime UTR length | |
| for (Utr utr : get5primeUtrs()) | |
| utr5len += utr.size(); | |
| // Append all exon sequences | |
| boolean missingSequence = false; | |
| for (Exon exon : exons) { | |
| missingSequence |= !exon.hasSequence(); // If there is no sequence, we are in trouble | |
| sequence.append(exon.getSequence()); | |
| } | |
| if (missingSequence) cds = ""; // One or more exons does not have sequence. Nothing to do | |
| else { | |
| // OK, all exons have sequences | |
| // 3 prime UTR length | |
| for (Utr utr : get3primeUtrs()) | |
| utr3len += utr.size(); | |
| // Cut 5 prime UTR and 3 prime UTR points | |
| int subEnd = sequence.length() - utr3len; | |
| if (utr5len > subEnd) cds = ""; | |
| else cds = sequence.substring(utr5len, subEnd); | |
| } | |
| return cds; | |
| } | |
| /** | |
| * Create a marker of the coding region in this transcript | |
| */ | |
| public Marker cdsMarker() { | |
| return isStrandPlus() // | |
| ? new Marker(this, getCdsStart(), getCdsEnd()) // | |
| : new Marker(this, getCdsEnd(), getCdsStart()) // | |
| ; | |
| } | |
| @Override | |
| public Transcript cloneShallow() { | |
| Transcript clone = (Transcript) super.cloneShallow(); | |
| clone.proteinCoding = proteinCoding; | |
| clone.canonical = canonical; | |
| clone.bioType = bioType; | |
| clone.aaCheck = aaCheck; | |
| clone.dnaCheck = dnaCheck; | |
| clone.corrected = corrected; | |
| clone.ribosomalSlippage = ribosomalSlippage; | |
| return clone; | |
| } | |
| /** | |
| * Return an array of 3 genomic positions where amino acid number 'aaNum' maps | |
| * | |
| * @return aa2pos[0], aa2pos[1], aa2pos[2] are the coordinates (within the chromosome) | |
| * of the three bases conforming codon 'aaNum'. Any aa2pos[i] = -1 means that | |
| * it could a base in the codon could not be mapped. | |
| */ | |
| public int[] codonNumber2Pos(int codonNum) { | |
| if (cds2pos == null) baseNumberCds2Pos(); | |
| // Initialize | |
| int codon[] = new int[3]; | |
| int step = isStrandPlus() ? 1 : -1; | |
| int idxStart = isStrandPlus() ? 0 : 2; | |
| for (int i = idxStart, j = 3 * codonNum; (i < codon.length) && (i >= 0) && j < cds2pos.length; i += step, j++) { | |
| codon[i] = cds2pos[j]; | |
| } | |
| return codon; | |
| } | |
| /** | |
| * Collapses exons having gaps of zero (i.e. exons that followed by other exons). | |
| * Does the same for CDSs and UTRs. | |
| * @return true of any exon in the transcript was 'collapsed' | |
| */ | |
| public boolean collapseZeroGap() { | |
| if (ribosomalSlippage) return false; // Overlapping exons are representing ribosomal slippage, so they are not annotations errors and must not be corrected. | |
| boolean ret = false; | |
| introns = null; // These need to be recalculated | |
| //--- | |
| // Collapse Exons | |
| //--- | |
| Markers markers = new Markers(); | |
| markers.addAll(subIntervals()); | |
| Map<Marker, Marker> collapse = MarkerUtil.collapseZeroGap(markers); // Create a map of collapsed exons | |
| // Replace exons | |
| for (Marker exon : collapse.keySet()) { | |
| Exon collapsedExon = (Exon) collapse.get(exon); | |
| // Is this exon to be replaced? (i.e. collapseZeroGap returns different coordinates) | |
| if (exon.size() != collapsedExon.size() // | |
| || exon.getStart() != collapsedExon.getStart() // | |
| || exon.getEnd() != collapsedExon.getEnd() // | |
| ) { | |
| ret = true; | |
| // Show debugging information | |
| if (Config.get().isDebug()) System.err.println("\t\t\tTranscript " + getId() + ": Collapsing exon " + exon.getId() + "\t[ " + exon.getStart() + " - " + exon.getEnd() + " ]\t=>\t[ " + collapsedExon.getStart() + " - " + collapsedExon.getEnd() + " ]"); | |
| // Replace exon | |
| remove((Exon) exon); | |
| if (!containsId(collapsedExon.getId())) add(collapsedExon); // Add collapsedExon. Make sure we don't add it twice (since many exons can be collapsed into one). | |
| // Change parent exon in UTRs | |
| for (Marker m : getUtrs()) { | |
| Utr utr = (Utr) m; | |
| if (utr.getParent() == exon) utr.setParent(collapsedExon); | |
| } | |
| } | |
| } | |
| //--- | |
| // Collapse CDS | |
| //--- | |
| collapse = MarkerUtil.collapseZeroGap(new Markers(cdss)); | |
| cdss = new ArrayList<>(); // Re-create CDSs list | |
| Markers uniqCollapsedCds = new Markers(collapse.values()).unique(); // Create a set of unique CDSs and add them to CDSs list | |
| for (Marker cds : uniqCollapsedCds) | |
| cdss.add((Cds) cds); | |
| //--- | |
| // Collapse UTRs | |
| //--- | |
| collapse = MarkerUtil.collapseZeroGap(new Markers(utrs)); | |
| Markers uniqCollapsedUtrs = new Markers(collapse.values()).unique(); // Create a set of unique UTRs, and add them to the list | |
| utrs = new ArrayList<>(); // Re-generate UTRs list | |
| for (Marker utr : uniqCollapsedUtrs) | |
| utrs.add((Utr) utr); | |
| return ret; | |
| } | |
| /** | |
| * Calculate CpG bias: number of CpG / expected[CpG] | |
| */ | |
| public double cpgExonBias() { | |
| ObservedOverExpectedCpG oe = new ObservedOverExpectedCpG(); | |
| return oe.oe(this); | |
| } | |
| /** | |
| * Count total CpG in this transcript's exons | |
| */ | |
| public int cpgExons() { | |
| ObservedOverExpectedCpG oe = new ObservedOverExpectedCpG(); | |
| return oe.observed(this); | |
| } | |
| /** | |
| * Find all splice sites. | |
| */ | |
| public void createSpliceSites(int spliceSiteSize, int spliceRegionExonSize, int spliceRegionIntronMin, int spliceRegionIntronMax) { | |
| this.spliceSiteSize = spliceSiteSize; | |
| this.spliceRegionExonSize = spliceRegionExonSize; | |
| this.spliceRegionIntronMin = spliceRegionIntronMin; | |
| this.spliceRegionIntronMax = spliceRegionIntronMax; | |
| // Create spliceSiteRegion on the Exon side | |
| ArrayList<Exon> exons = (ArrayList<Exon>) sortedStrand(); | |
| if (exons.size() > 0) { | |
| for (int i = 0; i < exons.size(); i++) { // Iterate like this to check rank | |
| Exon exon = exons.get(i); | |
| if (i > 0) exon.createSpliceSiteRegionStart(spliceRegionExonSize); // Splice site region at the start | |
| if (i < (exon.size() - 1)) exon.createSpliceSiteRegionEnd(spliceRegionExonSize); // Splice site region at the end | |
| // Sanity check | |
| int rank = i + 1; | |
| if (exon.getRank() != rank) { | |
| String msg = "Rank numbers do not march: " + rank + " != " + exon.getRank() + "\n\tTranscript: " + this; | |
| throw new RuntimeException(msg); | |
| } | |
| } | |
| } | |
| // Create spliceSite (donor/acceptor) and spliceSiteRegion on the Intron side | |
| List<Intron> introns = introns(); | |
| if (introns != null) { | |
| for (int i = 0; i < introns.size(); i++) { | |
| Intron intron = introns.get(i); | |
| intron.createSpliceSiteAcceptor(spliceSiteSize); // Acceptor splice site | |
| intron.createSpliceSiteDonor(spliceSiteSize); // Acceptor splice site | |
| // Splice region | |
| intron.createSpliceSiteRegionStart(spliceRegionIntronMin, spliceRegionIntronMax); | |
| intron.createSpliceSiteRegionEnd(spliceRegionIntronMin, spliceRegionIntronMax); | |
| } | |
| } | |
| } | |
| /** | |
| * Creates a list of UP/DOWN stream regions (for each transcript) | |
| * Upstream (downstream) stream is defined as upDownLength before (after) transcript | |
| */ | |
| public void createUpDownStream(int upDownLength) { | |
| this.upDownLength = upDownLength; | |
| Chromosome chr = getChromosome(); | |
| int chrMin = chr.getStart(), chrMax = chr.getEnd(); | |
| // Create up/down stream intervals and add them to the list | |
| int beforeStart = Math.max(start - upDownLength, chrMin); | |
| int beforeEnd = Math.max(start - 1, chrMin); | |
| int afterStart = Math.min(end + 1, chrMax); | |
| int afterEnd = Math.min(end + upDownLength, chrMax); | |
| if (isStrandPlus()) { | |
| if (beforeStart < beforeEnd) upstream = new Upstream(this, beforeStart, beforeEnd, false, id); | |
| if (afterStart < afterEnd) downstream = new Downstream(this, afterStart, afterEnd, false, id); | |
| } else { | |
| if (afterStart < afterEnd) upstream = new Upstream(this, afterStart, afterEnd, false, id); | |
| if (beforeStart < beforeEnd) downstream = new Downstream(this, beforeStart, beforeEnd, false, id); | |
| } | |
| } | |
| /** | |
| * Deletes redundant exons (i.e. exons that are totally included in other exons). | |
| * Does the same for CDSs. | |
| Does the same for UTRs. | |
| */ | |
| public boolean deleteRedundant() { | |
| boolean ret = false; | |
| introns = null; // These need to be recalculated | |
| //--- | |
| // Delete redundant exons | |
| //--- | |
| Map<Marker, Marker> includedIn = MarkerUtil.redundant(subIntervals()); | |
| for (Marker exon : includedIn.keySet()) { | |
| ret = true; | |
| remove((Exon) exon); | |
| // Change parent exon in UTRs | |
| for (Marker m : getUtrs()) { | |
| Utr utr = (Utr) m; | |
| if (utr.getParent() == exon) utr.setParent(includedIn.get(exon)); | |
| } | |
| } | |
| //--- | |
| // Delete redundant CDS | |
| //--- | |
| includedIn = MarkerUtil.redundant(cdss); | |
| for (Marker cds : includedIn.keySet()) | |
| cdss.remove(cds); | |
| //--- | |
| // Delete redundant UTRs | |
| //--- | |
| includedIn = MarkerUtil.redundant(utrs); | |
| for (Marker utr : includedIn.keySet()) | |
| utrs.remove(utr); | |
| return ret; | |
| } | |
| /** | |
| * Find a CDS that matches exactly the exon | |
| */ | |
| public Cds findCds(Exon exon) { | |
| for (Cds cds : cdss) | |
| if (exon.includes(cds)) return cds; | |
| return null; | |
| } | |
| /** | |
| * Return the an exon that intersects 'pos' | |
| */ | |
| public Exon findExon(int pos) { | |
| for (Exon exon : this) | |
| if (exon.intersects(pos)) return exon; | |
| return null; | |
| } | |
| /** | |
| * Return an exon intersecting 'marker' (first exon found) | |
| */ | |
| public Exon findExon(Marker marker) { | |
| for (Exon exon : this) | |
| if (exon.intersects(marker)) return exon; | |
| return null; | |
| } | |
| /** | |
| * Return an intron overlapping position 'pos' | |
| */ | |
| public Intron findIntron(int pos) { | |
| // Is 'pos' in intron? | |
| for (Intron intron : introns()) | |
| if (intron.intersects(pos)) return intron; | |
| return null; | |
| } | |
| /** | |
| * Return the UTR that hits position 'pos' | |
| * @return An UTR intersecting 'pos' (null if not found) | |
| */ | |
| public Utr findUtr(int pos) { | |
| // Is it in UTR? | |
| for (Utr utr : utrs) | |
| if (utr.intersects(pos)) return utr; | |
| return null; | |
| } | |
| /** | |
| * Return the UTR that intersects 'marker' (null if not found) | |
| */ | |
| public List<Utr> findUtrs(Marker marker) { | |
| List<Utr> utrs = new LinkedList<>(); | |
| // Is it in UTR instead of CDS? | |
| for (Utr utr : utrs) | |
| if (utr.intersects(marker)) utrs.add(utr); | |
| return utrs.isEmpty() ? null : utrs; | |
| } | |
| /** | |
| * Find the first position after 'pos' within an exon | |
| */ | |
| int firstExonPositionAfter(int pos) { | |
| for (Exon ex : sorted()) { | |
| if (pos <= ex.getStart()) return ex.getStart(); | |
| if (pos <= ex.getEnd()) return pos; | |
| } | |
| System.err.println("WARNING: Cannot find first exonic position after " + pos + " for transcript '" + id + "'"); | |
| return -1; | |
| } | |
| /** | |
| * Correct exons based on frame information. | |
| * | |
| * E.g. if the frame information (form a genomic | |
| * database file, such as a GTF) does not | |
| * match the calculated frame, we correct exon's | |
| * boundaries to make them match. | |
| * | |
| * This is performed in two stages: | |
| * i) First exon is corrected by adding a fake 5'UTR | |
| * ii) Other exons are corrected by changing the start (or end) coordinates. | |
| */ | |
| public synchronized boolean frameCorrection() { | |
| // Copy frame information form CDSs to Exons (if missing) | |
| frameFromCds(); | |
| // First exon is corrected by adding a fake 5'UTR | |
| boolean changedFirst = frameCorrectionFirstCodingExon(); | |
| // Other exons are corrected by changing the start (or end) coordinates. | |
| // boolean changedNonFirst = false; | |
| // Gpr.debug("UNCOMMENT!"); | |
| boolean changedNonFirst = frameCorrectionNonFirstCodingExon(); | |
| boolean changed = changedFirst || changedNonFirst; | |
| // We have to reset cached CDS data if anything changed | |
| if (changed) { | |
| resetCache(); | |
| corrected = true; | |
| } | |
| // Return true if there was any adjustment | |
| return changed; | |
| } | |
| /** | |
| * Fix transcripts having non-zero frames in first exon | |
| * | |
| * Transcripts whose first exon has a non-zero frame indicate problems. | |
| * We add a 'fake' UTR5 to compensate for reading frame. | |
| */ | |
| synchronized boolean frameCorrectionFirstCodingExon() { | |
| List<Exon> exons = sortedStrand(); | |
| // No exons? Nothing to do | |
| if ((exons == null) || exons.isEmpty()) return false; | |
| Exon exonFirst = getFirstCodingExon(); // Get first exon | |
| // Exon exonFirst = exons.get(0); // Get first exon | |
| if (exonFirst.getFrame() <= 0) return false; // Frame OK (or missing), nothing to do | |
| // First exon is not zero? => Create a UTR5 prime to compensate | |
| Utr5prime utr5 = null; | |
| int frame = exonFirst.getFrame(); | |
| if (isStrandPlus()) { | |
| int end = exonFirst.getStart() + (frame - 1); | |
| utr5 = new Utr5prime(exonFirst, exonFirst.getStart(), end, isStrandMinus(), exonFirst.getId()); | |
| } else { | |
| int start = exonFirst.getEnd() - (frame - 1); | |
| utr5 = new Utr5prime(exonFirst, start, exonFirst.getEnd(), isStrandMinus(), exonFirst.getId()); | |
| } | |
| // Reset frame, since it was already corrected | |
| exonFirst.setFrame(0); | |
| Cds cds = findCds(exonFirst); | |
| if (cds != null) cds.frameCorrection(cds.getFrame()); | |
| // Add UTR5' | |
| add(utr5); | |
| // Update counter | |
| return true; | |
| } | |
| /** | |
| * Correct exons according to frame information | |
| */ | |
| synchronized boolean frameCorrectionNonFirstCodingExon() { | |
| boolean corrected = false; | |
| // Concatenate all exons to create a CDS | |
| List<Exon> exons = sortedStrand(); | |
| StringBuilder sequence = new StringBuilder(); | |
| // We don't need to correct if there is no sequence! (the problem only exists due to sequence frame) | |
| for (Exon exon : exons) | |
| if (!exon.hasSequence()) return false; | |
| // 5'UTR length | |
| int utr5Start = Integer.MAX_VALUE, utr5End = -1; | |
| for (Utr utr : get5primeUtrs()) { | |
| utr.size(); | |
| utr5Start = Math.min(utr5Start, utr.getStart()); | |
| utr5End = Math.max(utr5End, utr.getEnd()); | |
| } | |
| // Create UTR | |
| Marker utr5 = utr5End >= 0 ? new Marker(this, utr5Start, utr5End, strandMinus, "") : null; | |
| // Append all exon sequences | |
| for (Exon exon : exons) { | |
| String seq = ""; | |
| int utrOverlap = 0; | |
| boolean utrIncluded = false; | |
| // Check if exon overlaps UTR | |
| if (utr5 != null && utr5.includes(exon)) { | |
| // The whole exon is included => No sequence change | |
| seq = ""; | |
| utrIncluded = true; | |
| } else { | |
| // Add sequence | |
| seq = exon.getSequence(); | |
| if (utr5 != null && utr5.intersects(exon)) { | |
| utrOverlap = utr5.intersectSize(exon); | |
| if (utrOverlap > 0) { | |
| if (utrOverlap < seq.length()) seq = seq.substring(utrOverlap); | |
| else seq = ""; | |
| } | |
| } | |
| } | |
| //--- | |
| // Frame check | |
| //--- | |
| if (exon.getFrame() < 0) { | |
| // Nothing to do (assume current frame is right) | |
| } else if (utrIncluded) { | |
| // Exon does not code, frame information is meaningless | |
| if (exon.getFrame() >= 0) exon.setFrame(-1); | |
| } else { | |
| // Calculate frame | |
| // We use GFF style frame calculation | |
| // References: http://mblab.wustl.edu/GTF22.html | |
| int frameReal = FrameType.GFF.frameFromLength(sequence.length()); | |
| // Does calculated frame match? | |
| if (frameReal != exon.getFrame()) { | |
| if (utrOverlap > 0) { | |
| throw new RuntimeException("Fatal Error: First exon needs correction: This should never happen!"// | |
| + "\n\tThis method is supposed to be called AFTER method"// | |
| + "\n\tSnpEffPredictorFactory.frameCorrectionFirstCodingExon(), which"// | |
| + "\n\tshould have taken care of this problem." // | |
| + "\n\t" + this // | |
| ); | |
| } else { | |
| if (Config.get().isDebug()) { | |
| System.err.println("\t\tFrame correction: " // | |
| + "Position " + toStr() // | |
| + "Transcript '" + getId() + "'" // | |
| + "\tExon rank " + exon.getRank() // | |
| + "\tExpected frame: " + frameReal // | |
| + "\tExon frame: " + exon.getFrame() // | |
| + "\tSequence len: " + sequence.length() // | |
| ); | |
| } | |
| // Find matching CDS | |
| Cds cdsToCorrect = findCds(exon); | |
| // Correct exon until we get the expected frame | |
| for (boolean ok = true; ok && frameReal != exon.getFrame();) { | |
| // Correct both Exon and CDS | |
| ok &= exon.frameCorrection(1); | |
| if (cdsToCorrect != null) cdsToCorrect.frameCorrection(1); | |
| corrected = true; | |
| } | |
| // Get new exon's sequence | |
| seq = exon.getSequence(); | |
| } | |
| } | |
| } | |
| // Append sequence | |
| sequence.append(seq); | |
| } | |
| return corrected; | |
| } | |
| /** | |
| * Copy frame info from CDSs into Exons | |
| */ | |
| void frameFromCds() { | |
| for (Exon ex : this) { | |
| // No frame info? => try to find matching CDS | |
| if (ex.getFrame() < 0) { | |
| // Check a CDS that matches an exon | |
| for (Cds cds : getCds()) { | |
| // CDS matches the exon coordinates? => Copy frame info | |
| if (isStrandPlus() && (ex.getStart() == cds.getStart())) { | |
| ex.setFrame(cds.getFrame()); | |
| break; | |
| } else if (isStrandMinus() && (ex.getEnd() == cds.getEnd())) { | |
| ex.setFrame(cds.getFrame()); | |
| break; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * Create a list of 3 prime UTRs | |
| */ | |
| public List<Utr3prime> get3primeUtrs() { | |
| ArrayList<Utr3prime> list = new ArrayList<>(); | |
| for (Utr utr : utrs) | |
| if (utr instanceof Utr3prime) list.add((Utr3prime) utr); | |
| return list; | |
| } | |
| public List<Utr3prime> get3primeUtrsSorted() { | |
| List<Utr3prime> list = get3primeUtrs(); | |
| Collections.sort(list); | |
| return list; | |
| } | |
| /** | |
| * Create a list of 5 prime UTRs | |
| */ | |
| public List<Utr5prime> get5primeUtrs() { | |
| ArrayList<Utr5prime> list = new ArrayList<>(); | |
| for (Utr utr : utrs) | |
| if (utr instanceof Utr5prime) list.add((Utr5prime) utr); | |
| return list; | |
| } | |
| public List<Utr5prime> get5primeUtrsSorted() { | |
| List<Utr5prime> list = get5primeUtrs(); | |
| Collections.sort(list); | |
| return list; | |
| } | |
| public BioType getBioType() { | |
| return bioType; | |
| } | |
| /** | |
| * Get all CDSs | |
| */ | |
| public List<Cds> getCds() { | |
| return cdss; | |
| } | |
| public int getCdsEnd() { | |
| calcCdsStartEnd(); | |
| return cdsEnd; | |
| } | |
| public int getCdsStart() { | |
| calcCdsStartEnd(); | |
| return cdsStart; | |
| } | |
| public Downstream getDownstream() { | |
| return downstream; | |
| } | |
| /** | |
| * Get first coding exon | |
| */ | |
| public synchronized Exon getFirstCodingExon() { | |
| if (firstCodingExon == null) { | |
| // Get transcription start position | |
| long cstart = getCdsStart(); | |
| // Pick exon intersecting cdsStart (TSS) | |
| for (Exon exon : sortedStrand()) | |
| if (exon.intersects(cstart)) firstCodingExon = exon; | |
| // Sanity check | |
| if (firstCodingExon == null) throw new RuntimeException("Error: Cannot find first coding exon for transcript:\n" + this); | |
| } | |
| return firstCodingExon; | |
| } | |
| public TranscriptSupportLevel getTranscriptSupportLevel() { | |
| return transcriptSupportLevel; | |
| } | |
| /** | |
| * Create a TSS marker | |
| */ | |
| public Marker getTss() { | |
| calcCdsStartEnd(); | |
| Marker tss = new Marker(this, start + (isStrandPlus() ? 0 : -1), start + (isStrandPlus() ? 1 : 0), false, "TSS_" + id); | |
| return tss; | |
| } | |
| public Upstream getUpstream() { | |
| return upstream; | |
| } | |
| /** | |
| * Get all UTRs | |
| */ | |
| public List<Utr> getUtrs() { | |
| return utrs; | |
| } | |
| public String getVersion() { | |
| return version; | |
| } | |
| /** | |
| * Does this transcript have any errors? | |
| */ | |
| public boolean hasError() { | |
| return isErrorProteinLength() || isErrorStartCodon() || isErrorStopCodonsInCds(); | |
| } | |
| /** | |
| * Does this transcript have any errors? | |
| */ | |
| public boolean hasErrorOrWarning() { | |
| return hasError() || hasWarning(); | |
| } | |
| public boolean hasTranscriptSupportLevelInfo() { | |
| return (transcriptSupportLevel != null) && (transcriptSupportLevel != TranscriptSupportLevel.TSL_NA); | |
| } | |
| /** | |
| * Does this transcript have any errors? | |
| */ | |
| public boolean hasWarning() { | |
| return isWarningStopCodon() // All possible warnings | |
| ; | |
| } | |
| /** | |
| * Get all introns (lazy init) | |
| */ | |
| public synchronized List<Intron> introns() { | |
| if (introns == null) { | |
| introns = new ArrayList<>(); | |
| Exon exBefore = null; | |
| for (Exon ex : sortedStrand()) { | |
| if (exBefore != null) { | |
| // Create intron | |
| Intron intron; | |
| int rank = introns.size() + 1; | |
| // Find intron start and end | |
| int start, end; | |
| if (isStrandPlus()) { | |
| start = exBefore.getEnd() + 1; | |
| end = ex.getStart() - 1; | |
| } else { | |
| start = ex.getEnd() + 1; | |
| end = exBefore.getStart() - 1; | |
| } | |
| int size = end - start + 1; | |
| if (size > 0) { | |
| // Add intron to list | |
| intron = new Intron(this, start, end, strandMinus, id + "_intron_" + rank, exBefore, ex); | |
| intron.setRank(rank); | |
| introns.add(intron); | |
| } | |
| } | |
| exBefore = ex; | |
| } | |
| } | |
| return introns; | |
| } | |
| public boolean isAaCheck() { | |
| return aaCheck; | |
| } | |
| @Override | |
| protected boolean isAdjustIfParentDoesNotInclude(Marker parent) { | |
| return true; | |
| } | |
| public boolean isCanonical() { | |
| return canonical; | |
| } | |
| /** | |
| * Is this variant in the CDS part of this transcript? | |
| */ | |
| boolean isCds(Variant variant) { | |
| calcCdsStartEnd(); | |
| int cs = cdsStart; | |
| int ce = cdsEnd; | |
| if (isStrandMinus()) { | |
| cs = cdsEnd; | |
| ce = cdsStart; | |
| } | |
| return (variant.getEnd() >= cs) && (variant.getStart() <= ce); | |
| } | |
| /** | |
| * Has this transcript been checked against CDS/DNA/AA sequences? | |
| */ | |
| public boolean isChecked() { | |
| return aaCheck || dnaCheck; | |
| } | |
| public boolean isCorrected() { | |
| return corrected; | |
| } | |
| public boolean isDnaCheck() { | |
| return dnaCheck; | |
| } | |
| public boolean isDownstream(int pos) { | |
| return downstream != null && downstream.intersects(pos); | |
| } | |
| /** | |
| * Check if coding length is multiple of 3 in protein coding transcripts | |
| * @return true on Error | |
| */ | |
| public boolean isErrorProteinLength() { | |
| if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false; | |
| return (cds().length() % 3) != 0; | |
| } | |
| /** | |
| * Is the first codon a START codon? | |
| */ | |
| public boolean isErrorStartCodon() { | |
| if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false; | |
| // Not even one codon in this protein? Error | |
| String cds = cds(); | |
| if (cds.length() < 3) return true; | |
| String codon = cds.substring(0, 3); | |
| return !codonTable().isStart(codon); | |
| } | |
| /** | |
| * Check if protein sequence has STOP codons in the middle of the coding sequence | |
| * @return true on Error | |
| */ | |
| public boolean isErrorStopCodonsInCds() { | |
| if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false; | |
| // Get protein sequence | |
| String prot = protein(); | |
| if (prot == null) return false; | |
| // Any STOP codon before the end? | |
| char bases[] = prot.toCharArray(); | |
| int max = bases.length - 1; | |
| int countErrs = 0; | |
| for (int i = 0; i < max; i++) | |
| if (bases[i] == '*') { | |
| countErrs++; | |
| // We allow up to one STOP codon because it can be a RARE_AMINO_ACID which is coded as a STOP codon. | |
| // More than one STOP codon is not "normal", so it's probably an error in the genomic annotations (e.g. ENSEMBL or UCSC) | |
| if (countErrs > 1) return true; | |
| } | |
| // OK | |
| return false; | |
| } | |
| public boolean isIntron(int pos) { | |
| return findIntron(pos) != null; | |
| } | |
| public boolean isProteinCoding() { | |
| return proteinCoding; | |
| } | |
| public boolean isRibosomalSlippage() { | |
| return ribosomalSlippage; | |
| } | |
| public boolean isUpstream(int pos) { | |
| return upstream != null && upstream.intersects(pos); | |
| } | |
| public boolean isUtr(int pos) { | |
| return findUtr(pos) != null; | |
| } | |
| public boolean isUtr(Marker marker) { | |
| return findUtrs(marker) != null; | |
| } | |
| public boolean isUtr3(int pos) { | |
| Utr utr = findUtr(pos); | |
| return utr != null && utr instanceof Utr3prime; | |
| } | |
| public boolean isUtr5(int pos) { | |
| Utr utr = findUtr(pos); | |
| return utr != null && utr instanceof Utr5prime; | |
| } | |
| /** | |
| * Is the last codon a STOP codon? | |
| */ | |
| public boolean isWarningStopCodon() { | |
| if (!Config.get().isTreatAllAsProteinCoding() && !isProteinCoding()) return false; | |
| // Not even one codon in this protein? Error | |
| String cds = cds(); | |
| if (cds.length() < 3) return true; | |
| String codon = cds.substring(cds.length() - 3); | |
| return !codonTable().isStop(codon); | |
| } | |
| /** | |
| * Find the last position before 'pos' within an exon | |
| */ | |
| int lastExonPositionBefore(int pos) { | |
| int last = -1; | |
| for (Exon ex : sorted()) { | |
| if (pos < ex.getStart()) { | |
| // Nothing found? | |
| if (last < 0) { | |
| System.err.println("WARNING: Cannot find last exonic position before " + pos + " for transcript '" + id + "'"); | |
| return -1; | |
| } | |
| return last; | |
| } else if (pos <= ex.getEnd()) return pos; | |
| last = ex.getEnd(); | |
| } | |
| if (last < 0) System.err.println("WARNING: Cannot find last exonic position before " + pos + " for transcript '" + id + "'"); | |
| return pos; | |
| } | |
| /** | |
| * A list of all markers in this transcript | |
| */ | |
| @Override | |
| public Markers markers() { | |
| Markers markers = new Markers(); | |
| markers.addAll(subIntervals()); | |
| markers.addAll(utrs); | |
| markers.addAll(cdss); | |
| markers.add(upstream); | |
| markers.add(downstream); | |
| markers.addAll(introns()); | |
| return markers; | |
| } | |
| /** | |
| * Retrieve coding sequence AND the UTRs (mRNA = 5'UTR + CDS + 3'UTR) | |
| * I.e. Concatenate all exon sequences | |
| */ | |
| public synchronized String mRna() { | |
| if (mRna != null) return mRna; | |
| List<Exon> exons = sortedStrand(); | |
| // Concatenate all exons | |
| StringBuilder sequence = new StringBuilder(); | |
| for (Exon ex : exons) | |
| sequence.append(ex.getSequence()); | |
| mRna = sequence.toString(); | |
| return mRna; | |
| } | |
| /** | |
| * Protein sequence (amino acid sequence produced by this transcripts) | |
| */ | |
| public String protein() { | |
| if (protein == null) { | |
| if (!(Config.get() != null && Config.get().isTreatAllAsProteinCoding()) && !isProteinCoding()) protein = ""; | |
| else protein = codonTable().aa(cds(), true); | |
| } | |
| return protein; | |
| } | |
| /** | |
| * Query all genomic regions that intersect 'marker' | |
| */ | |
| @Override | |
| public Markers query(Marker marker) { | |
| Set<Marker> results = new HashSet<>(); | |
| // Add exons | |
| for (Exon ex : this) | |
| if (ex.intersects(marker)) { | |
| results.add(ex); | |
| // Query deeper | |
| for (Marker ee : ex.query(marker)) | |
| results.add(ee); | |
| } | |
| // Ad UTRs | |
| for (Utr u : utrs) | |
| if (u.intersects(marker)) results.add(u); | |
| // Add CDSs | |
| for (Cds m : cdss) | |
| if (m.intersects(marker)) results.add(m); | |
| // Add introns | |
| for (Intron intr : introns()) | |
| if (intr.intersects(marker)) { | |
| results.add(intr); | |
| // Query deeper | |
| for (Marker ee : intr.query(marker)) | |
| results.add(ee); | |
| } | |
| // Get results into markers | |
| Markers markers = new Markers(); | |
| markers.addAll(results); | |
| return markers; | |
| } | |
| /** | |
| * Return the first exon that intersects 'interval' (null if not found) | |
| */ | |
| public Exon queryExon(Marker interval) { | |
| for (Exon ei : this) | |
| if (ei.intersects(interval)) return ei; | |
| return null; | |
| } | |
| /** | |
| * Assign ranks to exons | |
| */ | |
| public boolean rankExons() { | |
| boolean changed = false; | |
| int rank = 1; | |
| for (Exon exon : sortedStrand()) { | |
| if (rank != exon.getRank()) { | |
| exon.setRank(rank); | |
| changed = true; | |
| } | |
| rank++; | |
| } | |
| return changed; | |
| } | |
| @Override | |
| public void reset() { | |
| super.reset(); | |
| utrs = new ArrayList<>(); | |
| cdss = new ArrayList<>(); | |
| introns = null; | |
| upstream = null; | |
| downstream = null; | |
| resetCache(); | |
| } | |
| public void resetCache() { | |
| cdsStart = -1; | |
| cdsEnd = -1; | |
| firstCodingExon = null; | |
| cds = null; | |
| cds2pos = null; | |
| aa2pos = null; | |
| mRna = null; | |
| protein = null; | |
| } | |
| public void resetExons() { | |
| super.reset(); | |
| resetCache(); | |
| } | |
| /** | |
| * Perfom some baseic chekcs, return error type, if any | |
| */ | |
| public ErrorWarningType sanityCheck(Variant variant) { | |
| if (isErrorStopCodonsInCds()) return ErrorWarningType.WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS; | |
| if (isErrorProteinLength()) return ErrorWarningType.WARNING_TRANSCRIPT_INCOMPLETE; | |
| if (isErrorStartCodon()) return ErrorWarningType.WARNING_TRANSCRIPT_NO_START_CODON; | |
| if (isWarningStopCodon()) return ErrorWarningType.WARNING_TRANSCRIPT_NO_STOP_CODON; | |
| return null; | |
| } | |
| /** | |
| * Parse a line from a serialized file | |
| */ | |
| @Override | |
| public void serializeParse(MarkerSerializer markerSerializer) { | |
| super.serializeParse(markerSerializer); | |
| bioType = BioType.parse(markerSerializer.getNextField()); | |
| proteinCoding = markerSerializer.getNextFieldBoolean(); | |
| dnaCheck = markerSerializer.getNextFieldBoolean(); | |
| aaCheck = markerSerializer.getNextFieldBoolean(); | |
| corrected = markerSerializer.getNextFieldBoolean(); | |
| ribosomalSlippage = markerSerializer.getNextFieldBoolean(); | |
| transcriptSupportLevel = TranscriptSupportLevel.parse(markerSerializer.getNextField()); | |
| version = markerSerializer.getNextField(); | |
| upstream = (Upstream) markerSerializer.getNextFieldMarker(); | |
| downstream = (Downstream) markerSerializer.getNextFieldMarker(); | |
| for (Marker m : markerSerializer.getNextFieldMarkers()) | |
| utrs.add((Utr) m); | |
| for (Marker m : markerSerializer.getNextFieldMarkers()) | |
| cdss.add((Cds) m); | |
| } | |
| /** | |
| * Create a string to serialize to a file | |
| */ | |
| @SuppressWarnings({ "unchecked", "rawtypes" }) | |
| @Override | |
| public String serializeSave(MarkerSerializer markerSerializer) { | |
| return super.serializeSave(markerSerializer) // | |
| + "\t" + bioType // | |
| + "\t" + proteinCoding // | |
| + "\t" + dnaCheck // | |
| + "\t" + aaCheck // | |
| + "\t" + corrected // | |
| + "\t" + ribosomalSlippage // | |
| + "\t" + (transcriptSupportLevel == null ? "" : transcriptSupportLevel.toString()) // | |
| + "\t" + version // | |
| + "\t" + markerSerializer.save(upstream) // | |
| + "\t" + markerSerializer.save(downstream) // | |
| + "\t" + markerSerializer.save((Iterable) utrs)// | |
| + "\t" + markerSerializer.save((Iterable) cdss)// | |
| ; | |
| } | |
| public void setAaCheck(boolean aaCheck) { | |
| this.aaCheck = aaCheck; | |
| } | |
| public void setBioType(BioType bioType) { | |
| this.bioType = bioType; | |
| } | |
| public void setCanonical(boolean canonical) { | |
| this.canonical = canonical; | |
| } | |
| public void setDnaCheck(boolean dnaCheck) { | |
| this.dnaCheck = dnaCheck; | |
| } | |
| public void setProteinCoding(boolean proteinCoding) { | |
| this.proteinCoding = proteinCoding; | |
| } | |
| public void setRibosomalSlippage(boolean ribosomalSlippage) { | |
| this.ribosomalSlippage = ribosomalSlippage; | |
| } | |
| public void setTranscriptSupportLevel(TranscriptSupportLevel transcriptSupportLevel) { | |
| this.transcriptSupportLevel = transcriptSupportLevel; | |
| } | |
| public void setVersion(String version) { | |
| this.version = version; | |
| } | |
| public void sortCds() { | |
| Collections.sort(cdss); | |
| resetCache(); | |
| } | |
| public List<SpliceSite> spliceSites() { | |
| List<SpliceSite> sslist = new ArrayList<>(); | |
| for (Exon ex : this) | |
| sslist.addAll(ex.getSpliceSites()); | |
| for (Intron intr : introns()) | |
| sslist.addAll(intr.getSpliceSites()); | |
| return sslist; | |
| } | |
| @Override | |
| public String toString() { | |
| return toString(false); | |
| } | |
| public String toString(boolean full) { | |
| StringBuilder sb = new StringBuilder(); | |
| sb.append(getChromosomeName() + ":" + start + "-" + end); | |
| sb.append(", strand: " + (isStrandPlus() ? "+" : "-")); | |
| if ((id != null) && (id.length() > 0)) sb.append(", id:" + id); | |
| if ((bioType != null) && (bioType != null)) sb.append(", bioType:" + bioType); | |
| if (isProteinCoding()) sb.append(", Protein"); | |
| if (isAaCheck()) sb.append(", AA check"); | |
| if (isDnaCheck()) sb.append(", DNA check"); | |
| if (numChilds() > 0) { | |
| sb.append("\n"); | |
| for (Utr utr : get5primeUtrsSorted()) | |
| sb.append("\t\t5'UTR :\t" + utr + "\n"); | |
| sb.append("\t\tExons:\n"); | |
| for (Exon exon : sorted()) | |
| sb.append("\t\t" + exon + "\n"); | |
| for (Utr utr : get3primeUtrsSorted()) | |
| sb.append("\t\t3'UTR :\t" + utr + "\n"); | |
| // We may show CDS | |
| if (isProteinCoding()) { | |
| sb.append("\t\tCDS :\t" + cds() + "\n"); | |
| sb.append("\t\tProtein :\t" + protein() + "\n"); | |
| } | |
| } | |
| if (full) { | |
| // Show errors or warnings | |
| String warn = ""; | |
| if (isErrorStopCodonsInCds()) warn += ErrorWarningType.WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS + " "; | |
| if (isErrorProteinLength()) warn += ErrorWarningType.WARNING_TRANSCRIPT_INCOMPLETE + " "; | |
| if (isErrorStartCodon()) warn += ErrorWarningType.WARNING_TRANSCRIPT_NO_START_CODON + " "; | |
| if (isWarningStopCodon()) warn += ErrorWarningType.WARNING_TRANSCRIPT_NO_STOP_CODON + " "; | |
| if (!warn.isEmpty()) sb.append("\tWarnings :" + warn); | |
| // Sequence checks | |
| sb.append("\t\tCDS check : " + (isDnaCheck() ? "OK" : "Failed (or missing)") + "\n"); | |
| sb.append("\t\tAA check : " + (isAaCheck() ? "OK" : "Failed (or missing)") + "\n"); | |
| } | |
| return sb.toString(); | |
| } | |
| /** | |
| * Show a transcript as an ASCII Art | |
| */ | |
| public String toStringAsciiArt(boolean full) { | |
| //--- | |
| // ASCII art for transcript | |
| //--- | |
| char art[] = new char[size()]; | |
| for (int i = start, j = 0; i <= end; i++, j++) { | |
| Utr utr = findUtr(i); | |
| if (utr != null) art[j] = utr.isUtr5prime() ? '5' : '3'; | |
| else { | |
| Exon exon = findExon(i); | |
| if (exon != null) { | |
| art[j] = exon.isStrandPlus() ? '>' : '<'; | |
| } else art[j] = '-'; | |
| } | |
| } | |
| // Only 'basic' ASCII art? | |
| if (!full) return new String(art); | |
| //--- | |
| // DNA Sequence | |
| //--- | |
| StringBuilder seq = new StringBuilder(); | |
| for (int i = start; i <= end; i++) { | |
| Exon exon = findExon(i); | |
| if (exon != null) { | |
| String s = exon.getSequence().toLowerCase(); | |
| if (exon.isStrandMinus()) s = GprSeq.reverseWc(s); | |
| s = GprSeq.padN(s, exon.size()); | |
| seq.append(s); | |
| i += s.length() - 1; | |
| } else seq.append('.'); | |
| } | |
| //--- | |
| // AA Sequence and frame | |
| //--- | |
| StringBuilder aa = new StringBuilder(); | |
| StringBuilder frameSb = new StringBuilder(); | |
| StringBuilder pos1sb = new StringBuilder(); | |
| StringBuilder pos10sb = new StringBuilder(); | |
| StringBuilder pos100sb = new StringBuilder(); | |
| StringBuilder pos1000sb = new StringBuilder(); | |
| int pos = start; | |
| if (isProteinCoding()) { | |
| char codon[] = new char[3]; | |
| int step = isStrandPlus() ? 1 : -1; | |
| int frame = 0; | |
| for (int i = (isStrandPlus() ? 0 : art.length - 1), j = 0; (i >= 0) && (i < art.length); i += step) { | |
| if (art[i] == '3' || art[i] == '5') { | |
| // 5'UTR or 3'UTR | |
| aa.append(' '); | |
| frameSb.append(' '); | |
| } else { | |
| char b = seq.charAt(i); | |
| if (b == 'a' || b == 'c' || b == 'g' || b == 't') { | |
| // Coding sequence | |
| codon[j++] = b; | |
| if (j >= 3) { | |
| j = 0; | |
| String cod = new String(codon); | |
| if (isStrandMinus()) cod = GprSeq.wc(cod); // Bases are already reversed, we only need WC complement | |
| boolean translateStart = (aa.length() <= 0); // First codon? Translate start coddon accordingly | |
| aa.append(" " + codonTable().aa(cod, translateStart) + " "); | |
| } | |
| // Update frame | |
| frameSb.append(frame); | |
| frame = (frame + 1) % 3; | |
| } else { | |
| // Intron | |
| aa.append(' '); | |
| frameSb.append(' '); | |
| } | |
| } | |
| // Position | |
| pos1sb.append(pos % 10); | |
| if (pos % 10 == 0) pos10sb.append((pos % 100) / 10); | |
| else pos10sb.append(' '); | |
| if (pos % 100 == 0) pos100sb.append((pos % 1000) / 100); | |
| else pos100sb.append(' '); | |
| if (pos % 1000 == 0) pos1000sb.append((pos % 10000) / 1000); | |
| else pos1000sb.append(' '); | |
| pos++; | |
| } | |
| } | |
| String aaStr = isStrandPlus() ? aa.toString() : aa.reverse().toString(); | |
| String frameStr = isStrandPlus() ? frameSb.toString() : frameSb.reverse().toString(); | |
| //--- | |
| // Coordinates | |
| //--- | |
| // Create 'vertical lines' | |
| StringBuilder lines = new StringBuilder(); | |
| if (isStrandPlus()) lines.append(' '); | |
| int prev = start; | |
| for (Exon ex : this.sorted()) { | |
| lines.append(Gpr.repeat(' ', ex.getStart() - prev - 1) + "|"); | |
| prev = ex.getStart(); | |
| lines.append(Gpr.repeat(' ', ex.getEnd() - prev - 1) + "|"); | |
| prev = ex.getEnd(); | |
| } | |
| StringBuilder coords = new StringBuilder(); | |
| coords.append(lines + "\n"); | |
| ArrayList<Exon> exons = new ArrayList<>(); | |
| exons.addAll(subIntervals()); | |
| Collections.sort(exons, new IntervalComparatorByStart(true)); // Sort by reverse position | |
| int n, len; | |
| for (Exon ex : exons) { | |
| // Right-side coordinate | |
| n = ex.getEnd(); | |
| len = n - start + 1; | |
| coords.append((len > 0 ? lines.subSequence(0, len) : "") + "^" + n + "\n"); | |
| // Left-side coordinate | |
| n = ex.getStart(); | |
| len = n - start + 1; | |
| coords.append((len > 0 ? lines.subSequence(0, len) : "") + "^" + n + "\n"); | |
| } | |
| // Result | |
| return "" // | |
| + (pos1000sb.toString().trim().isEmpty() ? "" : "\n" + pos1000sb) // | |
| + (pos100sb.toString().trim().isEmpty() ? "" : "\n" + pos100sb) // | |
| + (pos10sb.toString().trim().isEmpty() ? "" : "\n" + pos10sb) // | |
| + "\n" + pos1sb // | |
| + "\n" + seq // | |
| + (isProteinCoding() ? "\n" + aaStr + "\n" + frameStr : "") // | |
| + "\n" + new String(art) // | |
| + "\n" + coords // | |
| ; | |
| } | |
| /** | |
| * Calculate UTR regions from CDSs | |
| */ | |
| public boolean utrFromCds(boolean verbose) { | |
| if (cdss.size() <= 0) return false; // Cannot do this if we don't have CDS information | |
| // All exons minus all UTRs and CDS should give us the missing UTRs | |
| Markers exons = new Markers(); | |
| Markers minus = new Markers(); | |
| // Add all exons | |
| for (Exon e : this) | |
| exons.add(e); | |
| // Add all UTRs and CDSs to the 'minus' set | |
| for (Utr uint : getUtrs()) | |
| minus.add(uint); | |
| for (Cds cint : cdss) | |
| minus.add(cint); | |
| Markers missingUtrs = exons.minus(minus); // Perform interval minus | |
| if (!missingUtrs.isEmpty()) return addMissingUtrs(missingUtrs, verbose); // Anything left? => There was a missing UTR | |
| return false; | |
| } | |
| /** | |
| * Get some details about the effect on this transcript | |
| */ | |
| @Override | |
| public boolean variantEffect(Variant variant, VariantEffects variantEffects) { | |
| if (!intersects(variant)) return false; // Sanity check | |
| // Large structural variant including the whole transcript? | |
| if (variant.includes(this) && variant.isStructural()) { | |
| CodonChange codonChange = CodonChange.factory(variant, this, variantEffects); | |
| codonChange.codonChange(); | |
| return true; | |
| } | |
| //--- | |
| // Structural variants may affect more than one exon | |
| //--- | |
| boolean mayAffectSeveralExons = variant.isStructural() || variant.isMixed() || variant.isMnp(); | |
| if (mayAffectSeveralExons) { | |
| int countExon = 0; | |
| for (Exon ex : this) | |
| if (ex.intersects(variant)) countExon++; | |
| // More than one exon? | |
| if (countExon > 1) { | |
| CodonChange codonChange = CodonChange.factory(variant, this, variantEffects); | |
| codonChange.codonChange(); | |
| return true; | |
| } | |
| } | |
| //--- | |
| // Does it hit an exon? | |
| // Note: This only adds spliceSites effects, for detailed codon | |
| // changes effects we use 'CodonChange' class | |
| //--- | |
| boolean exonAnnotated = false; | |
| for (Exon ex : this) | |
| if (ex.intersects(variant)) { | |
| exonAnnotated |= ex.variantEffect(variant, variantEffects); | |
| } | |
| //--- | |
| // Hits a UTR region? | |
| //--- | |
| boolean included = false; | |
| for (Utr utr : utrs) | |
| if (utr.intersects(variant)) { | |
| // Calculate the effect | |
| utr.variantEffect(variant, variantEffects); | |
| included |= utr.includes(variant); // Is this variant fully included in the UTR? | |
| } | |
| if (included) return true; // Variant fully included in the UTR? => We are done. | |
| //--- | |
| // Does it hit an intron? | |
| //--- | |
| for (Intron intron : introns()) | |
| if (intron.intersects(variant)) { | |
| intron.variantEffect(variant, variantEffects); | |
| included |= intron.includes(variant); // Is this variant fully included in this intron? | |
| } | |
| if (included) return true; // Variant fully included? => We are done. | |
| //--- | |
| // No annotations from exons? => Add transcript | |
| //--- | |
| if (!exonAnnotated) { | |
| variantEffects.add(variant, this, EffectType.TRANSCRIPT, ""); | |
| return true; | |
| } | |
| return exonAnnotated; | |
| } | |
| } |