## Biokotlin GenomicFeatures Testing

The Biokotlin GenomicFeatures class reads a GFF file and stores the feature data it contains in Kotlin DataFrame objects.
These objects may be manipulated via Kotlin DataFrame methods, or via functions provided by the GenomicFeatures class.

This notebook demonstrates how to construct and query Biokotlin GenomicFeatures objects.

In [None]:
// Load Biokotlin from the locally built "all" jar.
// If the jar does not exist, build it from the command line: ./gradlew shadowjar
// NOTE(review): this local-jar dependency is described as old code; it appears to be meant for when the
// repository-based imports (disabled below) don't work, or when you are running an unofficial
// development version of Biokotlin - confirm which form is current.
@file:DependsOn("../build/libs/biokotlin-0.03-all.jar")
// Disabled alternative: resolve Biokotlin from a repository instead of the local jar.
// @file:Repository("https://jcenter.bintray.com/")
// @file:DependsOn("org.biokotlin:biokotlin:0.03")

In [None]:
// We'll use the genome.GenomicFeatures class (wildcard import of the biokotlin.genome package)
import biokotlin.genome.*

// Define your GFF file.
// NOTE: this is a machine-specific absolute path - change it to the location of your own GFF3 file.
val b73GFF = "/Users/lcj34/notes_files/phg_2018/b73v5_gff/Zm-B73-REFERENCE-NAM-5.0_Zm00001e.1.gff3"

// Create an instance of the GenomicFeatures class with the gff3 file as input.
// The later cells in this notebook query this instance through the name myGF.
val myGF = GenomicFeatures(b73GFF)

// List the available functions
myGF.help()




In [None]:
// Fetch the column names of the CDS DataFrame and echo them beneath a header line.
// The names stay available afterwards as cdsColNames.
val cdsColNames = myGF.columnNames("CDS").also { names ->
    println("CDS column names:\n${names}")
}


In [None]:
// Subset the CDS DataFrame: keep only the seqid, name and phase columns.
// The selection is not stored in a variable; it is the value of this cell's final expression.
myGF.cds().select{seqid and name and phase}


In [None]:
// Print all the GFF exons: exons() supplies the exon entries and print() writes them out.
// NOTE(review): for a full genome annotation this output may be very long - confirm this is intended.
myGF.exons().print()

In [None]:
// Filtering using GenomicFeatures.featuresByRange
// This method allows the user to grab data for multiple features that fall within
// the specified chromosome/position boundaries.
// Arguments used here: the chromosome name, the position range, and the feature types of
// interest given as a single comma-separated string.

// Declared with val: the result is only read (printed) and never reassigned.
val featuresByRange = myGF.featuresByRange("chr1",43000..46204,"three_prime_UTR,five_prime_UTR")
featuresByRange.print()
                         
                         

In [None]:
// Look up the entries associated with transcript Zm00001e000002_T001 and print them
// as soon as they are retrieved; the result stays available as transcriptEntries.
val transcriptEntries = myGF.featuresWithTranscript("Zm00001e000002_T001").also { it.print() }


In [None]:
// Filter the CDS entries to those on chr1 that overlap the range 34617..40204.
// An entry is kept when it starts at or before 40204 AND ends at or after 34617, so entries
// that lie only partially inside the range are included as well.
// Declared with val: the filtered DataFrame is only read (printed) and never reassigned.
val cdsFilteredRange = myGF.cds().filter{seqid == "chr1" && start <= 40204 && end >= 34617}
cdsFilteredRange.print()



In [None]:
import biokotlin.genome.*
// Example of a genomic feature that includes the fasta.
// Fasta association allows for adding sequence information to the queries

// NOTE: these are machine-specific absolute paths - point them at your own GFF3 and reference fasta files.
val b73GFF_full = "/Users/lcj34/notes_files/phg_2018/b73v5_gff/Zm-B73-REFERENCE-NAM-5.0_Zm00001e.1.gff3"
val b73Fasta = "/Users/lcj34/notes_files/phg_2018/genomes/Zm-B73-REFERENCE-NAM-5.0.fa"
val time = System.nanoTime()
// Create an instance of the class so we have access to the lists that are
// created on the read.
val myGF = GenomicFeatures(b73GFF_full,b73Fasta)

// Elapsed wall-clock time, converted from nanoseconds to seconds
val readingTime = (System.nanoTime() - time)/1e9
println("Reading/parsing GFF and ref fasta files took ${readingTime} seconds")

println("myGF chromDF size: ${myGF.chromosomes().size()}")

// Check the fasta-derived sequences once, with a readable message, instead of
// repeating !! at every use further down.
val nucSeqList = checkNotNull(myGF.refNucSeqFasta) {
    "GenomicFeatures holds no reference fasta sequences - was a fasta file supplied?"
}
// Number of keys in the fasta-derived collection; not used below, kept for interactive inspection.
val numContigs = nucSeqList.keys.size

// Look up the chr5 record a single time rather than once per gene row.
val chr5Record = checkNotNull(nucSeqList["chr5"]) {
    "Chromosome chr5 was not found in the reference fasta"
}

val chr5GeneSRangeSet = mutableSetOf<SRange>()

// Things to note here:  Both SRanges and the GFF are 1-based physical positions
//  that are inclusive/inclusive. So moving between them will remain consistent.
// If you pull sequence based on the range, it will be correctly adjusted for that
// (because sequence is stored as 0-based)
myGF.genes().filter{seqid == "chr5"}.select{start and end}.forEachRow {
    chr5GeneSRangeSet.add(chr5Record.range(start..end))
}

// You now have a rangeSet - you can pull sequence from the NucSeqRecords in the
// chr5GeneSRangeSet.
// You can also perform any operations on this SRange set now - flanking, shift, sequence,
// complement, intersections, overlaps, or get a dataFrame from the SRange set.

val rangeDF = chr5GeneSRangeSet.toDataFrame()

rangeDF.print()

// Get sequence for a specific chromosome/range:
val chr5seq = myGF.sequenceForChrRange("chr5",1..50)
println("Sequence for chr5, 1..50")
println(chr5seq)


println()
// Request sequence for a chromosome name that is presumably absent from the fasta.
// NOTE(review): the result is stored but never printed - confirm what sequenceForChrRange
// returns or reports for an unknown chromosome.
val fakeChrSeq = myGF.sequenceForChrRange("fakeChr", 1..60)
  
