# Create SwanGraph from C2C12 PacBio data

## Load libraries

In [1]:
import swan_vis as swan
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

## Load data into Python
We have data from C2C12 myoblasts and 3-day differentiated myotubes. Bulk long-read RNA-seq was performed using polyA primers and PacBio sequencing. To quantify transcript expression, reads were mapped to the mouse genome using Minimap2, then the reads were cleaned and pre-processed with TranscriptClean and [TALON](https://github.com/mortazavilab/TALON). 

In short, BAM files were downloaded from [this ENCODE data cart](https://www.encodeproject.org/carts/c2c12_bulk_pb/) and pre-processed with the walkthrough described [here](https://freese.gitbook.io/swan/tutorials/data_processing).

We will use [Swan](https://freese.gitbook.io/swan/) to analyze and visualize the C2C12 long-read transcriptome.

## Initialize a new SwanGraph


In [30]:
# start from an empty SwanGraph; the annotation, transcriptome, abundance
# and metadata are added to it step by step in the cells below
sg = swan.SwanGraph()

In [31]:
# Input files used to build the SwanGraph. They all live in one data
# directory, so the directory is spelled out once and can be changed here.
DATA_DIR = '../swan_data'

# reference annotation GTF (GENCODE vM21, judging by the filename)
annot_gtf = f'{DATA_DIR}/gencode.vM21.primary_assembly.annotation_UCSC_names.gtf'
# GTF of TALON transcripts; NOTE(review): not referenced by the cells shown
# in this notebook, which pass talon_db to add_transcriptome instead
data_gtf = f'{DATA_DIR}/all_talon_observedOnly.gtf'
# abundance table passed to add_abundance
ab_file = f'{DATA_DIR}/all_talon_abundance_filtered.tsv'
# TALON database passed to add_transcriptome
talon_db = f'{DATA_DIR}/talon.db'
# pass list handed to add_transcriptome together with talon_db
pass_list = f'{DATA_DIR}/all_pass_list.csv'
# metadata table passed to add_metadata
meta = f'{DATA_DIR}/swan_metadata.tsv'

## Adding a reference transcriptome

In [32]:
# add the reference annotation transcriptome (the GTF at annot_gtf)
# to the SwanGraph
sg.add_annotation(annot_gtf)


Adding annotation to the SwanGraph


## Adding transcript models from a TALON database

In [33]:
# add the transcript models from the TALON database to the SwanGraph;
# abundance information is NOT added here, it is loaded separately
# with add_abundance
# NOTE(review): pass_list presumably limits which TALON transcripts are
# added - confirm against the Swan documentation
sg.add_transcriptome(talon_db, pass_list=pass_list)


Adding transcriptome to the SwanGraph


## Adding datasets and their abundance


In [34]:
# add each dataset's abundance information to the SwanGraph;
# the datasets found in the abundance file are listed in the output
sg.add_abundance(ab_file)


Adding abundance for datasets ENCFF202MCY, ENCFF396UFT, ENCFF408MUF, ENCFF652XIT to SwanGraph.


  self.adata = anndata.AnnData(var=var, obs=obs, X=X)


## Adding metadata

In [35]:
# attach the metadata table to the SwanGraph
# NOTE(review): presumably this is where the 'timepoint' column used by
# the differential tests comes from - confirm against swan_metadata.tsv
sg.add_metadata(meta)

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


## Saving and loading your SwanGraph

In [36]:
# save the SwanGraph as a Python pickle file; the path is given without
# an extension and the file is written as '<path>.p' (see output)
sg.save_graph('../swan_data/swan')

Saving graph as ../swan_data/swan.p


In [11]:
# load up a saved SwanGraph from a pickle file
# (pickle files can execute code when loaded - only read files you
# created yourself or otherwise trust)
sg = swan.read('../swan_data/swan.p')

Read in graph from ../swan_data/swan.p


## Differential gene expression 

In [3]:
# metadata column and the two values in it that the differential
# tests below compare
# NOTE(review): presumably '0hr' = myoblasts and '72hr' = 3-day
# differentiated myotubes - confirm against the metadata file
obs_col = 'timepoint'
obs_conditions = ['72hr', '0hr']

In [3]:
# perform a differential gene expression
# Wald test on the provided metadata column and conditions
# The result is kept under its own name because `test` is assigned again
# by the transcript-level test further down, which would otherwise
# discard the gene-level result.
de_gene_result = sg.de_gene_test(obs_col, obs_conditions=obs_conditions)
# keep the original variable name available for existing references
test = de_gene_result

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


training location model: False
training scale model: True
iter   0: ll=6977754.071072
iter   1: ll=6977754.071072, converged: 0.00% (loc: 100.00%, scale update: False), in 0.00sec
iter   2: ll=4245564.439715, converged: 75.25% (loc: 75.25%, scale update: True), in 424.18sec
iter   3: ll=4245564.439715, converged: 75.25% (loc: 100.00%, scale update: False), in 0.00sec
iter   4: ll=3765947.158149, converged: 88.38% (loc: 88.38%, scale update: True), in 117.62sec
iter   5: ll=3765947.158149, converged: 88.38% (loc: 100.00%, scale update: False), in 0.00sec
iter   6: ll=3755613.415562, converged: 95.98% (loc: 95.98%, scale update: True), in 61.59sec
iter   7: ll=3755613.415562, converged: 95.98% (loc: 100.00%, scale update: False), in 0.00sec
iter   8: ll=3754419.144885, converged: 99.32% (loc: 99.32%, scale update: True), in 24.35sec
iter   9: ll=3754419.144885, converged: 99.32% (loc: 100.00%, scale update: False), in 0.00sec
iter  10: ll=3754147.360595, converged: 99.86% (loc: 99.86%, s

  size = (limit / dtype.itemsize / largest_block) ** (1 / len(autos))


## Differential transcript expression 

In [4]:
# perform a differential transcript expression
# Wald test on the provided metadata column and conditions
# The result is stored under a dedicated name in addition to `test`,
# because `test` is also the variable the gene-level test cell assigns.
de_transcript_result = sg.de_transcript_test(obs_col, obs_conditions=obs_conditions)
# keep the original variable name available for existing references
test = de_transcript_result

training location model: False
training scale model: True
iter   0: ll=19686469.746446
iter   1: ll=19686469.746446, converged: 0.00% (loc: 100.00%, scale update: False), in 0.00sec
iter   2: ll=8765751.196071, converged: 77.21% (loc: 77.21%, scale update: True), in 1136.23sec
iter   3: ll=8765751.196071, converged: 77.21% (loc: 100.00%, scale update: False), in 0.00sec
iter   4: ll=6952752.499967, converged: 88.46% (loc: 88.46%, scale update: True), in 296.23sec
iter   5: ll=6952752.499967, converged: 88.46% (loc: 100.00%, scale update: False), in 0.00sec
iter   6: ll=6926513.598118, converged: 95.91% (loc: 95.91%, scale update: True), in 160.05sec
iter   7: ll=6926513.598118, converged: 95.91% (loc: 100.00%, scale update: False), in 0.00sec
iter   8: ll=6924363.958168, converged: 99.32% (loc: 99.32%, scale update: True), in 67.40sec
iter   9: ll=6924363.958168, converged: 99.32% (loc: 100.00%, scale update: False), in 0.00sec
iter  10: ll=6923879.447278, converged: 99.86% (loc: 99.86

  size = (limit / dtype.itemsize / largest_block) ** (1 / len(autos))


## Differential isoform expression

In [4]:
# find genes that exhibit differential isoform expression (DIE) between
# the two conditions in obs_conditions ('72hr' and '0hr' timepoints)
die_table = sg.die_gene_test(obs_col=obs_col,
                             obs_conditions=obs_conditions,
                             verbose=True)

Testing for DIE for each gene: 100%|██████████| 55868/55868 [12:42<00:00, 73.28it/s]


In [5]:
# preview the first rows of the DIE results: one row per gene (gid)
# with p_val, dpi and adj_p_val columns
die_table.head()

Unnamed: 0,gid,p_val,dpi,adj_p_val
0,ENSMUSG00000000058.6,0.6010027,2.66629,0.8207392
1,ENSMUSG00000000078.7,0.755111,0.828447,0.901547
2,ENSMUSG00000000088.7,0.8816134,0.459934,0.9513182
3,ENSMUSG00000061689.15,3.5393789999999996e-19,66.746536,8.354474e-18
4,ENSMUSG00000000127.15,0.1044654,8.423532,0.2864358


In [None]:
# save the SwanGraph as a Python pickle file (same path as the earlier
# save, so the existing '../swan_data/swan.p' is overwritten)
# NOTE(review): presumably re-saved so results of the tests above are
# kept with the graph - confirm that Swan stores them on the SwanGraph
sg.save_graph('../swan_data/swan')

In [2]:
# load up a saved SwanGraph from a pickle file
# (pickle files can execute code when loaded - only read files you
# created yourself or otherwise trust)
sg = swan.read('../swan_data/swan.p')

Read in graph from ../swan_data/swan.p
