In [None]:
from pyseqrna import pyseqrna_utils as pu
from pyseqrna import quality_check as qc
from pyseqrna import quality_trimming as qt
from pyseqrna import  aligners as al
from pyseqrna import pyseqrna_stats as ps
from pyseqrna import quants
from pyseqrna import differential_expression as de
import pandas as pd

In [None]:
from pyseqrna.pyseqrna_utils import PyseqrnaLogger

# Create the notebook's logger and record the start of the run.
log = PyseqrnaLogger(
    mode='w',
    log='pp',
)
log.info("Analysis started")

In [None]:
# Read the paired-end example sample sheet; `data` is also indexed later for
# its 'targets' and 'combinations' entries.
data = pu.read_input_file(
    "pyseqrna/example/input_Sample_PE.txt",
    "pyseqrna/example/data/",
    paired=True,
)
samples = data['samples']

In [None]:
# Create the output directory that the later cells write their results into.
pu.make_directory("pySeqRNA_results")

In [None]:

# Run Trim Galore on the paired-end samples. The first element of the return
# value (`a[0]`) is what the alignment and alignment-stats cells consume.
a = qt.trim_galoreRun(sampleDict=samples, paired=True)

In [None]:
# Create the STAR aligner wrapper for the example genome FASTA; the same object
# is used below to build and check the index and to run the alignment.
aligner = al.STAR_Aligner(genome="pyseqrna/example/data/arabidopsis10.fasta")

In [None]:
# Build the aligner index. The return value is kept in `j`, which no cell
# visible here reads afterwards.
j = aligner.build_index()

In [None]:
# Call the aligner's index check; the result is not captured, so whatever it
# returns is shown as this cell's output.
aligner.check_index()

In [None]:
# Run the paired-end alignment on `a[0]` (first element of the trimming result).
# `outalign` is passed on as `bamDict` to the stats and counting cells.
outalign, job = aligner.run_Alignment(target=a[0], pairedEND=True)

In [None]:
# Collect alignment statistics from the raw samples, the trimming output and
# the alignment output into one table.
df = ps.align_stats(
    sampleDict=samples,
    trimDict=a[0],
    bamDict=outalign,
    pairedEND=True,
)

In [None]:
# Save the alignment statistics table to the results directory, omitting the
# DataFrame index column.
df.to_excel("pySeqRNA_results/alignment_stats.xlsx", index=False)

In [None]:
# Display the alignment statistics table as the cell output.
df

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import seaborn as sns

In [None]:
# Stacked horizontal bar chart: for every sample, the percentage of reads kept
# after trimming and the remainder (100 - kept) that was discarded.
sample = list(df['Sample'])
iread = list(df['%_Cleaned2x'])
tc = list(100 - df['%_Cleaned2x'])  # discarded share; also displayed in the next cell

# Explicit figure/axes instead of the pyplot state machine, sized at creation.
fig, ax = plt.subplots(figsize=(18.5, 10.5))
ax.barh(sample, iread, 0.6, label="cleaned Reads", color='skyblue')
# `left=iread` stacks the discarded share on the end of the cleaned bar.
ax.barh(sample, tc, 0.6, left=iread, label="discarded", color='tomato')
ax.margins(y=0.01)
ax.tick_params(axis='both', labelsize=15)
ax.set_xlabel("% Number of reads", fontsize=18)
ax.set_ylabel("Samples", fontsize=18)
# Legend sits outside the axes (upper-left corner anchored to the top-right of
# the plot); the trailing semicolon suppresses the Legend repr in the output.
ax.legend(['Cleaned', 'Discarded'], bbox_to_anchor=(1.0, 1.0), loc='upper left', fontsize=15);



In [None]:
# Display the per-sample discarded-read percentages (100 - '%_Cleaned2x')
# computed in the plotting cell.
tc

In [None]:
# Count reads per feature from the alignment output against the example GFF,
# writing into the results directory.
# The result gets its own name: it used to be assigned to `a`, which overwrote
# the trimming result that the alignment and alignment-stats cells read as
# `a[0]`, so those cells could not be re-run after this one.
feature_counts = quants.featureCount(
    bamDict=outalign,
    gff="pyseqrna/example/data/arabidopsis10.gff",
    outDir="pySeqRNA_results",
)

In [None]:
# Load the tab-separated counts table from the results directory.
# NOTE(review): presumably written by the featureCount step above — confirm.
counts= pd.read_csv("pySeqRNA_results/Counts_final.txt", sep="\t")

In [None]:
# Display the counts table as the cell output.
counts

In [None]:
# Pull the targets table and the list of comparisons out of the parsed input.
targets, comb = data['targets'], data['combinations']

In [None]:
# Run DESeq2 on the counts table for the requested comparisons.
dd = de.runDESeq2(
    countDF=counts,
    targetFile=targets,
    design='sample',
    combination=comb,
)

In [None]:
import os 
# Write the unfiltered DESeq2 result table to Excel, omitting the index column.
dd.to_excel(os.path.join("pySeqRNA_results","Raw_DEGs_all.xlsx"), index=False)

In [None]:
# Reload the saved DESeq2 table from disk (this rebinds `dd`), then filter it
# per comparison with FDR=0.05 and FOLD=1.
dd = pd.read_excel("pySeqRNA_results/Raw_DEGs_all.xlsx")
filtered_DEG = de.degFilter(
    degDF=dd,
    CompareList=comb,
    FDR=0.05,
    FOLD=1,
)

In [None]:
# Keep the 'summary' entry of the degFilter result under its own name.
summary= filtered_DEG['summary']

In [None]:
# Write each comparison's filtered DEG table to its own sheet of one workbook
# (sheet name = comparison key).
# The context manager saves and closes the file exactly once, after all sheets
# have been added. The previous code called wd.save() inside the loop on every
# iteration, and ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter(os.path.join("pySeqRNA_results", "filtered_DEGs.xlsx")) as wd:
    for key, value in filtered_DEG['filtered'].items():
        value.to_excel(wd, sheet_name=key)

In [None]:
# Hand the filtered-DEG workbook and the comparison list to pu.getGenes.
# NOTE(review): presumably this writes the per-comparison gene lists
# (diff_genes/*.txt) that the enrichment cells read later — confirm.
pu.getGenes(os.path.join("pySeqRNA_results","filtered_DEGs.xlsx"), combinations=comb)

In [None]:
from pyseqrna import normalize_counts as nc

In [None]:
# Build a Normalization object from the counts file and the GFF annotation.
rpkm = nc.Normalization(countFile="pySeqRNA_results/Counts_final.txt", featureFile="pyseqrna/example/data/arabidopsis10.gff")

In [None]:
# Compute the normalized counts via meanRatioCount().
# NOTE(review): the variables and the output file are named rpk/rpkm, but the
# method called is meanRatioCount — confirm which normalization is intended.
rpk = rpkm.meanRatioCount()

In [None]:
# Save the normalized counts to Excel. No index=False here, so the index is
# written to the file as well.
rpk.to_excel("pySeqRNA_results/rpkm.xlsx")

In [None]:
# Read the normalized counts workbook back in from disk.
rcount= pd.read_excel("pySeqRNA_results/rpkm.xlsx")

In [None]:
# Display the reloaded normalized counts as the cell output.
rcount

In [None]:
from pyseqrna import pyseqrna_plots as pp

In [None]:
# Heatmap from the DESeq2 table for the configured comparisons (num=50,
# type='deg'); keeps both returned objects.
heatmap, ax = pp.plotHeatmap(
    dd,
    combinations=comb,
    num=50,
    type='deg',
)

In [None]:
from pyseqrna import gene_ontology as go

In [None]:
# Query the gene-ontology source for species 'athaliana'; the result is fed to
# go.preprocessBioMart in the next cell.
bdata= go.query('athaliana')

In [None]:
# Preprocess the raw query result into the form go.enrichGO takes as its
# first argument.
gdata = go.preprocessBioMart(bdata)

In [None]:
# GO enrichment for the M1-V6 gene list against the preprocessed annotation.
# NOTE(review): this reads from "pySeqRNA_results.1/", while the cells above
# write to "pySeqRNA_results/" — confirm the directory is intended.
results = go.enrichGO(gdata,"pySeqRNA_results.1/diff_genes/M1-V6.txt")

In [None]:
# Save the GO enrichment table to Excel without the index column.
# NOTE(review): target directory is "pySeqRNA_results.1/", not the
# "pySeqRNA_results/" directory created earlier — confirm.
results.to_excel("pySeqRNA_results.1/M1-V6_go.xlsx", index=False)

In [None]:
# Load the M1-V6 DEG workbook from "pySeqRNA_results.1/".
# NOTE(review): no cell visible here writes this file or reads `cd` afterwards
# — confirm this cell is still needed.
cd= pd.read_excel("pySeqRNA_results.1/M1-V6_deg.xlsx")

In [None]:
# Display the preprocessed GO annotation data as the cell output.
gdata

In [None]:
def get_request(url, **params):
    """Issue an HTTP GET for ``url`` and return the response.

    Any keyword arguments are sent as the request's query parameters, and in
    that case the response is opened with ``stream=True``. With no keyword
    arguments a plain GET is made.

    NOTE(review): ``requests`` is not imported in any cell visible here —
    confirm it is in scope before this function is called.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful status code.
    """
    response = (
        requests.get(url, params=params, stream=True)
        if params
        else requests.get(url)
    )
    response.raise_for_status()
    return response


def _add_attr_node(root, attr):
    attr_el = ElementTree.SubElement(root, 'Attribute')
    attr_el.set('name', attr)


In [None]:


# Build a BioMart XML query for GO annotations and fetch the result as a table
# from the Ensembl Plants mart service.
# The standard-library names this cell uses are imported here so it runs on a
# fresh kernel; no earlier cell imports them.
from io import StringIO
from xml.etree import ElementTree

# Dataset prefix for the query. The cell previously read `species` without any
# visible definition; 'athaliana' matches the species passed to go.query()
# earlier in this notebook.
species = 'athaliana'

root = ElementTree.Element('Query')
root.set('virtualSchemaName', 'plants_mart')
root.set('formatter', 'TSV')
root.set('header', '1')
# Was native_str(int(True)); `native_str` is not defined in this notebook
# (it is python-future's alias for the built-in str), so the value is '1'.
root.set('uniqueRows', '1')
root.set('datasetConfigVersion', '0.6')

dataset = ElementTree.SubElement(root, 'Dataset')
dataset.set('name', species + "_eg_gene")
dataset.set('interface', 'default')
attributes = ["ensembl_gene_id", "ensembl_transcript_id",
              "go_id", "name_1006", "namespace_1003", "definition_1006"]
# One <Attribute> child per requested column.
for attr in attributes:
    _add_attr_node(dataset, attr)

# The serialized XML is sent as the `query` request parameter.
response = get_request(
    "https://plants.ensembl.org/biomart/martservice", query=ElementTree.tostring(root))
# The service was asked for TSV with a header row, so parse it as tab-separated.
result = pd.read_csv(StringIO(response.text), sep='\t')



In [1]:
from pyseqrna import pathway


In [2]:
# KEGG pathway enrichment for the C1-D6 gene list with species code 'ath'.
dk = pathway.enrichKEGG(file="pySeqRNA_results/diff_genes/C1-D6.txt", species='ath')

[12:40:48]  pathway :: INFO : Reading pathways from KEGG


ENTRY       ath00010                    Pathway
NAME        Glycolysis / Gluconeogenesis - Arabidopsis thaliana (thale cress)
DESCRIPTION Glycolysis is the process of converting glucose into pyruvate and generating small amounts of ATP (energy) and NADH (reducing power). It is a central pathway that produces important precursor metabolites: six-carbon compounds of glucose-6P and fructose-6P and three-carbon compounds of glycerone-P, glyceraldehyde-3P, glycerate-3P, phosphoenolpyruvate, and pyruvate [MD:M00001]. Acetyl-CoA, another important precursor metabolite, is produced by oxidative decarboxylation of pyruvate [MD:M00307]. When the enzyme genes of this pathway are examined in completely sequenced genomes, the reaction steps of three-carbon compounds from glycerone-P to pyruvate form a conserved core module [MD:M00002], which is found in almost all organisms and which sometimes contains operon structures in bacterial genomes. Gluconeogenesis is a synthesis pathway of glucose from no

[12:41:41]  pathway :: INFO : Performing KEGG enrichment analysis on pySeqRNA_results/diff_genes/C1-D6.txt


ENTRY       ath04712                    Pathway
NAME        Circadian rhythm - plant - Arabidopsis thaliana (thale cress)
DESCRIPTION The circadian system is an endogenous rhythm with the period of about 24 hours to provide temporal organization of biological activity. In Arabidopsis the circadian oscillator at the core of this system is composed of the interlocked feedback loop formed by the major transcriptional factors CCA1 (circadian clock associated), LHY (late elongated hypocotyl) and TOC1 (timing of cab), a pseudo response regulator (PRR). The morning-expressed CCA1/LHY Myb transcription factors suppress TOC1 expression by binding to its promoter. Moreover, CCA1/LHY activates the expression of PRR7/9 in the morning and then PRR7/9 repress the transcription of CCA1/LHY during the rest of the day. By contrast, the evening-expressed TOC1 activates the expression of CCA1/LHY.
CLASS       Organismal Systems; Environmental adaptation
PATHWAY_MAP ath04712  Circadian rhythm - plant
DBLI

In [None]:
# Display the KEGG enrichment result as the cell output.
dk