From fb6b1fcf668ee64f8b35e323e4bdb70c914261cd Mon Sep 17 00:00:00 2001 From: Binghan Xiao <99703980+iskoldt-X@users.noreply.github.com> Date: Thu, 20 Apr 2023 13:39:11 +0800 Subject: [PATCH] fixing paths --- scripts/Plasmer | 6 +++--- scripts/check_length_by_input.py | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/Plasmer b/scripts/Plasmer index 1345f4b..a284102 100755 --- a/scripts/Plasmer +++ b/scripts/Plasmer @@ -121,7 +121,7 @@ if [ ! -s ${outpath}/intermediate/${prefix}.seqkit ];then fi echo "Checking sequence length..." -python3 ${script_Dir}/check_length_by_input.py ${genome} ${outpath}/intermediate/${prefix}.seqkit $length $minimum_length ${outpath}/intermediate/${prefix}.plasmer.length.class ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta +python3 ${script_Dir}/check_length_by_input.py ${genome} ${outpath}/intermediate/${prefix}.seqkit $length $minimum_length ${outpath}/intermediate/${prefix}.plasmer.length.class ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.plasmer.shorterM.class ${outpath}/results/${prefix}.plasmer.shorterM.fasta echo ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta > ${outpath}/intermediate/${prefix}.sample.list @@ -171,7 +171,7 @@ perl ${script_Dir}/getFeatDomTbl.pl ${outpath}/intermediate/${prefix}.plasmer.le perl ${script_Dir}/getFeatDomTbl.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.replication.domtblout > ${outpath}/intermediate/${prefix}.replication.domtblout.feature #MPS - RDS score -perl ${script_Dir}/getFeatMDS.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.mps.dmnd.out /share/pasteur/zhuqh/lung_microbiome/chromosome_plasmid/platon_db/mps.tsv > ${outpath}/intermediate/${prefix}.mps.rds.feature +perl ${script_Dir}/getFeatMDS.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.mps.dmnd.out ${db}/platon_db/mps.tsv > ${outpath}/intermediate/${prefix}.mps.rds.feature perl ${script_Dir}/getFeatBlastn.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.orit.blastn.out > ${outpath}/intermediate/${prefix}.orit.feature @@ -201,7 +201,7 @@ echo "Predicting..." Rscript ${script_Dir}/predict_by_rpmodel.R ${db}/All.rk18pk25.rp.all2class.per0.8OR10w.traindata.rfmodel.RData ${outpath}/intermediate/${prefix}.allFeatures #Rscript ${script_Dir}/predict_by_rpmodel.R ${db}/All.sample6w.filter.features.rmGC.2class.rfmodel.RData ${outpath}/intermediate/${prefix}.allFeatures -cat ${outpath}/intermediate/${prefix}.allFeatures.plasmer.predClass.tsv ${outpath}/intermediate/${prefix}.plasmer.length.class > ${outpath}/results/${prefix}.plasmer.predClass.tsv +cat ${outpath}/intermediate/${prefix}.allFeatures.plasmer.predClass.tsv ${outpath}/intermediate/${prefix}.plasmer.length.class ${outpath}/intermediate/${prefix}.plasmer.shorterM.class > ${outpath}/results/${prefix}.plasmer.predClass.tsv #extract plasmids sequences python3 ${script_Dir}/extract_predPlasmids_seqs.py ${genome} ${outpath}/results/${prefix}.plasmer.predClass.tsv ${outpath}/results/${prefix}.plasmer.predPlasmids.fa diff --git a/scripts/check_length_by_input.py b/scripts/check_length_by_input.py index e638602..7a18d50 100644 --- a/scripts/check_length_by_input.py +++ b/scripts/check_length_by_input.py @@ -9,11 +9,14 @@ min_length = sys.argv[4].strip() classfile = sys.argv[5].strip() unclassfasta = sys.argv[6].strip() +labelshortfile = sys.argv[7].strip() +labelshortfasta = sys.argv[8].strip() + w1 = open(classfile,"w") unclassID = [] - +shortID = [] with open(seqkit) as r: for line in r: line = line.strip().split("\t") @@ -23,19 +26,34 @@ unclassID.append(line[0]) elif float(line[1]) > float(min_length) and float(max_length) == 0: unclassID.append(line[0]) + elif float(line[1]) <= float(min_length): + shortID.append(line[0]) w1.close() w2 = open(unclassfasta,'w') +w3 = open(labelshortfasta,"w") if genome.endswith('gz'): with gzip.open(genome,"rt") as f: for record in SeqIO.parse(f,"fasta"): if record.id in unclassID: SeqIO.write(record,w2,'fasta') + elif record.id in shortID: + SeqIO.write(record,w3,'fasta') + elif genome.endswith('fasta') or genome.endswith('fa') or genome.endswith('fna'): for record in SeqIO.parse(genome,"fasta"): if record.id in unclassID: SeqIO.write(record,w2,'fasta') - + elif record.id in shortID: + SeqIO.write(record,w3,'fasta') w2.close() +w3.close() + +w4 = open(labelshortfile,"w") +for i in shortID: + w4.write(i + "\t" + "shorter_than_" + str(min_length) + "\n") +w4.close() + +