From fb6b1fcf668ee64f8b35e323e4bdb70c914261cd Mon Sep 17 00:00:00 2001
From: Binghan Xiao <99703980+iskoldt-X@users.noreply.github.com>
Date: Thu, 20 Apr 2023 13:39:11 +0800
Subject: [PATCH] fixing paths

---
 scripts/Plasmer                  |  6 +++---
 scripts/check_length_by_input.py | 22 ++++++++++++++++++++--
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/scripts/Plasmer b/scripts/Plasmer
index 1345f4b..a284102 100755
--- a/scripts/Plasmer
+++ b/scripts/Plasmer
@@ -121,7 +121,7 @@ if [ ! -s ${outpath}/intermediate/${prefix}.seqkit ];then
 fi
 
 echo "Checking sequence length..."
-python3 ${script_Dir}/check_length_by_input.py ${genome} ${outpath}/intermediate/${prefix}.seqkit $length $minimum_length ${outpath}/intermediate/${prefix}.plasmer.length.class ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta
+python3 ${script_Dir}/check_length_by_input.py ${genome} ${outpath}/intermediate/${prefix}.seqkit $length $minimum_length ${outpath}/intermediate/${prefix}.plasmer.length.class ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.plasmer.shorterM.class ${outpath}/results/${prefix}.plasmer.shorterM.fasta
 
 echo ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta > ${outpath}/intermediate/${prefix}.sample.list
 
@@ -171,7 +171,7 @@ perl ${script_Dir}/getFeatDomTbl.pl ${outpath}/intermediate/${prefix}.plasmer.le
 perl ${script_Dir}/getFeatDomTbl.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.replication.domtblout > ${outpath}/intermediate/${prefix}.replication.domtblout.feature
 
 #MPS - RDS score
-perl ${script_Dir}/getFeatMDS.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.mps.dmnd.out /share/pasteur/zhuqh/lung_microbiome/chromosome_plasmid/platon_db/mps.tsv > ${outpath}/intermediate/${prefix}.mps.rds.feature
+perl ${script_Dir}/getFeatMDS.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.mps.dmnd.out ${db}/platon_db/mps.tsv > ${outpath}/intermediate/${prefix}.mps.rds.feature
 
 perl ${script_Dir}/getFeatBlastn.pl ${outpath}/intermediate/${prefix}.plasmer.length.unclass.fasta ${outpath}/intermediate/${prefix}.orit.blastn.out > ${outpath}/intermediate/${prefix}.orit.feature
 
@@ -201,7 +201,7 @@ echo "Predicting..."
 Rscript ${script_Dir}/predict_by_rpmodel.R ${db}/All.rk18pk25.rp.all2class.per0.8OR10w.traindata.rfmodel.RData ${outpath}/intermediate/${prefix}.allFeatures
 #Rscript ${script_Dir}/predict_by_rpmodel.R ${db}/All.sample6w.filter.features.rmGC.2class.rfmodel.RData ${outpath}/intermediate/${prefix}.allFeatures
 
-cat ${outpath}/intermediate/${prefix}.allFeatures.plasmer.predClass.tsv ${outpath}/intermediate/${prefix}.plasmer.length.class > ${outpath}/results/${prefix}.plasmer.predClass.tsv
+cat ${outpath}/intermediate/${prefix}.allFeatures.plasmer.predClass.tsv ${outpath}/intermediate/${prefix}.plasmer.length.class ${outpath}/intermediate/${prefix}.plasmer.shorterM.class > ${outpath}/results/${prefix}.plasmer.predClass.tsv
 
 #extract plasmids sequences
 python3 ${script_Dir}/extract_predPlasmids_seqs.py ${genome} ${outpath}/results/${prefix}.plasmer.predClass.tsv ${outpath}/results/${prefix}.plasmer.predPlasmids.fa
diff --git a/scripts/check_length_by_input.py b/scripts/check_length_by_input.py
index e638602..7a18d50 100644
--- a/scripts/check_length_by_input.py
+++ b/scripts/check_length_by_input.py
@@ -9,11 +9,14 @@
 min_length = sys.argv[4].strip()
 classfile = sys.argv[5].strip()
 unclassfasta = sys.argv[6].strip()
+labelshortfile = sys.argv[7].strip()
+labelshortfasta = sys.argv[8].strip()
+
 
 w1 = open(classfile,"w")
 
 unclassID = []
-
+shortID = []
 with open(seqkit) as r:
 	for line in r:
 		line = line.strip().split("\t")
@@ -23,19 +26,34 @@
 			unclassID.append(line[0])
 		elif float(line[1]) > float(min_length) and float(max_length) == 0:
 			unclassID.append(line[0])
+		elif float(line[1]) <= float(min_length):
+			shortID.append(line[0])
 
 w1.close()
 
 w2 = open(unclassfasta,'w')
+w3 = open(labelshortfasta,"w")
 
 if genome.endswith('gz'):
 	with gzip.open(genome,"rt") as f:
 		for record in SeqIO.parse(f,"fasta"):
 			if record.id in unclassID:
 				SeqIO.write(record,w2,'fasta')
+			elif record.id in shortID:
+				SeqIO.write(record,w3,'fasta')
+
 elif genome.endswith('fasta') or genome.endswith('fa') or genome.endswith('fna'):
 	for record in SeqIO.parse(genome,"fasta"):
 		if record.id in unclassID:
 			SeqIO.write(record,w2,'fasta')
-
+		elif record.id in shortID:
+			SeqIO.write(record,w3,'fasta')
 w2.close()
+w3.close()
+
+w4 = open(labelshortfile,"w")
+for i in shortID:
+	w4.write(i + "\t" + "shorter_than_" + str(min_length) + "\n")
+w4.close()
+
+