diff --git a/snakePipes/shared/rules/three_prime_seq.snakefile b/snakePipes/shared/rules/three_prime_seq.snakefile index ae572059e..26fbda21e 100644 --- a/snakePipes/shared/rules/three_prime_seq.snakefile +++ b/snakePipes/shared/rules/three_prime_seq.snakefile @@ -150,7 +150,7 @@ rule preprocess_cluster_pas: input: find_replicates_cluster_pas output: - temp("three_prime_seq/tmp/condition-{condition}_preprocessed.txt") + "three_prime_seq/tmp/condition-{condition}_preprocessed.txt" shell: "cat {input} | " "sed '/^[ ]*Chrom/ d' | " @@ -162,7 +162,7 @@ rule clusterPAS: input: "three_prime_seq/tmp/condition-{condition}_preprocessed.txt" output: - temp("three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt") + "three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt" conda: CONDA_SHARED_ENV params: @@ -175,6 +175,7 @@ rule clusterPAS: # awk command: remove entries with multiple genes in 4th column (must be unambiguous) # python script: add "_1", "_2", to each cluster label (4th column) to make each # unique for each genomic position +# also strip header of "bedlike" file rule postprocess_cluster_pas: input: "three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt" diff --git a/snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py b/snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py index a5129465b..82a90c781 100644 --- a/snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py +++ b/snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py @@ -28,11 +28,9 @@ def dedup(sub_df): def main(): - df = pd.read_table(sys.stdin, index_col=None, header=None) - df.columns = HEADERS - df = df.groupby("Gene").apply(dedup) - df.to_csv(sys.stdout, sep="\t", index=False) - + df = pd.read_table(sys.stdin, index_col=None, header=0) + df = df.groupby("Gene", group_keys=False).apply(dedup).reset_index(drop=True) + df.to_csv(sys.stdout, sep="\t", index=False, header=False) if __name__ == "__main__": main() \ No newline at end of file