In [5]:
import os
import logging
import pandas as pd
import numpy as np
from collections import defaultdict

In [6]:
def get_sample_files(path,outfile):
    """Build a per-sample table of FASTQ files found under ``path``.

    Every file below ``path`` whose name contains ``.fastq`` or ``.fq`` is
    assigned to a sample. The sample ID is the file name up to the first
    ``.fastq``/``.fq`` with the read tags ``_R1``, ``_r1``, ``_R2`` and ``_r2``
    removed. Files whose name contains ``_R2``/``_r2`` go to the ``R2`` column,
    all others to ``R1``.

    A ``GROUP`` column is derived from the sample ID: the text between the
    first and second ``-`` is parsed as an integer; values below 101 map to
    ``'lib1'``, all others to ``'lib2'``.

    Parameters
    ----------
    path : str
        Directory that is walked recursively.
    outfile : str
        Destination of the tab-separated table. It is written (overwriting
        any existing file) only when no cell of the table is missing.

    Returns
    -------
    pandas.DataFrame
        One row per sample ID with the file-path columns and ``GROUP``.

    Raises
    ------
    IndexError, ValueError
        If a sample ID does not contain ``-`` followed by an integer.
    """
    samples = defaultdict(dict)

    for dir_name, sub_dirs, files in os.walk(os.path.abspath(path)):
        # Sorting in place makes os.walk descend in a reproducible order, so
        # row order and duplicate resolution do not depend on the filesystem.
        sub_dirs.sort()

        for fname in sorted(files):
            if ".fastq" not in fname and ".fq" not in fname:
                continue

            sample_id = fname.split(".fastq")[0].split(".fq")[0]
            for read_tag in ("_R1", "_r1", "_R2", "_r2"):
                sample_id = sample_id.replace(read_tag, "")

            fq_path = os.path.join(dir_name, fname)
            read = 'R2' if ("_R2" in fname or "_r2" in fname) else 'R1'

            if read in samples[sample_id]:
                # Really skip the duplicate (the original logged "skipping"
                # but then overwrote the stored path).
                logging.error(
                    f"Duplicate sample {sample_id} was found after renaming; "
                    f"keeping {samples[sample_id][read]} and skipping {fq_path}"
                )
                continue

            samples[sample_id][read] = fq_path

    samples = pd.DataFrame(samples).T
    samples['GROUP'] = (
        samples.index.to_series()
        .apply(lambda x: int(x.split("-")[1]) < 101)
        .map({True: 'lib1', False: 'lib2'})
    )

    if samples.isna().any().any():
        # A sample lacks one of its mates: report it and do not write a table.
        logging.error(f"Missing files:\n {samples}")
    else:
        samples.to_csv(outfile,sep='\t')

    return samples

In [7]:
# Common parent directory of all project folders used in this notebook.
_MICROBIOLOGY_DIR = "/home/lvelo/working_directory/microbiology"

# Per-project input directories (raw reads and adapter directories).
LUMI_raw = f"{_MICROBIOLOGY_DIR}/LUMI_2/00_raw/"
LUMI_adapter = f"{_MICROBIOLOGY_DIR}/LUMI_2/00_adapter/"
METEL_raw = f"{_MICROBIOLOGY_DIR}/METEL/00_raw/"
METEL_adapter = f"{_MICROBIOLOGY_DIR}/METEL/00_adapter/"
MARVIN_raw = f"{_MICROBIOLOGY_DIR}/MARVIN/00_raw/"
# NOTE(review): "00_adaptors" differs from the "00_adapter" spelling used for
# the other projects; confirm this matches the directory on disk.
MARVIN_adapter = f"{_MICROBIOLOGY_DIR}/MARVIN/00_adaptors/"

In [8]:
# Scan the LUMI adapter directory and save the sample table as LUMI_adapter.tsv.
# NOTE(review): the table displayed further down lists paths under
# ".../microbiology/MAR...", not LUMI_2; confirm which directory this cell
# should scan.
D = get_sample_files(
    path=LUMI_adapter,
    outfile="LUMI_adapter.tsv",
)

In [6]:
# Order the rows by their index labels (the sample IDs), then display the table.
D = D.sort_index(axis=0, ascending=True)
D

Unnamed: 0,R1,R2,GROUP
AUPO-101,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-102,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-103,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-104,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-105,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
...,...,...,...
AUPO-264,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-265,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-266,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2
AUPO-267,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2


In [7]:
# Label each sample by the integer after the first "-" in its ID:
# numbers below 176 are tagged 'lung', all others 'gut'.
D['place'] = [
    'lung' if int(sample_id.split("-")[1]) < 176 else 'gut'
    for sample_id in D.index
]
D

Unnamed: 0,R1,R2,GROUP,place
AUPO-101,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,lung
AUPO-102,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,lung
AUPO-103,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,lung
AUPO-104,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,lung
AUPO-105,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,lung
...,...,...,...,...
AUPO-264,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,gut
AUPO-265,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,gut
AUPO-266,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,gut
AUPO-267,/home/lvelo/working_directory/microbiology/MAR...,/home/lvelo/working_directory/microbiology/MAR...,lib2,gut


In [8]:
# Export the gut samples plus one extra row to MARVIN_gut_raw.tsv.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same stacked frame on every pandas version.
gut_rows = D[D['place'] == 'gut']

# First row found when slicing the index from label 'AUPO-151' onwards.
# NOTE(review): presumably a deliberately included non-gut sample; confirm.
extra_row = D.loc['AUPO-151':].head(n=1)

pd.concat([gut_rows, extra_row]).to_csv('MARVIN_gut_raw.tsv', sep='\t')