# Match Samples between marker datasets
- Remove Control Samples from 16S
- Remove duplicates from 18S

In [1]:
import pandas as pd

### Set Locations

In [2]:
directory = "./Data/merged_data/"
prefix =  'Merged2018'

# Import Data

In [3]:
markers = ['16S', '18S', 'COI']
prefix =  'Merged2018'
# create space to store dfs
asvs = []
taxas = []
metas = []
seqs = []

In [4]:
for marker in markers:
    #asv
    file = './Data/merged_data/'+prefix+'_'+marker+'_otu_filtered.csv'
    df = pd.read_csv(file)
    df.set_index('ASV', inplace=True)
    asv = df.copy()
    asvs.append(asv)

    #taxa
    file = './Data/merged_data/'+prefix+'_'+marker+'_taxa_filtered.csv'
    df = pd.read_csv(file)
    df.set_index('ASV', inplace=True)
    taxa = df.copy()
    taxas.append(taxa)

    #seq
    file = './Data/merged_data/'+prefix+'_'+marker+'_seq_filtered.csv'
    df = pd.read_csv(file)
    df.set_index('ASV', inplace=True)
    seq = df.copy()
    seqs.append(seq)

    #meta
    file = './Data/merged_data/'+prefix+'_'+marker+'_meta_filtered.csv'
    df = pd.read_csv(file)
    df.set_index('sample_name', inplace=True)
    meta = df.copy()
    metas.append(meta)


## 18S

- Remove duplicate samples in 18S (for now)
- remove NOAA filter ID in Lasker sample name to match with 16S

In [71]:
i=1
marker = markers[i]
print(markers[i])
df = metas[i].copy()
df = df.reset_index()
df = df.sort_values('seqID')
df.loc[df['sample_name'].str.contains('CN18F'), 'cruise'] = 'CN18F'
df.loc[df['sample_name'].str.contains('CN18S'), 'cruise'] = 'CN18S'
df.loc[df['sample_name'].str.contains('Lasker'), 'cruise'] = 'Lasker'
df = df.sort_values('cruise')
#remove ID and join name back up
df.loc[df['cruise']=='Lasker', 'sample_name'] = df['sample_name'].str.split('_').str[:-1].str.join('_')
df = df.drop_duplicates('sample_name')
df.set_index('sample_name', inplace=True)
print('Num_samples:',len(df.index))
samples_18S = df.index.tolist()
meta_project = df.copy()

# now from this metadata file limit asv file
df = df.reset_index().set_index('seqID')
#df = df[['sample_name']].copy()
df = pd.concat([df[['sample_name']], asvs[i].T], axis=1, join='inner')
print(len(df.index))
df.set_index('sample_name', inplace=True)
df = df.T
#remove 0 read ASVs
df['tot'] = df.sum(axis=1)
df = df.loc[df['tot']>0]
df = df.drop('tot', axis=1)
asv_project = df.copy()

#now need to pair down taxonomy file:
levels = list(taxas[i])
df = pd.concat([df, taxas[i]], axis=1, join='inner')
df = df[levels]
taxa_project = df.copy()

# for good measure do the same with the sequences:
df = pd.concat([df, seqs[i]], join='inner', axis=1)
df = df[['sequence']]
seq_project = df.copy()


18S
Num_samples: 131
131


### Save Files

In [60]:
plot_dir = './Data/merged_data/no_duplicates/'
marker = '18S'

In [61]:
#export to csv files for R plotting
dfs = [asv_project, taxa_project, seq_project, meta_project]
names = ['otu', 'taxa', 'seq', 'meta']
for df, name in zip(dfs,names):
    df.to_csv(plot_dir + 'Merged2018_'+marker+'_'+name+'_filtered.csv')
    print(plot_dir + 'Merged2018_'+marker+'_'+name+'_filtered.csv')
df.head()

./Data/merged_data/no_duplicates/Merged2018_18S_otu_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_18S_taxa_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_18S_seq_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_18S_meta_filtered.csv


Unnamed: 0_level_0,seqID,Dataset,order,NOAA_ID,library,library_tag_combo,tag_sequence,primer_sequence_F,primer_sequence_R,sample_type,...,F_primer,R_primer,primers_FR,seq_platform,seq_model,seq_chemistry,run_center,run_date,Description,cruise
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CN18FESPkoa_SC24,CN18FESPkoa_SC24_CC,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc21_6_eDNA,CN18Fc21_6_eDNA_HH,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc22_6_eDNA,CN18Fc22_6_eDNA_HH,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc24_6_eDNA,CN18Fc24_6_eDNA_HH,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc25_5_eDNA,CN18Fc25_5_eDNA_HH,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F


## COI

In [72]:
i=2
marker = markers[i]
print(markers[i])
df = metas[i].copy()
df = df.reset_index()
df = df.sort_values('seqID')
df.loc[df['sample_name'].str.contains('CN18F'), 'cruise'] = 'CN18F'
df.loc[df['sample_name'].str.contains('CN18S'), 'cruise'] = 'CN18S'
df.loc[df['sample_name'].str.contains('Lasker'), 'cruise'] = 'Lasker'
df = df.sort_values('cruise')
#remove ID and join name back up
#df.loc[df['cruise']=='Lasker', 'sample_name'] = df['sample_name'].str.split('_').str[:-1].str.join('_')
df = df.drop_duplicates('sample_name')
samples_COI = df.index.tolist()
df.set_index('sample_name', inplace=True)
print('Num_samples:',len(df.index))
meta_project = df.copy()

# now from this metadata file limit asv file
df = df.reset_index().set_index('seqID')
#df = df[['sample_name']].copy()
df = pd.concat([df[['sample_name']], asvs[i].T], axis=1, join='inner')
print(len(df.index))
df.set_index('sample_name', inplace=True)
df = df.T
#remove 0 read ASVs
df['tot'] = df.sum(axis=1)
df = df.loc[df['tot']>0]
df = df.drop('tot', axis=1)
asv_project = df.copy()

#now need to pair down taxonomy file:
levels = list(taxas[i])
df = pd.concat([df, taxas[i]], axis=1, join='inner')
df = df[levels]
taxa_project = df.copy()

# for good measure do the same with the sequences:
df = pd.concat([df, seqs[i]], join='inner', axis=1)
df = df[['sequence']]
seq_project = df.copy()


COI
Num_samples: 131
131


### Save Files

In [66]:
plot_dir = './Data/merged_data/no_duplicates/'
marker = 'COI'

In [67]:
#export to csv files for R plotting
dfs = [asv_project, taxa_project, seq_project, meta_project]
names = ['otu', 'taxa', 'seq', 'meta']
for df, name in zip(dfs,names):
    df.to_csv(plot_dir + 'Merged2018_'+marker+'_'+name+'_filtered.csv')
    print(plot_dir + 'Merged2018_'+marker+'_'+name+'_filtered.csv')
df.head()

./Data/merged_data/no_duplicates/Merged2018_COI_otu_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_COI_taxa_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_COI_seq_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_COI_meta_filtered.csv


Unnamed: 0_level_0,seqID,Dataset,order,library,library_tag_combo,tag_sequence,primer_sequence_F,primer_sequence_R,sample_type,sample_locus,...,F_primer,R_primer,primers_FR,seq_platform,seq_model,seq_chemistry,run_center,run_date,Description,cruise
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CN18FESPkoa_SC24,CN18FESPkoa_SC24_FF,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc21_6_eDNA,CN18Fc21_6_eDNA_GG,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc22_6_eDNA,CN18Fc22_6_eDNA_GG,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc25_5_eDNA,CN18Fc25_5_eDNA_GG,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18Fc27_4_eDNA,CN18Fc27_4_eDNA_GG,Merged2018,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F


## 16S

In [78]:
i=0
marker = markers[i]
print(markers[i])
df = metas[i].copy()
df = df.reset_index()
df = df.sort_values('seqID')
df.loc[df['sample_name'].str.contains('CN18F'), 'cruise'] = 'CN18F'
df.loc[df['sample_name'].str.contains('CN18S'), 'cruise'] = 'CN18S'
df.loc[df['sample_name'].str.contains('Lasker'), 'cruise'] = 'Lasker'
df = df.sort_values('cruise')
df = df.drop_duplicates('sample_name')

# only keep 16S samples that are in the 18S list:
df = df.loc[df['sample_name'].isin(samples_18S)]

df.set_index('sample_name', inplace=True)
print('Num_samples:',len(df.index))
meta_project = df.copy()

# now from this metadata file limit asv file
#df = df.reset_index().set_index('sample_name')
#df = df[['sample_name']].copy()
df = pd.concat([df[['seqID']], asvs[i].T], axis=1, join='inner')
print(len(df.index))
#df.set_index('sample_name', inplace=True)
df.drop('seqID', axis=1, inplace=True)
df = df.T
#remove 0 read ASVs
df['tot'] = df.sum(axis=1)
df = df.loc[df['tot']>0]
df = df.drop('tot', axis=1)
asv_project = df.copy()

#now need to pair down taxonomy file:
levels = list(taxas[i])
df = pd.concat([df, taxas[i]], axis=1, join='inner')
df = df[levels]
taxa_project = df.copy()

# for good measure do the same with the sequences:
df = pd.concat([df, seqs[i]], join='inner', axis=1)
df = df[['sequence']]
seq_project = df.copy()


16S
Num_samples: 131
131


### Save Files

In [79]:
plot_dir = './Data/merged_data/no_duplicates/'
marker = '16S'

In [80]:
#export to csv files for R plotting
dfs = [asv_project, taxa_project, seq_project, meta_project]
names = ['otu', 'taxa', 'seq', 'meta']
for df, name in zip(dfs,names):
    df.to_csv(plot_dir + 'Merged2018_'+marker+'_'+name+'_filtered.csv')
    print(plot_dir + 'Merged2018_'+marker+'_'+name+'_filtered.csv')
df.head()

./Data/merged_data/no_duplicates/Merged2018_16S_otu_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_16S_taxa_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_16S_seq_filtered.csv
./Data/merged_data/no_duplicates/Merged2018_16S_meta_filtered.csv


Unnamed: 0_level_0,Dataset,seqID,order,library,library_tag_combo,tag_sequence,primer_sequence_F,primer_sequence_R,sample_type,sample_locus,...,F_primer,R_primer,primers_FR,seq_platform,seq_model,seq_chemistry,run_center,run_date,Description,cruise
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CN18Fc43_5_eDNA,Merged2018,,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18FESPkoa_SC58,Merged2018,,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18FESPkoa_SC43,Merged2018,,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18FESPkoa_SC40,Merged2018,,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F
CN18FESPkoa_SC37,Merged2018,,,,,,,,,,...,515F (Parada),806R (Apprill),515F_806R,Illumina,MiSeq,2x150bp PE,Laragen,July_2019,Spring and fall 2018 Flyer cruise and paired E...,CN18F


## Match up 18S, COI, and 16S samples

- check they are all represented!

In [90]:
directory = './Data/merged_data/no_duplicates/'
markers = ['16S', '18S', 'COI']
prefix =  'Merged2018'
# create space to store dfs
asvs = []
taxas = []
metas = []
seqs = []

In [91]:
for marker in markers:
    #asv
    file = directory+prefix+'_'+marker+'_otu_filtered.csv'
    df = pd.read_csv(file)
    #df.set_index('ASV', inplace=True)
    asv = df.copy()
    asvs.append(asv)

    #taxa
    file = directory+prefix+'_'+marker+'_taxa_filtered.csv'
    df = pd.read_csv(file)
    #df.set_index('ASV', inplace=True)
    taxa = df.copy()
    taxas.append(taxa)

    #seq
    file = directory+prefix+'_'+marker+'_seq_filtered.csv'
    df = pd.read_csv(file)
    #df.set_index('ASV', inplace=True)
    seq = df.copy()
    seqs.append(seq)

    #meta
    file = directory+prefix+'_'+marker+'_meta_filtered.csv'
    df = pd.read_csv(file)
    df.set_index('sample_name', inplace=True)
    meta = df.copy()
    metas.append(meta)


In [95]:
# join together metadata files
dfs=[]
for i in range(len(markers)):
    df = metas[i]
    df[markers[i]]=1
    df = df[[markers[i]]]
    dfs.append(df)
df = pd.concat(dfs, axis=1)
df = df.sort_values(['16S', '18S', 'COI'])
df['tot'] = df.sum(axis=1)
df = df.sort_values('tot')
df

Unnamed: 0_level_0,16S,18S,COI,tot
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CN18Fc43_5_eDNA,1,1,1,3
Lasker18Sc124_17,1,1,1,3
Lasker18Sc126_5,1,1,1,3
Lasker18Sc126_3,1,1,1,3
Lasker18Sc126_17,1,1,1,3
...,...,...,...,...
CN18FESPkoa_SC53,1,1,1,3
CN18FESPkoa_SC24,1,1,1,3
CN18FESPkoa_SC52,1,1,1,3
CN18FESPkoa_SC55,1,1,1,3
