## GISAID Metadata preparation

In [1]:
import pandas as pd
import numpy as np
import glob, os, re
from datetime import  datetime

In [2]:
dt = datetime.today().strftime(format='%d-%m-%Y')
ref = 'seq16-21'

### **Establish parent directory**

In [3]:
#uniqueness in directory and file names is assumed for all analyses
sars_dir = "SARS-CoV-2"
home_dir = os.getenv('HOME')
parent_dir = glob.glob(f'{home_dir}/**/{sars_dir}', recursive=True)[0]

### Load submission form

In [4]:
df_gisaid = pd.read_excel(glob.glob(f'{parent_dir}/**/20210222_EpiCoV.xls', recursive=True)[0], 'Submissions')

In [5]:
df_gisaid.head()

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,covv_location,covv_add_location,covv_host,covv_add_host_info,...,covv_coverage,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type
0,Submitter,FASTA filename,Virus name,Type,Passage details/history,Collection date,Location,Additional location information,Host,Additional host information,...,Coverage,Originating lab,Address,Sample ID given by originating laboratory,Submitting lab,Address,Sample ID given by the submitting laboratory,Authors,Comment,Comment Icon
1,GISAID username,all_sequences.fasta,hCoV-19/Country/Identifier/2020,betacoronavirus,"e.g. Original, Vero",2020-03-02,e.g. Continent / Country / Region,"e.g. Cruise Ship, Convention, Live animal market","e.g. Human, Animal, Environment, Laboratory de...",e.g. Patient infected while traveling in ….,...,"e.g. 70x, 1,000x, 10,000x (average)",Where the clinical specimen or virus isolate w...,,,Where sequence data have been generated and su...,,,"e.g. Jane Doe, John Doe",,


In [6]:
df_gisaid.columns

Index(['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_collection_date', 'covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_gender', 'covv_patient_age', 'covv_patient_status',
       'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated',
       'covv_treatment', 'covv_seq_technology', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr',
       'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [7]:
gisaid_header = ['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage', 'DT_SAM_COLL', 
 'DT_SAM_RECEP', 'covv_location', 'covv_add_location','covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
 'GEND', 'AGE_YRS', 'covv_patient_status', 'covv_specimen', 'covv_outbreak', 
 'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology', 'covv_assembly_method', 'covv_coverage', 
 'covv_orig_lab', 'covv_orig_lab_addr', 'S_NUM', 'covv_subm_lab', 'covv_subm_lab_addr', 
 'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type']

### **Pull-in the metadata**

In [8]:
df_metadata = pd.read_excel(glob.glob(f'{parent_dir}/**/COVID19-resultsCts-merged-cln.xlsx', recursive=True)[0])

In [9]:
#get the columns necessary
df_headers = df_metadata[['S_NUM', 'AGE_YRS', 'GEND', 'NAT', 
           'COUNT_RES',  
           'DT_SAM_COLL', 
           'DT_SAM_RECEP']]

In [10]:
df_gisaid_all = pd.read_excel(glob.glob(f'{parent_dir}/**/seq-summary-metadata_seq16-21_gsaid_01-11-2021.xlsx')[0], usecols=['sequence_name'])

In [11]:
df_gisaid_all.head

<bound method NDFrame.head of     sequence_name
0       COVC24477
1       COVC24478
2       COVC24479
3       COVC24480
4       COVC24481
..            ...
828     COVM01412
829     COVM01413
830     COVM01415
831     COVM01416
832     COVM01417

[833 rows x 1 columns]>

In [12]:
df_mrgl = df_gisaid_all.merge(df_headers, how='left', left_on='sequence_name', right_on='S_NUM').drop('sequence_name', axis=1)
df_mrgl.shape

(833, 7)

In [13]:
df_mrgl.head()

Unnamed: 0,S_NUM,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP
0,COVC24477,80.0,F,Kenya,Nyamira,2021-09-18,2021-09-21
1,COVC24478,76.0,F,Kenya,Nyamira,2021-09-19,2021-09-21
2,COVC24479,60.0,F,Kenya,Nyamira,2021-09-20,2021-09-21
3,COVC24480,36.0,F,Kenya,Nyamira,2021-09-20,2021-09-21
4,COVC24481,61.0,F,Kenya,Nyamira,2021-09-18,2021-09-21


### **Be sure the next input is update**

In [14]:
df_seq_summ = pd.read_excel(glob.glob(f'{parent_dir}/**/seq-summary-metadata_seq16-21_gsaid_01-11-2021.xlsx')[0])
df_seq_summ.shape

(833, 10)

In [15]:
df_seqd_int = df_seq_summ.sort_values(['sequence_name', 'genome_coverage']).drop_duplicates('sequence_name', keep='last')
df_seqd_int.shape#.head()

(833, 10)

In [16]:
df_seqd_int.shape

(833, 10)

In [17]:
df_seqd_int.head()

Unnamed: 0,sequence_name,clade,lineage,genome_coverage,run_num,tech,seq_dt,lib_prep,primer_set,sno.
480,COVC17732,20C,B.1,92.4,Run16_ONT,ONT,15-09-2021,NEBNext,ARTIC_V3,1967
481,COVC17734,20C,B.1,91.4,Run16_ONT,ONT,15-09-2021,NEBNext,ARTIC_V3,1968
482,COVC17735,20C,B.1,94.2,Run16_ONT,ONT,15-09-2021,NEBNext,ARTIC_V3,1969
483,COVC17737,20C,B.1,91.5,Run16_ONT,ONT,15-09-2021,NEBNext,ARTIC_V3,1970
484,COVC17739,20C,B.1,97.2,Run16_ONT,ONT,15-09-2021,NEBNext,ARTIC_V3,1971


In [18]:
#Merge metadata with seq summary data
df_seq_meta_on = df_mrgl.merge(df_seqd_int, how='left', left_on='S_NUM', right_on='sequence_name')
df_seq_meta_on.shape

(833, 17)

In [19]:
mask = df_seq_meta_on.duplicated('sequence_name', keep=False)
df_seq_meta_on[mask == True]

Unnamed: 0,S_NUM,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,sequence_name,clade,lineage,genome_coverage,run_num,tech,seq_dt,lib_prep,primer_set,sno.


In [20]:
df_seq_meta_on.head()

Unnamed: 0,S_NUM,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,sequence_name,clade,lineage,genome_coverage,run_num,tech,seq_dt,lib_prep,primer_set,sno.
0,COVC24477,80.0,F,Kenya,Nyamira,2021-09-18,2021-09-21,COVC24477,21J (Delta),AY.7.1,95.6,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1487
1,COVC24478,76.0,F,Kenya,Nyamira,2021-09-19,2021-09-21,COVC24478,21J (Delta),AY.4,96.4,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1488
2,COVC24479,60.0,F,Kenya,Nyamira,2021-09-20,2021-09-21,COVC24479,21J (Delta),AY.7.1,88.5,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1489
3,COVC24480,36.0,F,Kenya,Nyamira,2021-09-20,2021-09-21,COVC24480,21A (Delta),AY.16,96.7,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1490
4,COVC24481,61.0,F,Kenya,Nyamira,2021-09-18,2021-09-21,COVC24481,21A (Delta),AY.16,94.6,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1491


In [21]:
#Merge metadata with seq summary data
df_seq_metal = df_mrgl.set_index('S_NUM').merge(df_seqd_int.set_index('sequence_name'), how='left', left_index=True, right_index=True)
df_seq_metal.shape

(833, 15)

In [22]:
def get_seq_tech(x):
    x = str(x)
    if 'ONT' in x:
        return x.replace(x, 'MinION')
    elif '_NS' in x:
        return x.replace(x, 'NextSeq')
    return x.replace(x, 'MiSeq')
        

In [23]:
df_seq_meta2 = df_seq_metal.assign(covv_seq_technology = df_seq_metal['run_num']
                                  .apply(lambda x: get_seq_tech(x)))

In [24]:
df_seq_meta2.shape

(833, 16)

In [25]:
df_seq_meta2.head()

Unnamed: 0_level_0,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,clade,lineage,genome_coverage,run_num,tech,seq_dt,lib_prep,primer_set,sno.,covv_seq_technology
S_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
COVC24477,80.0,F,Kenya,Nyamira,2021-09-18,2021-09-21,21J (Delta),AY.7.1,95.6,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1487,NextSeq
COVC24478,76.0,F,Kenya,Nyamira,2021-09-19,2021-09-21,21J (Delta),AY.4,96.4,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1488,NextSeq
COVC24479,60.0,F,Kenya,Nyamira,2021-09-20,2021-09-21,21J (Delta),AY.7.1,88.5,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1489,NextSeq
COVC24480,36.0,F,Kenya,Nyamira,2021-09-20,2021-09-21,21A (Delta),AY.16,96.7,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1490,NextSeq
COVC24481,61.0,F,Kenya,Nyamira,2021-09-18,2021-09-21,21A (Delta),AY.16,94.6,Run18_NS,Illumina,10-10-2021,NEBNext_FS,ARTIC_V3,1491,NextSeq


In [26]:
df_seq_meta3 = df_seq_meta2.assign(covv_location = 'Africa/' + df_seq_meta2['NAT'] + '/' + df_seq_meta2['COUNT_RES'])

In [27]:
df_seq_meta3.shape

(833, 17)

In [28]:
df_subf = df_seq_meta3

In [29]:
submitter = 'Soyola'
fn = 'gisaid_all.fasta'
cvn = 'hCoV-19/Kenya/ILRI_'
vt = 'betacoronavirus'
cp = 'Original'
cal = 'unknown'
ch = 'Human'
cahi = 'unknown'
css = 'Surveillance'
cps = 'unknown'
cps = 'unknown'
cs = 'NP Swab'
co = 'unknown'
clv = 'unknown'
ct = 'unknown'
# cst = 'Illumina'
cam = 'Consensus'
cc = 'unknown'
col = 'International Livestock Research Institute'
cola = 'Uthiru, Naivasha road, Nairobi-Kenya'
# cpsi = 'ILRI'
csl = 'International Livestock Research Institute'
csla = 'Uthiru, Naivasha road, Nairobi-Kenya'
cssi = df_subf.index.tolist()
ca = 'Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, Shebbar Osiany, Edward Kiritu, Paul Dobi, Collins Muli, Patrick Amoth, Vishvanath Nene, Sonal P. Henson, Edward O. Abworo'
ccomm = ''
ct = ''

In [30]:
df_subf['submitter'] = submitter
df_subf['fn'] = fn
df_subf['covv_virus_name'] = cvn
df_subf['covv_type'] = vt
df_subf['covv_passage'] = cp
df_subf['covv_add_location'] = cal
df_subf['covv_host'] = ch
df_subf['covv_add_host_info'] = cahi
df_subf['covv_sampling_strategy'] = css
df_subf['covv_patient_status'] = cps
df_subf['covv_specimen'] = cs
df_subf['covv_outbreak'] = co
df_subf['covv_last_vaccinated'] = clv
df_subf['covv_treatment'] = ct
# df_subf['covv_seq_technology'] = cst
df_subf['covv_assembly_method'] = cam
df_subf['covv_coverage'] = cc
df_subf['covv_orig_lab'] = col
df_subf['covv_orig_lab_addr'] = cola
# df_subf['covv_provider_sample_id'] = cpsi
df_subf['covv_subm_lab'] = csl
df_subf['covv_subm_lab_addr'] = csla
df_subf['covv_subm_sample_id'] = cssi
df_subf['covv_authors'] = ca
df_subf['covv_comment'] = ccomm
df_subf['comment_type'] = ct
df_subf.shape

(833, 41)

In [31]:
df_subf.columns

Index(['AGE_YRS', 'GEND', 'NAT', 'COUNT_RES', 'DT_SAM_COLL', 'DT_SAM_RECEP',
       'clade', 'lineage', 'genome_coverage', 'run_num', 'tech', 'seq_dt',
       'lib_prep', 'primer_set', 'sno.', 'covv_seq_technology',
       'covv_location', 'submitter', 'fn', 'covv_virus_name', 'covv_type',
       'covv_passage', 'covv_add_location', 'covv_host', 'covv_add_host_info',
       'covv_sampling_strategy', 'covv_patient_status', 'covv_specimen',
       'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [32]:
df_subf1 = df_subf.drop(['NAT', 'COUNT_RES'], axis=1)

In [34]:
df_filter80 = df_subf1[df_subf1['genome_coverage'] >= 80.0]#.fillna('unknown')

In [35]:
df_filter80.shape

(685, 39)

In [36]:
df_filter80.columns

Index(['AGE_YRS', 'GEND', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'clade', 'lineage',
       'genome_coverage', 'run_num', 'tech', 'seq_dt', 'lib_prep',
       'primer_set', 'sno.', 'covv_seq_technology', 'covv_location',
       'submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_add_location', 'covv_host', 'covv_add_host_info',
       'covv_sampling_strategy', 'covv_patient_status', 'covv_specimen',
       'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [37]:
df_gisaid_sub = df_filter80.reset_index().rename(columns={'index': 'S_NUM'})[gisaid_header] #.drop('DT_SAM_COLL', axis=1)

In [38]:
df_gisaid_sub.head()

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,DT_SAM_COLL,DT_SAM_RECEP,covv_location,covv_add_location,covv_host,...,covv_coverage,covv_orig_lab,covv_orig_lab_addr,S_NUM,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type
0,Soyola,gisaid_all.fasta,hCoV-19/Kenya/ILRI_,betacoronavirus,Original,2021-09-18,2021-09-21,Africa/Kenya/Nyamira,unknown,Human,...,unknown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24477,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24477,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
1,Soyola,gisaid_all.fasta,hCoV-19/Kenya/ILRI_,betacoronavirus,Original,2021-09-19,2021-09-21,Africa/Kenya/Nyamira,unknown,Human,...,unknown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24478,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24478,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
2,Soyola,gisaid_all.fasta,hCoV-19/Kenya/ILRI_,betacoronavirus,Original,2021-09-20,2021-09-21,Africa/Kenya/Nyamira,unknown,Human,...,unknown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24479,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24479,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
3,Soyola,gisaid_all.fasta,hCoV-19/Kenya/ILRI_,betacoronavirus,Original,2021-09-20,2021-09-21,Africa/Kenya/Nyamira,unknown,Human,...,unknown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24480,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24480,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
4,Soyola,gisaid_all.fasta,hCoV-19/Kenya/ILRI_,betacoronavirus,Original,2021-09-18,2021-09-21,Africa/Kenya/Nyamira,unknown,Human,...,unknown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24481,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC24481,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,


In [None]:
# df_gisaid_sub_srt = df_gisaid_sub.assign(DT_SAM_COLL=df_gisaid_sub.sort_values('DT_SAM_COLL')['DT_SAM_COLL'].map(lambda x: x.strftime('%Y-%m-%d')))

In [39]:
df_gisaid_sub.columns

Index(['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'DT_SAM_COLL', 'DT_SAM_RECEP', 'covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy', 'GEND',
       'AGE_YRS', 'covv_patient_status', 'covv_specimen', 'covv_outbreak',
       'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'S_NUM', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

### **Confirm the following input file is update**

In [40]:
#DT_SAM_RECEP
gisaid_cols = ['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_collection_date', 'DT_SAM_RECEP','covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_gender', 'covv_patient_age', 'covv_patient_status',
       'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated',
       'covv_treatment', 'covv_seq_technology', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr',
       'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type']


In [41]:
df_gisaid_sub.columns = gisaid_cols

In [42]:
df_dt = df_gisaid_sub[df_gisaid_sub['covv_collection_date'].isna() == False]
df_dt_fmt = df_dt.assign(covv_collection_date=df_dt['covv_collection_date'].dt.strftime('%Y-%m-%d'))
df_missing_dt = df_gisaid_sub[df_gisaid_sub['covv_collection_date'].isna() == True]

In [43]:
df_missing_dt_corr = df_missing_dt.assign(covv_collection_date=df_missing_dt['DT_SAM_RECEP'].apply(lambda x: pd.Timestamp(x).strftime('%Y-%m') if (isinstance(x, pd.Timestamp)) else x.replace(x, pd.NaT)))

In [44]:
df_gisaid_fin = df_dt_fmt.append(df_missing_dt_corr)

In [45]:
df_gisaid_fin.shape

(685, 31)

In [47]:
df_prev_sub = pd.read_excel(glob.glob(f'{parent_dir}/**/Submissions/20210914_ILRI_gisaid_submission_metadata_curated.xls')[0], sheet_name='Submissions', usecols=['covv_provider_sample_id'])

In [57]:
df_gisaid_fil = df_gisaid_fin[df_gisaid_fin['covv_provider_sample_id'].isin(df_prev_sub.covv_provider_sample_id) == False].fillna('unknown') 

df_gisaid_fil.to_excel(f"{glob.glob(f'{parent_dir}/Gisaid')[0]}/gisaid_data_{ref}_{dt}.xlsx", index=False)

pd.Series(df_gisaid_fil['covv_provider_sample_id']).to_csv(f"{glob.glob(f'{parent_dir}/Gisaid')[0]}/gisaid_IDs_{ref}_{dt}.csv", index=False, header=False)

df_gisaid_fil.shape

(685, 31)

In [None]:
df_gisaid_fil.to_csv

In [113]:
df_final_feedB = pd.read_excel(glob.glob(f'{parent_dir}/**/Submissions/20210914_ILRI_gisaid_submission_metadata_curated.xls')[0], sheet_name='Submissions')
df_final_feedB['covv_seq_technology'].value_counts()

NextSeq                  55
MiSeq                    39
Minion                   10
Sequencing technology     1
Name: covv_seq_technology, dtype: int64

In [89]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   submitter                732 non-null    object
 1   fn                       732 non-null    object
 2   covv_virus_name          732 non-null    object
 3   covv_type                732 non-null    object
 4   covv_passage             732 non-null    object
 5   covv_collection_date     732 non-null    object
 6   covv_location            732 non-null    object
 7   covv_add_location        732 non-null    object
 8   covv_host                732 non-null    object
 9   covv_add_host_info       732 non-null    object
 10  covv_sampling_strategy   732 non-null    object
 11  covv_gender              732 non-null    object
 12  covv_patient_age         732 non-null    object
 13  covv_patient_status      732 non-null    object
 14  covv_specimen            732 non-null    o

### **Get genome coverage data from nextclade output**

In [116]:
df_nxt_ill = pd.read_csv(glob.glob(f'{parent_dir}/**/Illumina/transposed_report.tsv')[0], sep='\t')

In [68]:
df_nxt_ill.head(12)

Unnamed: 0,Assembly,# contigs (>= 0 bp),# contigs (>= 1000 bp),# contigs (>= 5000 bp),# contigs (>= 10000 bp),# contigs (>= 25000 bp),# contigs (>= 50000 bp),Total length (>= 0 bp),Total length (>= 1000 bp),Total length (>= 5000 bp),...,Largest alignment,Total aligned length,NA50,NGA50,NA75,NGA75,LA50,LGA50,LA75,LGA75
0,COCV11310_S6.consensus,1,1,1,1,1,0,29903,29903,29903,...,17906,17906,17906,17906,-,-,1,1,-,-
1,COVC03617_S74.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1
2,COVC03665_S75.consensus,1,1,1,1,1,0,29903,29903,29903,...,29300,29300,29300,29300,29300,29300,1,1,1,1
3,COVC03696_S76.consensus,1,1,1,1,1,0,29903,29903,29903,...,29511,29511,29511,29511,29511,29511,1,1,1,1
4,COVC03723_S77.consensus,1,1,1,1,1,0,29903,29903,29903,...,29667,29667,29667,29667,29667,29667,1,1,1,1
5,COVC03813_S78.consensus,1,1,1,1,1,0,29903,29903,29903,...,29275,29275,29275,29275,29275,29275,1,1,1,1
6,COVC03955_S79.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1
7,COVC03960_S80.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1
8,COVC03965_S81.consensus,1,1,1,1,1,0,29903,29903,29903,...,29275,29275,29275,29275,29275,29275,1,1,1,1
9,COVC03977_S82.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1


In [70]:
df_nxt_ill_gcov = df_nxt_ill[['Assembly', 'Genome fraction (%)']]

In [71]:
df_nxt_ill_gcov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Assembly             952 non-null    object
 1   Genome fraction (%)  952 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


In [73]:
def clean(x):
    if float(x) and x != '-':
        return round(float(x), 1)
    else:
        return np.NaN

In [76]:
#get sample ids only and rounded genome coverage
p = re.compile("_S[1-9]{0,3}\w.*")
new_ass = df_nxt_ill_gcov['Assembly'].apply(lambda x: p.sub('', x))
new_gen_cov = df_nxt_ill_gcov['Genome fraction (%)'].replace('-', np.NaN).apply(lambda x: clean(x))

In [77]:
new_ass

0       COCV11310
1       COVC03617
2       COVC03665
3       COVC03696
4       COVC03723
          ...    
947    NPHL_12780
948         NPHL1
949         NPHL2
950         NPHL3
951         NPHL4
Name: Assembly, Length: 952, dtype: object

In [78]:
new_gen_cov

0      59.9
1      98.6
2      98.0
3      98.7
4      99.2
       ... 
947    26.9
948    99.0
949    74.7
950    98.1
951    61.0
Name: Genome fraction (%), Length: 952, dtype: float64

In [79]:
df_nxt_ill_gcov_curated = pd.DataFrame({'sample_id': new_ass, 'genome_cov': new_gen_cov})

In [72]:
df_nxt_ill_gcov_curated[df_nxt_ill_gcov_curated['genome_cov'] >= 80.0].to_excel(glob.glob(f'{parent_dir}/**/Gisaid/Illumina/Illumina-genome-coverage80.xlsx')[0], index=False)

In [81]:
df_nxt_ont = pd.read_csv(glob.glob(f'{parent_dir}/**/Gisaid/ONT/transposed_report.tsv')[0], sep='\t')

In [89]:
df_nxt_ont_gcov = df_nxt_ont[['Assembly', 'Genome fraction (%)']]

In [71]:
df_nxt_ont_gcov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Assembly             952 non-null    object
 1   Genome fraction (%)  952 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


In [76]:
#get sample ids only and rounded genome coverage
p = re.compile("_S[1-9]{0,3}\w.*")
new_ass_ont = df_nxt_ont_gcov['Assembly'].apply(lambda x: p.sub('', x))
new_gen_cov_ont = df_ont_gcov['Genome fraction (%)'].replace('-', np.NaN).apply(lambda x: clean(x))

In [77]:
new_ass_ont

0       COCV11310
1       COVC03617
2       COVC03665
3       COVC03696
4       COVC03723
          ...    
947    NPHL_12780
948         NPHL1
949         NPHL2
950         NPHL3
951         NPHL4
Name: Assembly, Length: 952, dtype: object

In [78]:
new_gen_cov_ont

0      59.9
1      98.6
2      98.0
3      98.7
4      99.2
       ... 
947    26.9
948    99.0
949    74.7
950    98.1
951    61.0
Name: Genome fraction (%), Length: 952, dtype: float64

In [79]:
df_nxt_ont_gcov_curated = pd.DataFrame({'sample_id': new_ass_ont, 'genome_cov': new_gen_cov_ont})

### **Reconcile the below output and the corresponding Illumina output to remove both within and across duplicate sequencing**
The next steps depend on these: appended `_nodup` suffix to resulting file name in `.csv` to allow filtering with `bash`

In [72]:
df_nxt_ont_gcov_curated[df_nxt_ont_gcov_curated['genome_cov'] >= 80.0].to_excel(glob.glob(f'{parent_dir}/**/Gisaid/ONT/ONT-genome-coverage80.xlsx')[0], index=False)

### **QC final data vs. input**

In [115]:
df_ill_ids = pd.read_csv(glob.glob(f'{parent_dir}/**/Illumina-genome-coverage80_nodup.csv')[0], names=['sample_id'])
df_ill_ids.shape

(663, 1)

In [66]:
df_gisaid_fin.merge(df_ill_ids, how='inner', left_on='covv_provider_sample_id', right_on='sample_id').shape

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,DT_SAM_RECEP,covv_location,covv_add_location,covv_host,...,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type,sample_id
0,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03617
1,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03665
2,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03696
3,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03723
4,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-10 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL-12773,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL-12773,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL-12773
646,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL1,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL1,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL1
647,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL2,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL2,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL2
648,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL3,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL3,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL3


In [56]:
df_ont_ids = pd.read_csv(glob.glob(f'{parent_dir}/**/ONT-genome-coverage80.xlsx_nodup.csv')[0], names=['sample_id'])
df_ont_ids.shape

Unnamed: 0,sample_id
0,COVC14674
1,COVC14676
2,COVC14699
3,COVC14753
4,COVC14784


In [66]:
df_gisaid_fin.merge(df_ont_ids, how='inner', left_on='covv_provider_sample_id', right_on='sample_id').shape

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,DT_SAM_RECEP,covv_location,covv_add_location,covv_host,...,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type,sample_id
0,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03617
1,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03665
2,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03696
3,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03723
4,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-10 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL-12773,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL-12773,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL-12773
646,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL1,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL1,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL1
647,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL2,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL2,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL2
648,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL3,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL3,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL3
