## **GISAID Submission Metadata Preparation**

In [1]:
import pandas as pd
import numpy as np
import glob, os, re
from datetime import  datetime

### **Preliminary variables**

In [2]:
# Locate the project root: the first directory named `sars_dir` found anywhere
# below the user's home directory. Uniqueness in directory and file names is
# assumed for all analyses, so the first match is taken.
sars_dir = "SARS-CoV-2"
# expanduser('~') returns $HOME when it is set and otherwise falls back to the
# platform's own lookup; os.getenv('HOME') would give None in that case and the
# glob below would silently search under the literal path "None/".
home_dir = os.path.expanduser('~')
_parent_dir_matches = glob.glob(f'{home_dir}/**/{sars_dir}', recursive=True)
if not _parent_dir_matches:
    # fail with an explicit message instead of an IndexError on an empty list
    raise FileNotFoundError(f"no '{sars_dir}' directory found under '{home_dir}'")
parent_dir = _parent_dir_matches[0]

### **Load submission form**

In [3]:
# Read the 'Submissions' sheet of the EpiCoV submission form (first match below parent_dir).
_epicov_form_matches = glob.glob(f'{parent_dir}/**/20210222_EpiCoV.xls', recursive=True)
df_gisaid = pd.read_excel(_epicov_form_matches[0], sheet_name='Submissions')

In [4]:
# df_gisaid.head()

In [5]:
# Header list: GISAID `covv_*` field names mixed with the raw metadata column
# names (DT_SAM_COLL, DT_SAM_RECEP, CASE_ID, GEND, AGE_YRS, S_NUM).
gisaid_header = [
    'submitter',
    'fn',
    'covv_virus_name',
    'covv_type',
    'covv_passage',
    'DT_SAM_COLL',
    'DT_SAM_RECEP',
    'CASE_ID',
    'covv_location',
    'covv_add_location',
    'covv_host',
    'covv_add_host_info',
    'covv_sampling_strategy',
    'GEND',
    'AGE_YRS',
    'covv_patient_status',
    'covv_specimen',
    'covv_outbreak',
    'covv_last_vaccinated',
    'covv_treatment',
    'covv_seq_technology',
    'covv_assembly_method',
    'covv_coverage',
    'covv_orig_lab',
    'covv_orig_lab_addr',
    'S_NUM',
    'covv_subm_lab',
    'covv_subm_lab_addr',
    'covv_subm_sample_id',
    'covv_authors',
    'covv_comment',
    'comment_type',
]

### **Pull-in the metadata**

In [6]:
# Load the cleaned results workbook and keep only the columns needed downstream.
_metadata_columns = ['CASE_ID', 'S_NUM', 'AGE_YRS', 'GEND', 'NAT',
                     'COUNT_RES', 'SAMP_TYPE', 'DT_SAM_COLL', 'DT_SAM_RECEP']
_metadata_workbook = glob.glob(f'{parent_dir}/**/COVID19-resultsCts-merged-cln.xlsx', recursive=True)[0]
df_metadata = pd.read_excel(_metadata_workbook)[_metadata_columns]

In [7]:
# Run-date stamp (day-month-year) and batch label used in the export file names,
# plus the FASTA file name written to the `fn` column.
ref_fasta = 'gisaid_all_7.fasta'
ref = 'seq39-41'
dt = f'{datetime.today():%d-%m-%Y}'

### **Be sure the next input is updated**

In [8]:
# import sequence summary data
# NOTE: the batch label and date in the file name are hard-coded and must be
# edited for every new batch.
# recursive=True is needed for `**` to span any number of directory levels
# (without it `**` behaves like a single `*`), matching the other look-ups here.
df_seq_summ = pd.read_excel(
    glob.glob(f'{parent_dir}/**/seq-summary-metadata-seq39-41_21-04-2022.xlsx', recursive=True)[0]
)
df_seq_summ.shape

(391, 12)

In [9]:
# show rows whose sequence_name repeats an earlier row (first occurrences are not flagged)
df_seq_summ.loc[df_seq_summ['sequence_name'].duplicated(keep='first')]

Unnamed: 0,case_id,sequence_name,genome_coverage,lineage,clade,run_num,tech,seq_dt,lib_prep,primer_set,analysis_pipeline,sno.


In [10]:
# filter duplicate: keep one row per sequence_name, the one with the highest
# genome_coverage (last row after an ascending sort).
# na_position='first' keeps a missing coverage value from sorting to the end and
# being picked over a real value by keep='last'.
df_seq_dedup = (
    df_seq_summ
    .sort_values(['sequence_name', 'genome_coverage'], na_position='first')
    .drop_duplicates('sequence_name', keep='last')
)
df_seq_dedup.shape#.head()

(391, 12)

In [12]:
# keep only sequences whose genome_coverage is at least 80.0
df_seq_cutoff = df_seq_dedup.loc[df_seq_dedup['genome_coverage'].ge(80.0)]
df_seq_cutoff.shape

(360, 12)

In [13]:
# restrict the sample metadata to samples present in the filtered sequence summary,
# and rename the key column so both frames share 'sequence_name'
_has_sequence = df_metadata['S_NUM'].isin(df_seq_cutoff['sequence_name'])
metadata = df_metadata.loc[_has_sequence].rename(columns={'S_NUM': 'sequence_name'})

In [14]:
# show every metadata row whose sequence_name occurs more than once (all occurrences)
metadata.loc[metadata['sequence_name'].duplicated(keep=False)]

Unnamed: 0,CASE_ID,sequence_name,AGE_YRS,GEND,NAT,COUNT_RES,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP


### **Merge metadata with seq summary data**

In [15]:
# Right join on sequence_name: every row of the sequence summary is kept, with
# sample metadata attached where a match exists.
df_seq_meta = pd.merge(metadata, df_seq_cutoff, on='sequence_name', how='right')
df_seq_meta.shape

(360, 20)

In [16]:
# show every merged row whose sequence_name occurs more than once (all occurrences)
df_seq_meta.loc[df_seq_meta['sequence_name'].duplicated(keep=False)]

Unnamed: 0,CASE_ID,sequence_name,AGE_YRS,GEND,NAT,COUNT_RES,SAMP_TYPE,DT_SAM_COLL,DT_SAM_RECEP,case_id,genome_coverage,lineage,clade,run_num,tech,seq_dt,lib_prep,primer_set,analysis_pipeline,sno.


### **Filter non-GISAID samples**

In [17]:
# Filter service samples; remain with internals only.
# Rows are dropped when sequence_name contains 'COVS' or 'SSEQ'. Two further
# filters are currently disabled: CASE_ID containing 'DRC02' and lineage
# containing 'B.1.1.529'.
_not_covs = df_seq_meta['sequence_name'].str.contains('COVS').eq(False)
df_seq_filtered1 = df_seq_meta.loc[_not_covs]

_not_sseq = df_seq_filtered1['sequence_name'].str.contains('SSEQ').eq(False)
df_seq_filtered3 = df_seq_filtered1.loc[_not_sseq]

df_seq_filtered = df_seq_filtered3
len(df_seq_filtered)

360

In [18]:
# df_seq_filtered.head()

In [19]:
# Derive the GISAID-specific columns step by step, then rename the raw metadata
# columns to their GISAID field names.
df_gisaid1 = df_seq_filtered.assign(
    covv_location='Africa / ' + df_seq_filtered['NAT'] + ' / ' + df_seq_filtered['COUNT_RES']
)
df_gisaid2 = df_gisaid1.assign(
    GEND=df_gisaid1['GEND'].replace({'M': 'Male', 'F': 'Female'})
)
df_gisaid3 = df_gisaid2.assign(
    # the collection year is appended to this name in a later cell
    covv_virus_name=df_gisaid2['sequence_name'].map(lambda x: f'hCoV-19/Kenya/ILRI_{x}/')
)
df_gisaid4 = df_gisaid3.assign(covv_subm_sample_id=df_gisaid3['sequence_name'])

_gisaid_field_names = {
    'AGE_YRS': 'covv_patient_age',
    'tech': 'covv_seq_technology',
    'sequence_name': 'covv_provider_sample_id',
    'GEND': 'covv_gender',
    'SAMP_TYPE': 'covv_specimen',
    'DT_SAM_COLL': 'covv_collection_date',
}
df_gisaid5 = df_gisaid4.rename(columns=_gisaid_field_names)


In [20]:
# display the column names of the renamed frame
df_gisaid5.columns

Index(['CASE_ID', 'covv_provider_sample_id', 'covv_patient_age', 'covv_gender',
       'NAT', 'COUNT_RES', 'covv_specimen', 'covv_collection_date',
       'DT_SAM_RECEP', 'case_id', 'genome_coverage', 'lineage', 'clade',
       'run_num', 'covv_seq_technology', 'seq_dt', 'lib_prep', 'primer_set',
       'analysis_pipeline', 'sno.', 'covv_location', 'covv_virus_name',
       'covv_subm_sample_id'],
      dtype='object')

In [21]:
# Work on a copy: the following cell adds columns to df_subf in place, and a
# plain alias would also modify df_gisaid5 (making its displayed columns stale).
df_subf = df_gisaid5.copy()

In [22]:
# assign metada variables
submitter = 'soyola'
fn = ref_fasta
# cvn = 'hCoV-19/Kenya/ILRI_'
vt = 'betacoronavirus'
cp = 'Original'
cal = 'unknown'
ch = 'Human'
cahi = 'unknown'
css = 'Surveillance'
cps = 'unknown'
cps = 'unknown'
# cs = 'NP Swab'
co = 'unknown'
clv = 'unknown'
ct = 'unknown'
# cst = 'Illumina'
cam = 'Consensus'
cc = 'unknown'
col = 'International Livestock Research Institute'
cola = 'Uthiru, Naivasha road, Nairobi-Kenya'
# cpsi = 'ILRI'
csl = 'International Livestock Research Institute'
csla = 'Uthiru, Naivasha road, Nairobi-Kenya'
# cssi = df_subf.index.tolist()
ca = 'Samuel O. Oyola, Daniel Ouso, Gilbert Kibet, Shebbar Osiany, Edward Kiritu, Paul Dobi, Collins Muli, Patrick Amoth, Vishvanath Nene, Sonal P. Henson, Edward O. Abworo'
ccomm = ''
ct = ''

In [23]:
# update gisaid columns
df_subf['submitter'] = submitter
df_subf['fn'] = fn
# df_subf['covv_virus_name'] = cvn
df_subf['covv_type'] = vt
df_subf['covv_passage'] = cp
df_subf['covv_add_location'] = cal
df_subf['covv_host'] = ch
df_subf['covv_add_host_info'] = cahi
df_subf['covv_sampling_strategy'] = css
df_subf['covv_patient_status'] = cps
# df_subf['covv_specimen'] = cs
df_subf['covv_outbreak'] = co
df_subf['covv_last_vaccinated'] = clv
df_subf['covv_treatment'] = ct
# df_subf['covv_seq_technology'] = cst
df_subf['covv_assembly_method'] = cam
df_subf['covv_coverage'] = cc
df_subf['covv_orig_lab'] = col
df_subf['covv_orig_lab_addr'] = cola
# df_subf['covv_provider_sample_id'] = cpsi
df_subf['covv_subm_lab'] = csl
df_subf['covv_subm_lab_addr'] = csla
# df_subf['covv_subm_sample_id'] = cssi
df_subf['covv_authors'] = ca
df_subf['covv_comment'] = ccomm
df_subf['comment_type'] = ct
df_subf.shape

(360, 44)

In [24]:
# drop supplementary columns that are not part of the submission sheet
_supplementary_cols = [
    'NAT', 'COUNT_RES', 'genome_coverage', 'clade', 'lineage', 'run_num',
    'seq_dt', 'lib_prep', 'primer_set', 'analysis_pipeline', 'sno.',
]
df_sub_draft1 = df_subf.drop(columns=_supplementary_cols)
df_sub_draft1.shape

(360, 33)

In [25]:
# df_sub_draft1.head()

In [26]:
# column order of the final submission sheet (applied when the form is exported)
gisaid_cols = [
    'submitter',
    'fn',
    'covv_virus_name',
    'covv_type',
    'covv_passage',
    'covv_collection_date',
    'DT_SAM_RECEP',
    'CASE_ID',
    'covv_location',
    'covv_add_location',
    'covv_host',
    'covv_add_host_info',
    'covv_sampling_strategy',
    'covv_gender',
    'covv_patient_age',
    'covv_patient_status',
    'covv_specimen',
    'covv_outbreak',
    'covv_last_vaccinated',
    'covv_treatment',
    'covv_seq_technology',
    'covv_assembly_method',
    'covv_coverage',
    'covv_orig_lab',
    'covv_orig_lab_addr',
    'covv_provider_sample_id',
    'covv_subm_lab',
    'covv_subm_lab_addr',
    'covv_subm_sample_id',
    'covv_authors',
    'covv_comment',
    'comment_type',
]


### **Format Dates**

In [27]:
# Split the draft into rows with and without a sample collection date; rows that
# have one get it formatted as a 'YYYY-MM-DD' string.
_has_coll_date = df_sub_draft1['covv_collection_date'].notna()
df_dt = df_sub_draft1.loc[_has_coll_date]
df_dt_fmt = df_dt.assign(
    covv_collection_date=df_dt['covv_collection_date'].dt.strftime('%Y-%m-%d')
)
df_missing_dt = df_sub_draft1.loc[~_has_coll_date]

In [28]:
# add short/approx. dates for where sample collection dates are missing
def _approx_collection_month(received):
    """Return the reception date as a 'YYYY-MM' string, or pd.NaT when unusable.

    `received` may be a Timestamp, NaT/NaN/None or a date-like string; anything
    that cannot be parsed as a date yields pd.NaT instead of raising.
    """
    parsed = pd.to_datetime(received, errors='coerce')
    if pd.isna(parsed):
        return pd.NaT
    return parsed.strftime('%Y-%m')


# the reception month stands in for the missing collection date
df_missing_dt_corr = df_missing_dt.assign(
    covv_collection_date=df_missing_dt['DT_SAM_RECEP'].apply(_approx_collection_month)
)

In [29]:
# stack the rows with approximate dates under the rows with full, string-formatted dates
# (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
# the original row index is kept, as before)
df_sub_draft2 = pd.concat([df_dt_fmt, df_missing_dt_corr])
df_sub_draft2.shape

(360, 33)

In [30]:
# append the collection year to the virus name: '<...>/ILRI_<id>/' -> '<...>/ILRI_<id>/<year>'
# Whatever follows the final '/' is dropped first, so re-running this cell
# replaces the year instead of appending it a second time.
_virus_name_base = df_sub_draft2['covv_virus_name'].astype(str).str.rsplit('/', n=1).str[0]
# the year is the text before the first '-' of the (string) collection date
_collection_year = df_sub_draft2['covv_collection_date'].map(lambda x: str(x).split('-')[0])
df_sub_draft2['covv_virus_name'] = _virus_name_base + '/' + _collection_year

### **Previous Submissions**

In [31]:
# import previous submissions to filter submitted samples
# date prefixes of the earlier submission workbooks; add a new entry after each submission
_prev_submission_dates = ['20210914', '20211102', '20211214', '20220110',
                          '20220204', '20220318', '20220321']


def _read_prev_submission_ids(date_prefix):
    """Read the 'covv_provider_sample_id' column of one earlier submission workbook.

    The workbook '<date_prefix>_ILRI_gisaid_submission_metadata.xls' is looked up
    in any 'Submissions' directory below `parent_dir`; the first match is read.
    """
    # recursive=True lets `**` span any number of directory levels, consistent
    # with the other file look-ups in this notebook
    workbook = glob.glob(
        f'{parent_dir}/**/Submissions/{date_prefix}_ILRI_gisaid_submission_metadata.xls',
        recursive=True,
    )[0]
    return pd.read_excel(workbook, sheet_name='Submissions', usecols=['covv_provider_sample_id'])


# merge all the previous submissions
# (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x)
df_prev_sub = pd.concat([_read_prev_submission_ids(prefix) for prefix in _prev_submission_dates])

In [32]:
# drop samples that were already submitted, fill the remaining gaps with
# 'unknown', sort by sample ID and put the columns in submission order
_already_submitted = df_sub_draft2['covv_provider_sample_id'].isin(df_prev_sub['covv_provider_sample_id'])
df_sub_draft = (
    df_sub_draft2.loc[~_already_submitted]
    .fillna('unknown')
    .sort_values('covv_provider_sample_id')
    [gisaid_cols]
)

# output directory (must already exist below parent_dir)
_gisaid_dir = glob.glob(f'{parent_dir}/Gisaid')[0]

# export finalised submission form
df_sub_draft.to_excel(f"{_gisaid_dir}/gisaid_data_{ref}_{dt}.xlsx", index=False)

# export sample IDs for retrieving and renaming fasta files
_id_name_cols = ['covv_provider_sample_id', 'covv_virus_name', 'CASE_ID']
df_sub_draft[_id_name_cols].to_csv(f"{_gisaid_dir}/gisaid_IDs-Names_{ref}_{dt}.csv", index=False, header=False)

df_sub_draft.shape

(360, 32)

In [33]:
# df_final_feedB = pd.read_excel(glob.glob(f'{parent_dir}/**/Submissions/20210914_ILRI_gisaid_submission_metadata_curated.xls')[0], sheet_name='Submissions')
# number of rows in the final sheet per covv_seq_technology value
df_sub_draft.covv_seq_technology.value_counts()

NextSeq    197
MiSeq       95
MinION      68
Name: covv_seq_technology, dtype: int64

In [34]:
# summary of the final submission frame: column names, non-null counts and dtypes
df_sub_draft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 360 entries, 0 to 359
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   submitter                360 non-null    object
 1   fn                       360 non-null    object
 2   covv_virus_name          360 non-null    object
 3   covv_type                360 non-null    object
 4   covv_passage             360 non-null    object
 5   covv_collection_date     360 non-null    object
 6   DT_SAM_RECEP             360 non-null    object
 7   CASE_ID                  360 non-null    object
 8   covv_location            360 non-null    object
 9   covv_add_location        360 non-null    object
 10  covv_host                360 non-null    object
 11  covv_add_host_info       360 non-null    object
 12  covv_sampling_strategy   360 non-null    object
 13  covv_gender              360 non-null    object
 14  covv_patient_age         360 non-null    o