## GISAID Metadata preparation

In [1]:
import pandas as pd
import numpy as np
from datetime import  datetime

In [2]:
# Today's date as day-month-year; used as the suffix of the exported file name.
dt = f"{datetime.today():%d-%m-%Y}"

### Load submission form

In [3]:
# Directory that holds the GISAID EpiCoV submission workbook.
# NOTE(review): absolute, user-specific path; the notebook only runs on a machine with this layout.
parent_dir = '/home/douso/SarsGenomics/Gisaid'
# Load the 'Submissions' sheet of the workbook.
df_gisaid = pd.read_excel(f'{parent_dir}/20210222_EpiCoV.xls', sheet_name='Submissions')

In [4]:
# Preview the template sheet; its first rows hold field labels and example values rather than sample data.
df_gisaid.head()

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,covv_location,covv_add_location,covv_host,covv_add_host_info,...,covv_coverage,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type
0,Submitter,FASTA filename,Virus name,Type,Passage details/history,Collection date,Location,Additional location information,Host,Additional host information,...,Coverage,Originating lab,Address,Sample ID given by originating laboratory,Submitting lab,Address,Sample ID given by the submitting laboratory,Authors,Comment,Comment Icon
1,GISAID username,all_sequences.fasta,hCoV-19/Country/Identifier/2020,betacoronavirus,"e.g. Original, Vero",2020-03-02,e.g. Continent / Country / Region,"e.g. Cruise Ship, Convention, Live animal market","e.g. Human, Animal, Environment, Laboratory de...",e.g. Patient infected while traveling in ….,...,"e.g. 70x, 1,000x, 10,000x (average)",Where the clinical specimen or virus isolate w...,,,Where sequence data have been generated and su...,,,"e.g. Jane Doe, John Doe",,


In [5]:
# List the column names of the template sheet.
df_gisaid.columns

Index(['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_collection_date', 'covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_gender', 'covv_patient_age', 'covv_patient_status',
       'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated',
       'covv_treatment', 'covv_seq_technology', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr',
       'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [6]:
# Column selection and order for the submission table, expressed with the names
# available in the working frame. DT_SAM_COLL, GEND, AGE_YRS and S_NUM occupy the
# positions of covv_collection_date, covv_gender, covv_patient_age and
# covv_provider_sample_id; DT_SAM_RECEP is an extra column that the template
# sheet does not have.
gisaid_header = [
    'submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
    'DT_SAM_COLL', 'DT_SAM_RECEP',
    'covv_location', 'covv_add_location',
    'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
    'GEND', 'AGE_YRS', 'covv_patient_status',
    'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment',
    'covv_seq_technology', 'covv_assembly_method', 'covv_coverage',
    'covv_orig_lab', 'covv_orig_lab_addr', 'S_NUM',
    'covv_subm_lab', 'covv_subm_lab_addr', 'covv_subm_sample_id',
    'covv_authors', 'covv_comment', 'comment_type',
]

### **Pull in the metadata**

In [7]:
# Workbook with the merged, cleaned results for positive samples (per its file name).
metadata_xlsx = '/home/douso/Documents/TrendData/Results/ResultsMerged/COVID19-results-merged-cln-pos.xlsx'
df_metadata = pd.read_excel(metadata_xlsx)

In [8]:
# Keep only the metadata columns used later: S_NUM (join key), AGE_YRS, GEND,
# NAT and COUNT_RES (combined into the location string), and the two
# DT_SAM_* date columns.
df_headers = df_metadata.loc[
    :,
    ['S_NUM', 'AGE_YRS', 'GEND', 'NAT', 'COUNT_RES', 'DT_SAM_COLL', 'DT_SAM_RECEP'],
]

### **Be sure the next input is up to date**

In [9]:
# Sequencing summary workbook; the file name carries a date stamp, so point this
# at the most recent export before running.
seq_summary_xlsx = '/home/douso/Documents/TrendData/Results/ResultsMerged/seq-summary-metadata_07-09-2021.xlsx'
df_seq_summ = pd.read_excel(seq_summary_xlsx)

In [10]:
# Preview the first rows of the sequencing summary.
df_seq_summ.head()

Unnamed: 0,sequence_name,clade,lineage,genome_coverage,run_num,seq_dt,lib_prep,primer_set
0,COVC11310,20C,B.1,59.3,Run1,09-02-2021,Nextera_XT,ARTIC_V1
1,COVC00854,20C,B.1,94.9,Run7,18-05-2021,Run7,ARTIC_V3
2,COVC00867,20C,B.1.446,68.2,Run7,18-05-2021,Run7,ARTIC_V3
3,COVC00893,20C,B.1.446,84.1,Run7,18-05-2021,Run7,ARTIC_V3
4,COVC00915,,Failed,0.0,Run7,18-05-2021,Run7,ARTIC_V3


In [11]:
# Filter service samples; remain with internals only.
# Rows whose sequence_name contains 'KEM' are removed. na=True treats a missing
# name as a match, so those rows are removed as well.
df_seqd_int = df_seq_summ[~df_seq_summ['sequence_name'].str.contains('KEM', na=True)]

In [12]:
# Row and column count after removing the 'KEM' samples.
df_seqd_int.shape

(933, 8)

In [13]:
# Merge metadata with seq summary data.
# Left join on the sample identifier (sequence_name on the left, S_NUM on the
# right, both moved to the index): every sequenced sample is kept and gains the
# metadata columns where an identifier matches.
# NOTE(review): an index-on-index join repeats rows when an identifier occurs
# more than once on either side; confirm the identifiers are unique.
df_seq_meta = df_seqd_int.set_index('sequence_name').merge(df_headers.set_index('S_NUM'), how='left', left_index=True, right_index=True)

In [14]:
def get_seq_tech(x):
    """Return the sequencing platform name for a run label.

    Parameters
    ----------
    x : str
        Run label to classify (the notebook passes ``run_num`` values).

    Returns
    -------
    str
        ``'Minion'`` when the label contains ``'ONT'``, ``'NextSeq'`` when it
        contains ``'_NS'``, and ``'MiSeq'`` for every other label. ``'ONT'``
        is checked first, so it wins when both markers are present.

    Raises
    ------
    TypeError
        If ``x`` does not support the ``in`` operator (e.g. a missing value
        stored as a float NaN).
    """
    if 'ONT' in x:
        return 'Minion'
    if '_NS' in x:
        return 'NextSeq'
    return 'MiSeq'
        

In [15]:
# Add covv_seq_technology by classifying each run label with get_seq_tech.
df_seq_meta2 = df_seq_meta.assign(
    covv_seq_technology=df_seq_meta['run_num'].map(get_seq_tech)
)

In [16]:
# Preview the merged frame, now including covv_seq_technology.
df_seq_meta2.head()

Unnamed: 0,clade,lineage,genome_coverage,run_num,seq_dt,lib_prep,primer_set,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,covv_seq_technology
COVC00854,20C,B.1,94.9,Run7,18-05-2021,Run7,ARTIC_V3,23.0,F,Kenya,Nairobi,2020-06-11,NaT,MiSeq
COVC00867,20C,B.1.446,68.2,Run7,18-05-2021,Run7,ARTIC_V3,32.0,F,Kenya,Nairobi,2020-06-10,NaT,MiSeq
COVC00893,20C,B.1.446,84.1,Run7,18-05-2021,Run7,ARTIC_V3,31.0,M,Kenya,Nairobi,2020-06-10,NaT,MiSeq
COVC00915,,Failed,0.0,Run7,18-05-2021,Run7,ARTIC_V3,22.0,M,Kenya,Nairobi,2020-06-10,NaT,MiSeq
COVC00962,20A,B.1,92.9,Run7,18-05-2021,Run7,ARTIC_V3,31.0,M,Kenya,Nairobi,2020-06-08,2020-06-14,MiSeq


In [17]:
# Build the location string as 'Africa/<NAT>/<COUNT_RES>'. A missing NAT or
# COUNT_RES leaves the location missing for that row.
df_seq_meta3 = df_seq_meta2.assign(
    covv_location=lambda frame: 'Africa/' + frame['NAT'] + '/' + frame['COUNT_RES']
)

In [18]:
# Work on an independent copy: the columns assigned to df_subf afterwards would
# otherwise be written into df_seq_meta3 as well, because a plain assignment
# only creates a second name for the same frame.
df_subf = df_seq_meta3.copy()

In [19]:
# Constant values written to every row of the submission table. The trailing
# comment on each line names the column that receives the value.
# NOTE(review): 'Unkown' looks like a misspelling of 'Unknown'. The same
# spelling is used as the fill value at export, so correct both together.
submitter = ''  # submitter
fn = ''  # fn
cvn = 'hCoV-19'  # covv_virus_name
vt = 'betacoronavirus'  # covv_type
cp = 'Original'  # covv_passage
cal = 'Unkown'  # covv_add_location
ch = 'Human'  # covv_host
cahi = 'Unkown'  # covv_add_host_info
css = 'Surveillance'  # covv_sampling_strategy
cps = 'Unkown'  # covv_patient_status
cs = 'NP Swab'  # covv_specimen
co = 'Unkown'  # covv_outbreak
clv = 'Unkown'  # covv_last_vaccinated
# NOTE(review): `ct` is assigned again on the last line of this cell, so this
# value is overwritten before any later cell can read it.
ct = 'Unkown'
# cst = 'Illumina'
cam = 'Consensus'  # covv_assembly_method
cc = ''  # covv_coverage
col = 'International Livestock Research Institute'  # covv_orig_lab
cola = 'Uthiru, Naivasha road, Nairobi-Kenya'  # covv_orig_lab_addr
# cpsi = 'ILRI'
csl = 'International Livestock Research Institute'  # covv_subm_lab
csla = 'Uthiru, Naivasha road, Nairobi-Kenya'  # covv_subm_lab_addr
cssi = df_subf.index.tolist()  # covv_subm_sample_id: one entry per row, taken from the index
ca = ''  # covv_authors
ccomm = ''  # covv_comment
ct = ''  # comment_type

In [20]:
# Write the constant submission fields onto every row of df_subf.
df_subf['submitter'] = submitter
df_subf['fn'] = fn
df_subf['covv_virus_name'] = cvn
df_subf['covv_type'] = vt
df_subf['covv_passage'] = cp
df_subf['covv_add_location'] = cal
df_subf['covv_host'] = ch
df_subf['covv_add_host_info'] = cahi
df_subf['covv_sampling_strategy'] = css
df_subf['covv_patient_status'] = cps
df_subf['covv_specimen'] = cs
df_subf['covv_outbreak'] = co
df_subf['covv_last_vaccinated'] = clv
# `ct` holds the comment-type value ('') by the time this cell runs, because it
# is reassigned at the end of the constants cell. The treatment placeholder is
# therefore written explicitly instead of through `ct`.
df_subf['covv_treatment'] = 'Unkown'
# df_subf['covv_seq_technology'] = cst
df_subf['covv_assembly_method'] = cam
df_subf['covv_coverage'] = cc
df_subf['covv_orig_lab'] = col
df_subf['covv_orig_lab_addr'] = cola
# df_subf['covv_provider_sample_id'] = cpsi
df_subf['covv_subm_lab'] = csl
df_subf['covv_subm_lab_addr'] = csla
df_subf['covv_subm_sample_id'] = cssi
df_subf['covv_authors'] = ca
df_subf['covv_comment'] = ccomm
df_subf['comment_type'] = ct
df_subf.shape

(933, 39)

In [21]:
# List the columns after the constant submission fields were added.
df_subf.columns

Index(['clade', 'lineage', 'genome_coverage', 'run_num', 'seq_dt', 'lib_prep',
       'primer_set', 'AGE_YRS', 'GEND', 'NAT', 'COUNT_RES', 'DT_SAM_COLL',
       'DT_SAM_RECEP', 'covv_seq_technology', 'covv_location', 'submitter',
       'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_add_location', 'covv_host', 'covv_add_host_info',
       'covv_sampling_strategy', 'covv_patient_status', 'covv_specimen',
       'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [22]:
# NAT and COUNT_RES are already folded into covv_location, so drop them.
df_subf1 = df_subf.drop(columns=['NAT', 'COUNT_RES'])

In [23]:
# Keep only rows with genome_coverage of at least 80.0; rows with a missing
# coverage value are dropped by the comparison.
df_filter80 = df_subf1.loc[df_subf1['genome_coverage'].ge(80.0)]

In [24]:
# List the columns of the coverage-filtered frame.
df_filter80.columns

Index(['clade', 'lineage', 'genome_coverage', 'run_num', 'seq_dt', 'lib_prep',
       'primer_set', 'AGE_YRS', 'GEND', 'DT_SAM_COLL', 'DT_SAM_RECEP',
       'covv_seq_technology', 'covv_location', 'submitter', 'fn',
       'covv_virus_name', 'covv_type', 'covv_passage', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_patient_status', 'covv_specimen', 'covv_outbreak',
       'covv_last_vaccinated', 'covv_treatment', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr', 'covv_subm_lab',
       'covv_subm_lab_addr', 'covv_subm_sample_id', 'covv_authors',
       'covv_comment', 'comment_type'],
      dtype='object')

In [25]:
# Turn the sample-ID index into an 'S_NUM' column, then select and order the
# columns as listed in gisaid_header. Naming the index before reset_index()
# gives the 'S_NUM' column whether or not the index already carries a name.
df_gisaid_sub = df_filter80.rename_axis('S_NUM').reset_index()[gisaid_header]

In [26]:
# Preview the frame restricted to the gisaid_header columns.
df_gisaid_sub.head()

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,DT_SAM_COLL,DT_SAM_RECEP,covv_location,covv_add_location,covv_host,...,covv_coverage,covv_orig_lab,covv_orig_lab_addr,S_NUM,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type
0,,,hCoV-19,betacoronavirus,Original,2020-06-11,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00854,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00854,,,
1,,,hCoV-19,betacoronavirus,Original,2020-06-10,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00893,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00893,,,
2,,,hCoV-19,betacoronavirus,Original,2020-06-08,2020-06-14,Africa/Kenya/Nairobi,Unkown,Human,...,,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00962,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00962,,,
3,,,hCoV-19,betacoronavirus,Original,2020-06-08,2020-06-14,Africa/Kenya/Nairobi,Unkown,Human,...,,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00987,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00987,,,
4,,,hCoV-19,betacoronavirus,Original,2020-06-08,2020-06-14,Africa/Kenya/Nairobi,Unkown,Human,...,,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00998,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC00998,,,


In [27]:
# df_gisaid_sub_srt = df_gisaid_sub.assign(DT_SAM_COLL=df_gisaid_sub.sort_values('DT_SAM_COLL')['DT_SAM_COLL'].map(lambda x: x.strftime('%Y-%m-%d')))

In [28]:
# Show the current column names of the submission frame.
df_gisaid_sub.columns

Index(['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'DT_SAM_COLL', 'DT_SAM_RECEP', 'covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy', 'GEND',
       'AGE_YRS', 'covv_patient_status', 'covv_specimen', 'covv_outbreak',
       'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'S_NUM', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

### **Confirm the following input file is up to date**

In [29]:
# Final column labels, in the same order as gisaid_header. They match the
# template sheet's column names, except that DT_SAM_RECEP is kept as an extra
# column right after covv_collection_date.
gisaid_cols = [
    'submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
    'covv_collection_date', 'DT_SAM_RECEP',
    'covv_location', 'covv_add_location',
    'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
    'covv_gender', 'covv_patient_age', 'covv_patient_status',
    'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment',
    'covv_seq_technology', 'covv_assembly_method', 'covv_coverage',
    'covv_orig_lab', 'covv_orig_lab_addr', 'covv_provider_sample_id',
    'covv_subm_lab', 'covv_subm_lab_addr', 'covv_subm_sample_id',
    'covv_authors', 'covv_comment', 'comment_type',
]


In [30]:
# Relabel the columns by position: the i-th column takes the i-th name in
# gisaid_cols, so that list must have one entry per column and be in the same
# order as the frame's current columns.
df_gisaid_sub.columns = gisaid_cols

In [31]:
# Split the rows by whether a collection date is present.
df_dt = df_gisaid_sub[df_gisaid_sub['covv_collection_date'].notna()]
df_missing_dt = df_gisaid_sub[df_gisaid_sub['covv_collection_date'].isna()]

In [32]:
# Rows without a collection date get the four-digit year of the receipt date
# instead. Rows whose receipt date is missing (or cannot be read as a date)
# keep a missing collection date.
df_missing_dt_corr = df_missing_dt.assign(
    covv_collection_date=lambda frame: pd.to_datetime(
        frame['DT_SAM_RECEP'], errors='coerce'
    ).dt.strftime('%Y')
)

In [33]:
# Stack the rows that had a collection date on top of the corrected rows.
# pd.concat keeps each row's original index label, as DataFrame.append did;
# DataFrame.append itself is no longer available in pandas 2.x.
df_gisaid_fin = pd.concat([df_dt, df_missing_dt_corr])

In [34]:
# Write the final table to a date-stamped workbook, replacing every remaining
# missing value with the placeholder string and leaving out the index.
gisaid_xlsx = f'/home/douso/Documents/TrendData/Results/ResultsMerged/gisaid_data_{dt}.xlsx'
df_gisaid_fin.fillna('Unkown').to_excel(gisaid_xlsx, index=False)

In [35]:
# Summarise dtypes and non-null counts of the final table (before the fill applied at export).
df_gisaid_fin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 608 entries, 0 to 607
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   submitter                608 non-null    object        
 1   fn                       608 non-null    object        
 2   covv_virus_name          608 non-null    object        
 3   covv_type                608 non-null    object        
 4   covv_passage             608 non-null    object        
 5   covv_collection_date     603 non-null    object        
 6   DT_SAM_RECEP             523 non-null    datetime64[ns]
 7   covv_location            597 non-null    object        
 8   covv_add_location        608 non-null    object        
 9   covv_host                608 non-null    object        
 10  covv_add_host_info       608 non-null    object        
 11  covv_sampling_strategy   608 non-null    object        
 12  covv_gender              598 non-nul