## GISAID Metadata preparation

In [1]:
import pandas as pd
import numpy as np
from datetime import  datetime
import re

In [2]:
dt = datetime.today().strftime(format='%d-%m-%Y')

### Load submission form

In [3]:
parent_dir = '/home/douso/SarsGenomics/Gisaid'
df_gisaid = pd.read_excel(f'{parent_dir}/20210222_EpiCoV.xls', 'Submissions')

In [4]:
df_gisaid.head()

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,covv_location,covv_add_location,covv_host,covv_add_host_info,...,covv_coverage,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type
0,Submitter,FASTA filename,Virus name,Type,Passage details/history,Collection date,Location,Additional location information,Host,Additional host information,...,Coverage,Originating lab,Address,Sample ID given by originating laboratory,Submitting lab,Address,Sample ID given by the submitting laboratory,Authors,Comment,Comment Icon


In [5]:
df_gisaid.columns

Index(['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_collection_date', 'covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_gender', 'covv_patient_age', 'covv_patient_status',
       'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated',
       'covv_treatment', 'covv_seq_technology', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr',
       'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [6]:
gisaid_header = ['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage', 'DT_SAM_COLL', 
 'DT_SAM_RECEP', 'covv_location', 'covv_add_location','covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
 'GEND', 'AGE_YRS', 'covv_patient_status', 'covv_specimen', 'covv_outbreak', 
 'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology', 'covv_assembly_method', 'covv_coverage', 
 'covv_orig_lab', 'covv_orig_lab_addr', 'S_NUM', 'covv_subm_lab', 'covv_subm_lab_addr', 
 'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type']

### **Pull-in the metadata**

In [7]:
df_metadata = pd.read_excel('/home/douso/Documents/TrendData/Results/ResultsMerged/COVID19-resultsCts-merged-cln.xlsx')

In [8]:
#get the columns necessary
df_headers = df_metadata[['S_NUM', 'AGE_YRS', 'GEND', 'NAT', 
           'COUNT_RES',  
           'DT_SAM_COLL', 
           'DT_SAM_RECEP']]

In [9]:
df_gisaid_all = pd.read_excel('/home/douso/Gisaid/over80_id_all_nodup-2.xlsx')

In [10]:
df_gisaid_all.head

<bound method NDFrame.head of      sample_id  genome_cov
0    COVC03617        98.6
1    COVC03665        98.0
2    COVC03696        98.7
3    COVC03723        99.2
4    COVC03813        97.9
..         ...         ...
726  NHRL-S021        95.4
727  NHRL-S022        89.5
728  NHRL-S024        93.3
729  NHRL-S031        95.7
730  NHRL-S036        93.9

[731 rows x 2 columns]>

In [11]:
df_mrgl = df_gisaid_all.merge(df_headers, how='left', left_on='sample_id', right_on='S_NUM').drop('sample_id', axis=1)
df_mrgl.shape

(731, 8)

In [12]:
df_mrgl.head()

Unnamed: 0,genome_cov,S_NUM,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP
0,98.6,COVC03617,55.0,F,Kenya,Nairobi,2020-07-06,NaT
1,98.0,COVC03665,23.0,M,Kenya,Nairobi,2020-07-06,NaT
2,98.7,COVC03696,24.0,M,Kenya,Nairobi,2020-07-06,NaT
3,99.2,COVC03723,36.0,M,Kenya,Nairobi,2020-07-06,NaT
4,97.9,COVC03813,59.0,M,Kenya,Nairobi,2020-07-10,NaT


### **Be sure the next input is update**

In [13]:
df_seq_summ = pd.read_excel('/home/douso/Documents/TrendData/Results/ResultsMerged/seq-summary-metadata_07-09-2021.xlsx')
df_seq_summ.shape

(1170, 8)

In [14]:
df_seqd_int = df_seq_summ.sort_values(['sequence_name', 'genome_coverage']).drop_duplicates('sequence_name', keep='last')
df_seqd_int.shape#.head()

(1136, 8)

In [16]:
df_seqd_int.shape

(1136, 8)

In [15]:
df_seqd_int.head()

Unnamed: 0,sequence_name,clade,lineage,genome_coverage,run_num,seq_dt,lib_prep,primer_set
1,COVC00854,20C,B.1,94.9,Run7,18-05-2021,Run7,ARTIC_V3
2,COVC00867,20C,B.1.446,68.2,Run7,18-05-2021,Run7,ARTIC_V3
3,COVC00893,20C,B.1.446,84.1,Run7,18-05-2021,Run7,ARTIC_V3
4,COVC00915,,Failed,0.0,Run7,18-05-2021,Run7,ARTIC_V3
5,COVC00962,20A,B.1,92.9,Run7,18-05-2021,Run7,ARTIC_V3


In [16]:
#Merge metadata with seq summary data
df_seq_meta_on = df_mrgl.merge(df_seqd_int, how='left', left_on='S_NUM', right_on='sequence_name')
df_seq_meta_on.shape

(731, 16)

In [17]:
mask = df_seq_meta_on.duplicated('sequence_name', keep=False)
df_seq_meta_on[mask == True]

Unnamed: 0,genome_cov,S_NUM,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,sequence_name,clade,lineage,genome_coverage,run_num,seq_dt,lib_prep,primer_set


In [19]:
df_seq_meta_on.head()

Unnamed: 0,genome_cov,S_NUM,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,sequence_name,clade,lineage,genome_coverage,run_num,seq_dt,lib_prep,primer_set
0,98.6,COVC03617,55.0,F,Kenya,Nairobi,2020-07-06,NaT,COVC03617,20C,B.1,98.4,Run8,25-05-2021,Run8,ARTIC_V3
1,98.0,COVC03665,23.0,M,Kenya,Nairobi,2020-07-06,NaT,COVC03665,20C,B.1,95.5,Run8,25-05-2021,Run8,ARTIC_V3
2,98.7,COVC03696,24.0,M,Kenya,Nairobi,2020-07-06,NaT,COVC03696,20C,B.1.349,98.4,Run8,25-05-2021,Run8,ARTIC_V3
3,99.2,COVC03723,36.0,M,Kenya,Nairobi,2020-07-06,NaT,COVC03723,20C,B.1,98.9,Run8,25-05-2021,Run8,ARTIC_V3
4,97.9,COVC03813,59.0,M,Kenya,Nairobi,2020-07-10,NaT,COVC03813,20C,B.1,97.1,Run8,25-05-2021,Run8,ARTIC_V3


In [22]:
#Merge metadata with seq summary data
df_seq_metal = df_mrgl.set_index('S_NUM').merge(df_seqd_int.set_index('sequence_name'), how='left', left_index=True, right_index=True)
df_seq_metal.shape

(731, 14)

In [21]:
def get_seq_tech(x):
    x = str(x)
    if 'ONT' in x:
        return x.replace(x, 'Minion')
    elif '_NS' in x:
        return x.replace(x, 'NextSeq')
    return x.replace(x, 'MiSeq')
        

In [23]:
df_seq_meta2 = df_seq_metal.assign(covv_seq_technology = df_seq_metal['run_num']
                                  .apply(lambda x: get_seq_tech(x)))

In [24]:
df_seq_meta2.shape

(731, 15)

In [25]:
df_seq_meta2.head()

Unnamed: 0_level_0,genome_cov,AGE_YRS,GEND,NAT,COUNT_RES,DT_SAM_COLL,DT_SAM_RECEP,clade,lineage,genome_coverage,run_num,seq_dt,lib_prep,primer_set,covv_seq_technology
S_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
COVC03617,98.6,55.0,F,Kenya,Nairobi,2020-07-06,NaT,20C,B.1,98.4,Run8,25-05-2021,Run8,ARTIC_V3,MiSeq
COVC03665,98.0,23.0,M,Kenya,Nairobi,2020-07-06,NaT,20C,B.1,95.5,Run8,25-05-2021,Run8,ARTIC_V3,MiSeq
COVC03696,98.7,24.0,M,Kenya,Nairobi,2020-07-06,NaT,20C,B.1.349,98.4,Run8,25-05-2021,Run8,ARTIC_V3,MiSeq
COVC03723,99.2,36.0,M,Kenya,Nairobi,2020-07-06,NaT,20C,B.1,98.9,Run8,25-05-2021,Run8,ARTIC_V3,MiSeq
COVC03813,97.9,59.0,M,Kenya,Nairobi,2020-07-10,NaT,20C,B.1,97.1,Run8,25-05-2021,Run8,ARTIC_V3,MiSeq


In [26]:
df_seq_meta3 = df_seq_meta2.assign(covv_location = 'Africa / ' + df_seq_meta2['NAT'] + ' / ' + df_seq_meta2['COUNT_RES'])

In [27]:
df_seq_meta3.shape

(731, 16)

In [28]:
df_subf = df_seq_meta3

In [29]:
submitter = 'Soyola'
fn = 'gisaid_all.fasta'
cvn = 'hCoV-19'
vt = 'betacoronavirus'
cp = 'Original'
cal = 'unkown'
ch = 'Human'
cahi = 'unkown'
css = 'Surveillance'
cps = 'unkwon'
cps = 'unkown'
cs = 'NP Swab'
co = 'unkown'
clv = 'unkown'
ct = 'unkown'
# cst = 'Illumina'
cam = 'Consensus'
cc = 'unkown'
col = 'International Livestock Research Institute'
cola = 'Uthiru, Naivasha road, Nairobi-Kenya'
# cpsi = 'ILRI'
csl = 'International Livestock Research Institute'
csla = 'Uthiru, Naivasha road, Nairobi-Kenya'
cssi = df_subf.index.tolist()
ca = 'Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, Shebbar Osiany, Edward Kiritu, Paul Dobi, Collins Muli, Patrick Amoth, Vishvanath Nene, Sonal P. Henson, Edward O. Abworo.'
ccomm = ''
ct = ''

In [30]:
df_subf['submitter'] = submitter
df_subf['fn'] = fn
df_subf['covv_virus_name'] = cvn
df_subf['covv_type'] = vt
df_subf['covv_passage'] = cp
df_subf['covv_add_location'] = cal
df_subf['covv_host'] = ch
df_subf['covv_add_host_info'] = cahi
df_subf['covv_sampling_strategy'] = css
df_subf['covv_patient_status'] = cps
df_subf['covv_specimen'] = cs
df_subf['covv_outbreak'] = co
df_subf['covv_last_vaccinated'] = clv
df_subf['covv_treatment'] = ct
# df_subf['covv_seq_technology'] = cst
df_subf['covv_assembly_method'] = cam
df_subf['covv_coverage'] = cc
df_subf['covv_orig_lab'] = col
df_subf['covv_orig_lab_addr'] = cola
# df_subf['covv_provider_sample_id'] = cpsi
df_subf['covv_subm_lab'] = csl
df_subf['covv_subm_lab_addr'] = csla
df_subf['covv_subm_sample_id'] = cssi
df_subf['covv_authors'] = ca
df_subf['covv_comment'] = ccomm
df_subf['comment_type'] = ct
df_subf.shape

(731, 40)

In [31]:
df_subf.columns

Index(['genome_cov', 'AGE_YRS', 'GEND', 'NAT', 'COUNT_RES', 'DT_SAM_COLL',
       'DT_SAM_RECEP', 'clade', 'lineage', 'genome_coverage', 'run_num',
       'seq_dt', 'lib_prep', 'primer_set', 'covv_seq_technology',
       'covv_location', 'submitter', 'fn', 'covv_virus_name', 'covv_type',
       'covv_passage', 'covv_add_location', 'covv_host', 'covv_add_host_info',
       'covv_sampling_strategy', 'covv_patient_status', 'covv_specimen',
       'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

In [32]:
df_subf1 = df_subf.drop(['NAT', 'COUNT_RES'], axis=1)

In [33]:
df_filter80 = df_subf1[df_subf1['genome_cov'] >= 80.0]#.fillna('Unkown')

In [34]:
df_filter80.shape

(731, 38)

In [35]:
df_filter80.columns

Index(['genome_cov', 'AGE_YRS', 'GEND', 'DT_SAM_COLL', 'DT_SAM_RECEP', 'clade',
       'lineage', 'genome_coverage', 'run_num', 'seq_dt', 'lib_prep',
       'primer_set', 'covv_seq_technology', 'covv_location', 'submitter', 'fn',
       'covv_virus_name', 'covv_type', 'covv_passage', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_patient_status', 'covv_specimen', 'covv_outbreak',
       'covv_last_vaccinated', 'covv_treatment', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr', 'covv_subm_lab',
       'covv_subm_lab_addr', 'covv_subm_sample_id', 'covv_authors',
       'covv_comment', 'comment_type'],
      dtype='object')

In [36]:
df_gisaid_sub = df_filter80.reset_index().rename(columns={'index': 'S_NUM'})[gisaid_header] #.drop('DT_SAM_COLL', axis=1)

In [37]:
df_gisaid_sub.head()

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,DT_SAM_COLL,DT_SAM_RECEP,covv_location,covv_add_location,covv_host,...,covv_coverage,covv_orig_lab,covv_orig_lab_addr,S_NUM,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type
0,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06,NaT,Africa / Kenya / Nairobi,unkown,Human,...,unkown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
1,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06,NaT,Africa / Kenya / Nairobi,unkown,Human,...,unkown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
2,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06,NaT,Africa / Kenya / Nairobi,unkown,Human,...,unkown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
3,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06,NaT,Africa / Kenya / Nairobi,unkown,Human,...,unkown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,
4,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-10,NaT,Africa / Kenya / Nairobi,unkown,Human,...,unkown,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,


In [39]:
# df_gisaid_sub_srt = df_gisaid_sub.assign(DT_SAM_COLL=df_gisaid_sub.sort_values('DT_SAM_COLL')['DT_SAM_COLL'].map(lambda x: x.strftime('%Y-%m-%d')))

In [38]:
df_gisaid_sub.columns

Index(['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'DT_SAM_COLL', 'DT_SAM_RECEP', 'covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy', 'GEND',
       'AGE_YRS', 'covv_patient_status', 'covv_specimen', 'covv_outbreak',
       'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology',
       'covv_assembly_method', 'covv_coverage', 'covv_orig_lab',
       'covv_orig_lab_addr', 'S_NUM', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type'],
      dtype='object')

### **Confirm the following input file is update**

In [39]:
#DT_SAM_RECEP
gisaid_cols = ['submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage',
       'covv_collection_date', 'DT_SAM_RECEP','covv_location', 'covv_add_location',
       'covv_host', 'covv_add_host_info', 'covv_sampling_strategy',
       'covv_gender', 'covv_patient_age', 'covv_patient_status',
       'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated',
       'covv_treatment', 'covv_seq_technology', 'covv_assembly_method',
       'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr',
       'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr',
       'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type']


In [40]:
df_gisaid_sub.columns = gisaid_cols

In [41]:
df_dt = df_gisaid_sub[df_gisaid_sub['covv_collection_date'].isna() == False]
df_missing_dt = df_gisaid_sub[df_gisaid_sub['covv_collection_date'].isna() == True]

In [42]:
df_missing_dt_corr = df_missing_dt.assign(covv_collection_date=df_missing_dt['DT_SAM_RECEP'].apply(lambda x: pd.Timestamp(x).strftime('%Y-%m') if (isinstance(x, pd.Timestamp)) else x.replace(x, pd.NaT)))

In [43]:
df_gisaid_fin = df_dt.append(df_missing_dt_corr)

In [44]:
df_gisaid_fin.shape

(731, 31)

In [75]:
df_gisaid_fin.fillna('unkown').to_excel(f'/home/douso/Documents/TrendData/Results/ResultsMerged/gisaid_data_{dt}.xlsx', index=False)

In [90]:
final = pd.read_excel('/home/douso/SarsGenomics/Gisaid/20210914_ILRI_gisaid_submission_metadata.xls', sheet_name='Submissions')
# final.columns
final['covv_seq_technology'].value_counts()

MiSeq                    400
NextSeq                  263
Minion                    68
Sequencing technology      1
Name: covv_seq_technology, dtype: int64

In [91]:
# final['covv_collection_date'].apply(lambda x: x.strftime('%Y-%m-%d'))

AttributeError: 'str' object has no attribute 'strftime'

In [89]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   submitter                732 non-null    object
 1   fn                       732 non-null    object
 2   covv_virus_name          732 non-null    object
 3   covv_type                732 non-null    object
 4   covv_passage             732 non-null    object
 5   covv_collection_date     732 non-null    object
 6   covv_location            732 non-null    object
 7   covv_add_location        732 non-null    object
 8   covv_host                732 non-null    object
 9   covv_add_host_info       732 non-null    object
 10  covv_sampling_strategy   732 non-null    object
 11  covv_gender              732 non-null    object
 12  covv_patient_age         732 non-null    object
 13  covv_patient_status      732 non-null    object
 14  covv_specimen            732 non-null    o

In [64]:
ill = pd.read_csv('/home/douso/Gisaid/over80_id_ill_nodup.csv', names=['sample_id'])
ill.head()

Unnamed: 0,sample_id
0,COVC03617
1,COVC03665
2,COVC03696
3,COVC03723
4,COVC03813


In [65]:
ill.shape

(650, 1)

In [66]:
df_gisaid_fin.merge(ill, how='inner', left_on='covv_provider_sample_id', right_on='sample_id')#.shape

Unnamed: 0,submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,DT_SAM_RECEP,covv_location,covv_add_location,covv_host,...,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type,sample_id
0,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03617,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03617
1,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03665,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03665
2,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03696,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03696
3,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-06 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03723,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03723
4,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2020-07-10 00:00:00,NaT,Africa/Kenya/Nairobi,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",COVC03813,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,COVC03813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL-12773,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL-12773,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL-12773
646,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL1,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL1,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL1
647,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL2,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL2,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL2
648,Soyola,gisaid_all.fasta,hCoV-19,betacoronavirus,Original,2021-04,2021-04-07,,Unkown,Human,...,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL3,International Livestock Research Institute,"Uthiru, Naivasha road, Nairobi-Kenya",NPHL3,"Samuel O. Oyola, Gilbert Kibet, Daniel Ouso, S...",,,NPHL3


In [56]:
ont = pd.read_csv('/home/douso/Gisaid/over80_id_ont_nodup.csv', names=['sample_id'])
ont.headd()

Unnamed: 0,sample_id
0,COVC14674
1,COVC14676
2,COVC14699
3,COVC14753
4,COVC14784


In [58]:
ont.shape

(66, 1)

In [None]:
df_gisaid_fin.merge(ont, how='left', left_on='covv_provider_sample_id', right_on='sample_id')

In [49]:
l =list(df_mrgl['S_NUM'])
df_filter = df_gisaid_fin[df_gisaid_fin['covv_subm_sample_id'].isin(l) == True]
df_filter.shape

(716, 31)

In [67]:
df_nxt_ill = pd.read_csv('/home/douso/Gisaid/Illumina/transposed_report.tsv', sep='\t')

In [68]:
df_nxt_ill.head(12)

Unnamed: 0,Assembly,# contigs (>= 0 bp),# contigs (>= 1000 bp),# contigs (>= 5000 bp),# contigs (>= 10000 bp),# contigs (>= 25000 bp),# contigs (>= 50000 bp),Total length (>= 0 bp),Total length (>= 1000 bp),Total length (>= 5000 bp),...,Largest alignment,Total aligned length,NA50,NGA50,NA75,NGA75,LA50,LGA50,LA75,LGA75
0,COCV11310_S6.consensus,1,1,1,1,1,0,29903,29903,29903,...,17906,17906,17906,17906,-,-,1,1,-,-
1,COVC03617_S74.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1
2,COVC03665_S75.consensus,1,1,1,1,1,0,29903,29903,29903,...,29300,29300,29300,29300,29300,29300,1,1,1,1
3,COVC03696_S76.consensus,1,1,1,1,1,0,29903,29903,29903,...,29511,29511,29511,29511,29511,29511,1,1,1,1
4,COVC03723_S77.consensus,1,1,1,1,1,0,29903,29903,29903,...,29667,29667,29667,29667,29667,29667,1,1,1,1
5,COVC03813_S78.consensus,1,1,1,1,1,0,29903,29903,29903,...,29275,29275,29275,29275,29275,29275,1,1,1,1
6,COVC03955_S79.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1
7,COVC03960_S80.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1
8,COVC03965_S81.consensus,1,1,1,1,1,0,29903,29903,29903,...,29275,29275,29275,29275,29275,29275,1,1,1,1
9,COVC03977_S82.consensus,1,1,1,1,1,0,29903,29903,29903,...,29487,29487,29487,29487,29487,29487,1,1,1,1


In [69]:
df_nxt_ill.columns

Index(['Assembly', '# contigs (>= 0 bp)', '# contigs (>= 1000 bp)',
       '# contigs (>= 5000 bp)', '# contigs (>= 10000 bp)',
       '# contigs (>= 25000 bp)', '# contigs (>= 50000 bp)',
       'Total length (>= 0 bp)', 'Total length (>= 1000 bp)',
       'Total length (>= 5000 bp)', 'Total length (>= 10000 bp)',
       'Total length (>= 25000 bp)', 'Total length (>= 50000 bp)', '# contigs',
       'Largest contig', 'Total length', 'Reference length', 'GC (%)',
       'Reference GC (%)', 'N50', 'NG50', 'N75', 'NG75', 'L50', 'LG50', 'L75',
       'LG75', '# misassemblies', '# misassembled contigs',
       'Misassembled contigs length', '# local misassemblies',
       '# scaffold gap ext. mis.', '# scaffold gap loc. mis.',
       '# unaligned mis. contigs', '# unaligned contigs', 'Unaligned length',
       'Genome fraction (%)', 'Duplication ratio', '# N's per 100 kbp',
       '# mismatches per 100 kbp', '# indels per 100 kbp',
       '# genomic features', 'Largest alignment', 'Total a

In [70]:
df_nxt_ill_cov = df_nxt_ill[['Assembly', 'Genome fraction (%)']]

In [71]:
df_nxt_ill_cov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Assembly             952 non-null    object
 1   Genome fraction (%)  952 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


In [72]:
df_nxt_ill_cov.to_excel('/home/douso/Gisaid/df_nxt_ill_cov.xlsx')

In [198]:
def cov_names(x):
    x = x.upper().replace(' ', '')
    if len(x) == 8:
        x = x.replace('COVC', 'COVC0')
    elif len(x) == 7:
        x = x.replace('COVC', 'COVC00')
    elif len(x) == 6:
        x = x.replace('COVC', 'COVC000')
    elif len(x) == 5:
        x = x.replace('COVC', 'COVC0000')
    return x

In [73]:
def clean(x):
    if float(x) and x != '-':
        return round(float(x), 1)
    else:
        return np.NaN

In [76]:
p = re.compile("_S[1-9]{0,3}\w.*")
new_ass = df_nxt_ill_cov['Assembly'].apply(lambda x: p.sub('', x))
new_gen_cov = df_nxt_ill_cov['Genome fraction (%)'].replace('-', np.NaN).apply(lambda x: clean(x))

# p.findall('NPHL1_S29_S13_S27_S30.consensus')

In [77]:
new_ass

0       COCV11310
1       COVC03617
2       COVC03665
3       COVC03696
4       COVC03723
          ...    
947    NPHL_12780
948         NPHL1
949         NPHL2
950         NPHL3
951         NPHL4
Name: Assembly, Length: 952, dtype: object

In [78]:
new_gen_cov

0      59.9
1      98.6
2      98.0
3      98.7
4      99.2
       ... 
947    26.9
948    99.0
949    74.7
950    98.1
951    61.0
Name: Genome fraction (%), Length: 952, dtype: float64

In [79]:
df_nxt_new = pd.DataFrame({'sample_id': new_ass, 'genome_cov': new_gen_cov})

In [80]:
df_nxt_new[df_nxt_new['genome_cov'] >= 80.0].to_excel('/home/douso/Gisaid/Illumina/over80_id_ill-2.xlsx')

In [81]:
df_nxt_ont = pd.read_csv('/home/douso/Gisaid/ONT/transposed_report.tsv', sep='\t')

In [89]:
df_nxt_ont_cov = df_nxt_ont[['Assembly', 'Genome fraction (%)']]

In [90]:
p = re.compile("_S[1-9]{0,3}\w.*")
new_ass = df_nxt_ont_cov['Assembly'].apply(lambda x: p.sub('', x))
new_gen_cov = df_nxt_ont_cov['Genome fraction (%)'].replace('-', np.NaN).apply(lambda x: clean(x))

# p.findall('NPHL1_S29_S13_S27_S30.consensus')

In [91]:
new_ass

0     COVC14674.consensus
1     COVC14676.consensus
2     COVC14699.consensus
3     COVC14753.consensus
4     COVC14784.consensus
             ...         
91    NHRL_N030.consensus
92    NHRL_N031.consensus
93    NHRL_N033.consensus
94    NHRL_N034.consensus
95    NHRL_N036.consensus
Name: Assembly, Length: 96, dtype: object

In [92]:
new_gen_cov

0     96.1
1     96.1
2     95.3
3     80.2
4     94.6
      ... 
91    15.6
92    95.7
93    11.8
94    16.4
95    93.9
Name: Genome fraction (%), Length: 96, dtype: float64

In [93]:
df_nxt_new = pd.DataFrame({'sample_id': new_ass, 'genome_cov': new_gen_cov})

In [94]:
df_nxt_new[df_nxt_new['genome_cov'] >= 80.0].to_excel('/home/douso/Gisaid/ONT/over80_id_ont-2.xlsx')