# GISAID Metadata - Preprocessing
## Conversion from JSON to TSV

In [4]:
import json

# Show available columns
with open('../../data/gisaid.json') as f:
    print(json.loads(f.readline()).keys())

dict_keys(['covv_accession_id', 'covv_clade', 'covv_collection_date', 'covsurver_prot_mutations', 'covsurver_uniquemutlist', 'covv_orig_lab', 'covv_passage', 'covv_subm_lab', 'gc_content', 'covv_host', 'is_high_coverage', 'is_reference', 'is_complete', 'covv_lineage', 'pangolin_lineages_version', 'covv_location', 'n_content', 'sequence_length', 'covv_subm_date', 'covv_type', 'covv_variant'])


In [1]:
with open('../../data/gisaid.json', 'r') as json_file:
    print(sum(1 for line in json_file))  # 8,102,980 samples found

8102980


In [7]:
import json
import csv

error_count = 0

with open('../../data/gisaid.json', 'r', encoding='utf-8') as json_file:
  with open('../../data/gisaid_metadata.tsv', 'w') as tsv_file:
    for i, line in enumerate(json_file):
      # Read one key-value object per line
      obj = json.loads(line)

      # For first object, take keys as header for TSV
      if i == 0:
        dw = csv.DictWriter(tsv_file, obj.keys(), delimiter='\t')
        dw.writeheader()

      try:
        # For each object, append its values as new row to TSV
        dw.writerow(obj)
      except UnicodeEncodeError:
        # If UnicodeEncodeError occurs, ignore row
        error_count += 1
        continue

print(error_count)  # -> 50,763 rows have thrown UnicodeEncodeError

50763


In [12]:
import dask.dataframe as dd
import pandas as pd

gisaid_metadata = dd.read_csv('../../data/gisaid_metadata.tsv', encoding='iso-8859-1', delimiter='\t', dtype = {'covv_type': str})
gisaid_metadata = gisaid_metadata.compute()
gisaid_metadata.info()

  path_info,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8052217 entries, 0 to 52179
Data columns (total 21 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   covv_accession_id          object 
 1   covv_clade                 object 
 2   covv_collection_date       object 
 3   covsurver_prot_mutations   object 
 4   covsurver_uniquemutlist    object 
 5   covv_orig_lab              object 
 6   covv_passage               object 
 7   covv_subm_lab              object 
 8   gc_content                 float64
 9   covv_host                  object 
 10  is_high_coverage           bool   
 11  is_reference               bool   
 12  is_complete                bool   
 13  covv_lineage               object 
 14  pangolin_lineages_version  object 
 15  covv_location              object 
 16  n_content                  float64
 17  sequence_length            int64  
 18  covv_subm_date             object 
 19  covv_type                  object 
 20  covv

In [13]:
len(gisaid_metadata)
# -> 8,052,217 rows
# -> further 50,763 rows missing due to UnicodeEncodeError

8052217

In [14]:
gisaid_metadata.head()

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,gc_content,covv_host,...,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,n_content,sequence_length,covv_subm_date,covv_type,covv_variant
0,EPI_ISL_4087268,GK,2021-09-02,"(M_I82T,NSP12_P323L,Spike_P681R,Spike_R158del,...",,Lighthouse Lab in Alderley Park,Original,Wellcome Sanger Institute for the COVID-19 Gen...,0.379603,Human,...,False,True,AY.4,2022-02-02,Europe / United Kingdom / England,0.0,29760,2021-09-11,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...
1,EPI_ISL_1169235,GRY,2021-02-27,"(Spike_V70del,NSP12_P323L,Spike_D1118H,NSP3_T1...",,ASL Napoli 1 Centro,Original,AMES Centro Polidiagnostico Strumentale S.r.l.,0.379672,Human,...,False,True,B.1.1.7,2022-02-02,Europe / Italy / Campania / Napoli,0.008731,29666,2021-03-06,betacoronavirus,VOC Alpha GRY (B.1.1.7+Q.*) first detected in ...
2,EPI_ISL_6063368,GK,2021-10-05,"(M_I82T,NSP12_P323L,Spike_P681R,Spike_R158del,...",,Aegis Sciences Corporation,Original,Centers for Disease Control and Prevention D...,0.379456,Human,...,False,True,AY.100,2022-02-02,North America / USA / Texas,0.000134,29823,2021-11-08,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...
3,EPI_ISL_6928247,GK,2021-11-20,"(M_I82T,NSP12_P323L,Spike_P681R,NSP14_A344S,Sp...",,Lighthouse Lab in Alderley Park,Original,Wellcome Sanger Institute for the COVID-19 Gen...,0.379375,Human,...,False,True,AY.43,2022-02-02,Europe / United Kingdom / England,0.060853,29449,2021-12-01,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...
4,EPI_ISL_8148179,GK,2021-12-06,"(NSP3_K839R,M_I82T,NS7a_S83L,NSP12_P323L,Spike...",,CDPH VBL,Original,California Department of Public Health,0.37973,Human,...,False,True,AY.103,2022-02-02,North America / USA / California / Alameda County,0.023005,29863,2021-12-28,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...


In [15]:
gisaid_metadata.tail()

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,gc_content,covv_host,...,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,n_content,sequence_length,covv_subm_date,covv_type,covv_variant
52175,EPI_ISL_4085782,GK,2021-09-03,"(M_I82T,NSP12_P323L,Spike_P681R,Spike_R158del,...",,Berkshire and Surrey Pathology Services Lighth...,Original,Wellcome Sanger Institute for the COVID-19 Gen...,0.379623,Human,...,False,True,AY.4.2,2022-02-02,Europe / United Kingdom / England,0.0,29769,2021-09-11,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...
52176,EPI_ISL_2451549,G,2021-05-23,"(NSP12_P323L,Spike_P26S,Spike_D138Y,N_P80R,Spi...",,Quest Diagnostics Incorporated,Original,Centers for Disease Control and Prevention D...,0.379687,Human,...,False,True,P.1,2022-02-02,North America / USA / Arizona,0.00937,29777,2021-06-08,betacoronavirus,VOC Gamma GR/501Y.V3 (P.1+P.1.*) first detecte...
52177,EPI_ISL_4085783,GK,2021-09-03,"(M_I82T,NSP12_P323L,Spike_P681R,NSP12_T26I,N_D...",,Berkshire and Surrey Pathology Services Lighth...,Original,Wellcome Sanger Institute for the COVID-19 Gen...,0.380545,Human,...,False,True,AY.4,2022-02-02,Europe / United Kingdom / England,0.06949,29775,2021-09-11,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...
52178,EPI_ISL_4085784,GK,2021-09-02,"(M_I82T,NSP12_D879Y,NSP12_P323L,Spike_P681R,Sp...",,Berkshire and Surrey Pathology Services Lighth...,Original,Wellcome Sanger Institute for the COVID-19 Gen...,0.37959,Human,...,False,True,AY.4,2022-02-02,Europe / United Kingdom / England,0.0,29769,2021-09-11,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...
52179,EPI_ISL_3987256,GK,2021-08-18,"(M_I82T,NSP12_P323L,Spike_P681R,Spike_R158del,...",,Swedish national genomic surveillance program ...,Original,The Public Health Agency of Sweden,0.379528,Human,...,False,True,AY.46.6,2022-02-02,Europe / Sweden / Kronoberg,0.0,29816,2021-09-07,betacoronavirus,VOC Delta GK (B.1.617.2+AY.*) first detected i...


## Data Cleaning

In [17]:
# Remove non-human hosts
original_len = len(gisaid_metadata)
gisaid_metadata = gisaid_metadata[gisaid_metadata['covv_host'] == 'Human']
print(f'{original_len - len(gisaid_metadata)} non-human samples removed')  # 5,411 samples removed

5411 non-human samples removed


## Data Enrichment

### Country

In [18]:
gisaid_metadata['covv_location'].value_counts()

Europe / United Kingdom / England                                 1643702
North America / USA / California                                   237471
Europe / United Kingdom / Scotland                                 183835
Europe / United Kingdom / Wales                                    170302
North America / USA / Colorado                                     154177
                                                                   ...   
Europe / Poland / Kujawsko-Pomorskie / Inowroclaw                       1
Europe / Poland / Pomorskie / Katruski                                  1
North America / USA / Arizona / Lake                                    1
Asia / India / Madhya Pradesh / Ratalam                                 1
Europe / Germany / Mecklenburg-Western Pomerania / Ueckermunde          1
Name: covv_location, Length: 27917, dtype: int64

In [19]:
# Extract country from location consisting of at least the continent (index 0) and country (index 1)
gisaid_metadata['country'] = gisaid_metadata['covv_location'].apply(lambda loc: loc.split('/')[1].strip())
gisaid_metadata['country'].unique()

array(['United Kingdom', 'Italy', 'USA', 'France', 'Mexico', 'Sweden',
       'Netherlands', 'Austria', 'Spain', 'Japan', 'Australia', 'Belgium',
       'India', 'Switzerland', 'Canada', 'Reunion', 'Guam', 'Lithuania',
       'El Salvador', 'Brazil', 'Bulgaria', 'Northern Mariana Islands',
       'Slovenia', 'Turkey', 'Germany', 'Argentina',
       'U.S. Virgin Islands', 'Nigeria', 'Slovakia', 'Vietnam',
       'Czech Republic', 'Denmark', 'Ireland', 'Myanmar', 'Iran', 'Peru',
       'Indonesia', 'Hong Kong', 'Portugal', 'Georgia', 'Romania',
       'Colombia', 'Malaysia', 'Bangladesh', 'Jordan', 'Croatia', 'Chile',
       'South Africa', 'Poland', 'Egypt', 'Venezuela', 'Russia', 'Aruba',
       'Serbia', 'United Arab Emirates', 'Israel', 'Taiwan', 'Maldives',
       'Finland', 'Greece', 'Ghana', 'Montenegro', 'Hungary',
       'Trinidad and Tobago', 'China', 'Thailand', 'Singapore',
       'Guadeloupe', 'Ecuador', 'South Korea', 'Qatar', 'Martinique',
       'Bermuda', 'Saint Vincent 

### WHO Label of Virus Variant

Structure of covv_variant column:

`VOI/VOC/VUM <WHO_label> <GISAID_clade> (<Pango_lineage>) first detected in <location>`

In [20]:
# Add column with short version of variant names
# "Unknown" is set, if covv_variant is NaN
gisaid_metadata['variant_WHO_label'] = gisaid_metadata['covv_variant'].apply(lambda name: str(name).split(' ')[1] if 'first' in str(name) else 'Unknown')

# Rename GISAID clade "GH/490R" to short version of Pango lineage "B.1.640+B.1.640.*"
# Was added as name of this variant has no WHO label like "Alpha" etc.
gisaid_metadata['variant_WHO_label'] = gisaid_metadata['variant_WHO_label'].replace('GH/490R', 'B.1.640')

gisaid_metadata['variant_WHO_label'].value_counts()

Delta      4173715
Unknown    1239860
Omicron    1157449
Alpha      1150990
Gamma       120350
Epsilon      70339
Iota         43100
Beta         41607
Mu           15459
Lambda        9768
Eta           9493
Kappa         8227
Zeta          5190
B.1.640        635
Theta          624
Name: variant_WHO_label, dtype: int64

## Export as CSV

In [25]:
# Store preprocessed GISAID metadata as CSV file
gisaid_metadata.to_csv('../../data/gisaid_metadata_preprocessed.csv', index=False)