# GISAID Metadata - Preprocessing
## Conversion from JSON to TSV

In [None]:
import json

# Show available columns
with open('../../data/gisaid.json') as f:
    print(json.loads(f.readline()).keys())

In [None]:
with open('../../data/gisaid.json', 'r') as json_file:
    print(sum(1 for line in json_file))  # 6,984,464 samples found -> used as n_lines in cell below

In [None]:
import json
import csv

fail = 0
n_lines = 6984464

with open('../../data/gisaid.json', 'r', encoding='utf-8') as json_file:
  with open('../../data/gisaid_metadata.tsv', 'w') as tsv_file:
    for i in range(n_lines):
      # Read one key-value object per line
      obj = json.loads(json_file.readline())

      # For first object, take keys as header for TSV
      if i == 0:
        dw = csv.DictWriter(tsv_file, obj.keys(), delimiter='\t')
        dw.writeheader()

      try:
        # For each object, append its values as new row to TSV
        dw.writerow(obj)
      except UnicodeEncodeError:
        # If UnicodeEncodeError occurs, ignore row
        fail += 1
        continue

print(fail)  # -> 34,699 rows have thrown UnicodeEncodeError

In [1]:
import dask.dataframe as dd
import pandas as pd

gisaid_metadata = dd.read_csv('../../data/gisaid_metadata.tsv', encoding='iso-8859-1', delimiter='\t')
gisaid_metadata = gisaid_metadata.compute()
gisaid_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6949765 entries, 0 to 52810
Data columns (total 21 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   covv_accession_id          object 
 1   covv_clade                 object 
 2   covv_collection_date       object 
 3   covsurver_prot_mutations   object 
 4   covsurver_uniquemutlist    object 
 5   covv_orig_lab              object 
 6   covv_passage               object 
 7   covv_subm_lab              object 
 8   gc_content                 float64
 9   covv_host                  object 
 10  is_high_coverage           bool   
 11  is_reference               bool   
 12  is_complete                bool   
 13  covv_lineage               object 
 14  pangolin_lineages_version  object 
 15  covv_location              object 
 16  n_content                  float64
 17  sequence_length            int64  
 18  covv_subm_date             object 
 19  covv_type                  object 
 20  covv

In [39]:
len(gisaid_metadata)
# -> 6,949,765 rows
# -> 34,699 rows missing due to UnicodeEncodeError

6949765

In [40]:
gisaid_metadata.head()

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,gc_content,covv_host,...,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,n_content,sequence_length,covv_subm_date,covv_type,covv_variant
0,EPI_ISL_1989555,GR,2021-03-21,"(Spike_K417T,Spike_N501Y,NSP12_P323L,NSP3_S370...",,Medical Laboratories Duesseldorf,Original,"Center of Medical Microbiology, Virology, and ...",0.380387,Human,...,False,True,P.1.15,2022-01-05,Europe / Germany / North Rhine-Westphalia,0.038961,29773,2021-05-10,betacoronavirus,VOC Gamma GR/501Y.V3 (P.1+P.1.x) first detecte...
1,EPI_ISL_2183215,GRY,2021-04-25,"(Spike_N501Y,NSP12_P323L,NSP6_G107del,NSP3_A89...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.379828,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern,0.019734,29847,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
2,EPI_ISL_3028444,GK,2021-07-14,"(Spike_P681R,N_D63G,NS7a_V82A,NSP12_P323L,NSP1...","(NSP12_L838I,NS7a_V82A,N_D63G,N_Q9L)","Department of Bacteria, Parasites and Fungi, S...",Original,Statens Serum Institut Bioinformatics and Mic...,0.379649,Human,...,False,True,AY.43,2022-01-05,Europe / Denmark / Hovedstaden,0.0,29769,2021-07-21,betacoronavirus,VOC Delta GK/478K.V1 (B.1.617.2+AY.x) first de...
3,EPI_ISL_2183216,GRY,2021-05-10,"(Spike_N501Y,NSP8_Q24R,NSP12_P323L,NSP13_K460R...",(NSP3_K394R),Laboratory Corporation of America,Original,Centers for Disease Control and Prevention D...,0.379973,Human,...,False,True,B.1.1.7,2022-01-05,North America / USA / Virginia,0.0,29760,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
4,EPI_ISL_3721606,GH,2021-07-09,"(Spike_R346K,Spike_Y145N,NSP3_T237A,Spike_N501...",(NSP12_L65S),ICMT-Apartado,Original,Universidad Nacional de Colombia - Laboratorio...,0.379718,Human,...,False,True,B.1.621,2022-01-05,South America / Colombia / Antioquia / Apartado,0.012592,29781,2021-08-27,betacoronavirus,VOI Mu GH (B.1.621+B.1.621.1) first detected i...


In [43]:
gisaid_metadata.tail()

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,gc_content,covv_host,...,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,n_content,sequence_length,covv_subm_date,covv_type,covv_variant
52806,EPI_ISL_2183210,G,2021-04-25,"(Spike_N501Y,NSP1_V84del,NSP12_P323L,NSP1_H83d...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.378747,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern Coast,0.009143,29860,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52807,EPI_ISL_2183211,GRY,2021-05-10,"(N_A208del,Spike_N501Y,N_M210del,NSP2_P106T,NS...",(NSP3_F800L),Laboratory Corporation of America,Original,Centers for Disease Control and Prevention D...,0.379867,Human,...,False,True,B.1.1.7,2022-01-05,North America / USA / Virginia,0.0,29742,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52808,EPI_ISL_2183212,GRY,2021-04-25,"(Spike_N501Y,NSP12_P323L,NSP6_G107del,NSP3_A89...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.37934,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern Coast,0.000368,29876,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52809,EPI_ISL_2183213,GRY,2021-05-10,"(NS3_H78Y,Spike_N501Y,NSP4_S41I,NSP12_P323L,NS...",(NSP4_S40T),Laboratory Corporation of America,Original,Centers for Disease Control and Prevention D...,0.379774,Human,...,False,True,B.1.1.7,2022-01-05,North America / USA / Virginia,0.0,29665,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52810,EPI_ISL_2183214,G,2021-04-25,"(Spike_N501Y,Spike_S221L,NSP12_P323L,NSP6_G107...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.378643,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern Coast,0.00918,29848,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...


## Data Cleaning

In [2]:
# Remove non-human hosts (39 hosts available)
original_len = len(gisaid_metadata)
gisaid_metadata = gisaid_metadata[gisaid_metadata['covv_host'] == 'Human']
print(f'{original_len - len(gisaid_metadata)} non-human samples removed')  # 4,540 samples removed

4540 non-human samples removed


## Data Enrichment

### Country

In [6]:
gisaid_metadata['covv_location'].value_counts()

Europe / United Kingdom / England                                                1382051
North America / USA / California                                                  196196
Europe / United Kingdom / Scotland                                                159874
Europe / United Kingdom / Wales                                                   142306
Europe / Denmark / Hovedstaden                                                    136354
                                                                                  ...   
Europe / Belgium / Lamine                                                              1
Europe / Belgium / Lo                                                                  1
Europe / Belgium / Bevere                                                              1
Europe / Belgium / Aisemont                                                            1
Europe / Czech Republic / Vysocina Region / Nova Ves u Noveho Mesta na Morave          1
Name: covv_location, 

In [3]:
# Extract country from location consisting of at least the continent (index 0) and country (index 1)
gisaid_metadata['country'] = gisaid_metadata['covv_location'].apply(lambda loc: loc.split('/')[1].strip())
gisaid_metadata['country'].unique()

array(['Germany', 'Israel', 'Denmark', 'USA', 'Colombia',
       'United Kingdom', 'Cyprus', 'Mexico', 'Italy', 'Belgium', 'Spain',
       'Netherlands', 'Cambodia', 'Brazil', 'Japan', 'Switzerland',
       'Norway', 'New Zealand', 'Australia', 'Slovenia', 'France',
       'Canada', 'Kenya', 'Ireland', 'South Africa', 'Zimbabwe', 'Togo',
       'Poland', 'Czech Republic', 'Malaysia', 'Turkey', 'Philippines',
       'Argentina', 'Austria', 'Bonaire', 'Guadeloupe', 'Martinique',
       'Kuwait', 'Niger', 'Curacao', 'Aruba', 'India', 'Romania',
       'Croatia', 'Ghana', 'Cayman Islands', 'Indonesia', 'Maldives',
       'Ecuador', 'Kazakhstan', 'Sweden', 'Sint Maarten', 'Puerto Rico',
       'Taiwan', 'Portugal', 'Malawi', 'Angola', 'Estonia', 'Singapore',
       'Malta', 'Nigeria', 'Thailand', 'Greece', 'Hong Kong',
       'South Korea', 'Lithuania', 'Gibraltar', 'Namibia', 'Egypt',
       'Costa Rica', 'U.S. Virgin Islands', 'Venezuela', 'Russia',
       'Guyana', 'Peru', 'Burkina Faso'

### WHO Label of Virus Variant

Structure of covv_variant column:

`VOI/VOC/VUM <WHO_label> <GISAID_clade> (<Pango_lineage>) first detected in <location>`

In [2]:
# Add column with short version of variant names
# "Unknown" is set, if covv_variant is NaN
gisaid_metadata['variant_WHO_label'] = gisaid_metadata['covv_variant'].apply(lambda name: str(name).split(' ')[1] if 'first' in str(name) else 'Unknown')

# Rename GISAID clade "GH/490R" to short version of Pango lineage "B.1.640+B.1.640.*"
# Was added as name of this variant has no WHO label like "Alpha" etc.
gisaid_metadata['variant_WHO_label'] = gisaid_metadata['variant_WHO_label'].replace('GH/490R', 'B.1.640')

gisaid_metadata['variant_WHO_label'].value_counts()

Delta      3992087
Unknown    1210817
Alpha      1145490
Omicron     276164
Gamma       120122
Epsilon      68135
Iota         42909
Beta         41000
Mu           14769
Lambda        9572
Eta           9424
Kappa         8150
Zeta          5506
Theta          624
B.1.640        456
Name: variant_WHO_label, dtype: int64

## Export as CSV

In [3]:
# Store preprocessed GISAID metadata as CSV file
gisaid_metadata.to_csv('../../data/gisaid_metadata_preprocessed.csv', index=False)