# GISAID Metadata - Investigation
## Conversion from JSON to TSV

In [2]:
import json

# Show available columns
with open('../data/gisaid.json') as f:
    print(json.loads(f.readline()).keys())

dict_keys(['covv_accession_id', 'covv_clade', 'covv_collection_date', 'covsurver_prot_mutations', 'covsurver_uniquemutlist', 'covv_orig_lab', 'covv_passage', 'covv_subm_lab', 'gc_content', 'covv_host', 'is_high_coverage', 'is_reference', 'is_complete', 'covv_lineage', 'pangolin_lineages_version', 'covv_location', 'n_content', 'sequence_length', 'covv_subm_date', 'covv_type', 'covv_variant'])


In [28]:
with open('../data/gisaid.json', 'r') as json_file:
    print(sum(1 for line in json_file))  # 6984464 samples found -> used as n_lines in cell below

6984464


In [None]:
import json
import csv

fail = 0
with open('../data/gisaid.json', 'r', encoding='utf-8') as json_file:
  with open('../data/gisaid_metadata.tsv', 'w') as tsv_file:

    n_lines = 6984464
    for i in range(n_lines):
      # Read one key-value object per line
      obj = json.loads(json_file.readline())

      # For first object, take keys as header for TSV
      if i == 0:
        dw = csv.DictWriter(tsv_file, obj.keys(), delimiter='\t')
        dw.writeheader()

      try:
        # For each object, append its values as new row to TSV
        dw.writerow(obj)
      except UnicodeEncodeError:
        # If UnicodeEncodeError occurs, ignore row
        fail += 1
        continue

print(fail)  # -> 34,699 rows have thrown UnicodeEncodeError

In [1]:
import dask.dataframe as dd

gisaid_metadata = dd.read_csv('../data/gisaid_metadata.tsv', encoding='iso-8859-1', delimiter='\t')
gisaid_metadata.info()  # -> 21 columns: 15 string, 3 bool, 2 float & 1 integer

<class 'dask.dataframe.core.DataFrame'>
Columns: 21 entries, covv_accession_id to covv_variant
dtypes: object(15), bool(3), float64(2), int64(1)

In [39]:
len(gisaid_metadata)
# -> 6,949,765 rows
# -> 34,699 rows missing due to UnicodeEncodeError

6949765

In [40]:
gisaid_metadata.head()

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,gc_content,covv_host,...,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,n_content,sequence_length,covv_subm_date,covv_type,covv_variant
0,EPI_ISL_1989555,GR,2021-03-21,"(Spike_K417T,Spike_N501Y,NSP12_P323L,NSP3_S370...",,Medical Laboratories Duesseldorf,Original,"Center of Medical Microbiology, Virology, and ...",0.380387,Human,...,False,True,P.1.15,2022-01-05,Europe / Germany / North Rhine-Westphalia,0.038961,29773,2021-05-10,betacoronavirus,VOC Gamma GR/501Y.V3 (P.1+P.1.x) first detecte...
1,EPI_ISL_2183215,GRY,2021-04-25,"(Spike_N501Y,NSP12_P323L,NSP6_G107del,NSP3_A89...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.379828,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern,0.019734,29847,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
2,EPI_ISL_3028444,GK,2021-07-14,"(Spike_P681R,N_D63G,NS7a_V82A,NSP12_P323L,NSP1...","(NSP12_L838I,NS7a_V82A,N_D63G,N_Q9L)","Department of Bacteria, Parasites and Fungi, S...",Original,Statens Serum Institut Bioinformatics and Mic...,0.379649,Human,...,False,True,AY.43,2022-01-05,Europe / Denmark / Hovedstaden,0.0,29769,2021-07-21,betacoronavirus,VOC Delta GK/478K.V1 (B.1.617.2+AY.x) first de...
3,EPI_ISL_2183216,GRY,2021-05-10,"(Spike_N501Y,NSP8_Q24R,NSP12_P323L,NSP13_K460R...",(NSP3_K394R),Laboratory Corporation of America,Original,Centers for Disease Control and Prevention D...,0.379973,Human,...,False,True,B.1.1.7,2022-01-05,North America / USA / Virginia,0.0,29760,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
4,EPI_ISL_3721606,GH,2021-07-09,"(Spike_R346K,Spike_Y145N,NSP3_T237A,Spike_N501...",(NSP12_L65S),ICMT-Apartado,Original,Universidad Nacional de Colombia - Laboratorio...,0.379718,Human,...,False,True,B.1.621,2022-01-05,South America / Colombia / Antioquia / Apartado,0.012592,29781,2021-08-27,betacoronavirus,VOI Mu GH (B.1.621+B.1.621.1) first detected i...


In [43]:
gisaid_metadata.tail()

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,gc_content,covv_host,...,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,n_content,sequence_length,covv_subm_date,covv_type,covv_variant
52806,EPI_ISL_2183210,G,2021-04-25,"(Spike_N501Y,NSP1_V84del,NSP12_P323L,NSP1_H83d...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.378747,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern Coast,0.009143,29860,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52807,EPI_ISL_2183211,GRY,2021-05-10,"(N_A208del,Spike_N501Y,N_M210del,NSP2_P106T,NS...",(NSP3_F800L),Laboratory Corporation of America,Original,Centers for Disease Control and Prevention D...,0.379867,Human,...,False,True,B.1.1.7,2022-01-05,North America / USA / Virginia,0.0,29742,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52808,EPI_ISL_2183212,GRY,2021-04-25,"(Spike_N501Y,NSP12_P323L,NSP6_G107del,NSP3_A89...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.37934,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern Coast,0.000368,29876,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52809,EPI_ISL_2183213,GRY,2021-05-10,"(NS3_H78Y,Spike_N501Y,NSP4_S41I,NSP12_P323L,NS...",(NSP4_S40T),Laboratory Corporation of America,Original,Centers for Disease Control and Prevention D...,0.379774,Human,...,False,True,B.1.1.7,2022-01-05,North America / USA / Virginia,0.0,29665,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...
52810,EPI_ISL_2183214,G,2021-04-25,"(Spike_N501Y,Spike_S221L,NSP12_P323L,NSP6_G107...",,Israel Central Virology laboratory,Original,Israel National Consortium for SARS-CoV-2 sequ...,0.378643,Human,...,False,True,B.1.1.7,2022-01-05,Asia / Israel / Southern Coast,0.00918,29848,2021-05-20,betacoronavirus,VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first de...


## Data Cleaning

In [32]:
# Remove non-human hosts (39 hosts available)
original_len = len(gisaid_metadata)
gisaid_metadata = gisaid_metadata[gisaid_metadata['covv_host'] == 'Human']
print(f'{original_len - len(gisaid_metadata)} non-human samples removed')  # 4,540 samples removed

4540 non-human samples removed


## Overview of Available Features

In [25]:
# Get statistics of categorical columns
gisaid_metadata.describe(include=['object', 'bool']).compute()

# FINDINGS:
# covv_variant has 14 unique values
# sequencing reason/ keywords column don't exist but should (see GISAID upload or search area)

# QUESTIONS FOR FABIO:
# 1. Why does metadata by CovRadar API has different column names?
# 2. Can we include sequencing reason?
# 3. Where to find documentation?

Unnamed: 0,covv_accession_id,covv_clade,covv_collection_date,covsurver_prot_mutations,covsurver_uniquemutlist,covv_orig_lab,covv_passage,covv_subm_lab,covv_host,is_high_coverage,is_reference,is_complete,covv_lineage,pangolin_lineages_version,covv_location,covv_subm_date,covv_type,covv_variant
unique,6945225,11,870,3177940,152303,20425,324,4086,1,2,2,2,1578,1,26054,723,1,14
count,6945225,6945158,6945225,6945225,1642878,6945153,6945225,6945224,6945225,6945225,6945225,6945225,6945158,6945222,6945225,6945225,6945206,5734408
top,EPI_ISL_1000000,GK,2021,"(Spike_P681R,N_D63G,NS7a_V82A,NSP12_P323L,NSP1...","(NS7a_V82A,N_D63G)",Lighthouse Lab in Milton Keynes,Original,Wellcome Sanger Institute for the COVID-19 Gen...,Human,True,False,True,B.1.1.7,2022-01-05,Europe / United Kingdom / England,2021-11-17,betacoronavirus,VOC Delta GK/478K.V1 (B.1.617.2+AY.x) first de...
freq,1,3862210,58872,34659,407363,385133,6899356,1200964,6945225,4848328,6945224,6839717,1116179,6945222,1382051,68511,6945206,3992087


In [26]:
# Get statistics of numeric columns
gisaid_metadata.describe(include=['float64', 'int64']).compute()

Unnamed: 0,gc_content,n_content,sequence_length
count,6945225.0,6945225.0,6945225.0
mean,0.3794622,0.01353829,29426.21
std,0.002074926,0.03736357,3023.512
min,0.1449275,0.0,64.0
25%,0.3795258,0.0,29768.0
50%,0.3797201,0.00540051,29799.0
75%,0.380057,0.05040092,29842.0
max,0.5357143,0.9977176,34692.0


In [2]:
# Get value counts of virus variants
gisaid_metadata.covv_variant.value_counts().compute()

VOC Delta GK/478K.V1 (B.1.617.2+AY.x) first detected in India                         3993832
VOC Alpha 202012/01 GRY (B.1.1.7+Q.x) first detected in the UK                        1145975
VOC Omicron GRA (B.1.1.529+BA.*) first detected in Botswana/Hong Kong/South Africa     276235
VOC Gamma GR/501Y.V3 (P.1+P.1.x) first detected in Brazil/Japan                        120130
VOI Epsilon GH/452R.V1 (B.1.429+B.1.427) first detected in USA/California               68180
VOI Iota GH/253G.V1 (B.1.526) first detected in USA/New York                            42912
VOC Beta GH/501Y.V2 (B.1.351+B.1.351.2+B.1.351.3) first detected in South Africa        41006
VOI Mu GH (B.1.621+B.1.621.1) first detected in Colombia                                14774
VOI Lambda GR/452Q.V1 (C.37+C.37.1) first detected in Peru                               9575
VOI Eta G/484K.V3 (B.1.525) first detected in UK/Nigeria                                 9424
VOI Kappa G/452R.V3 (B.1.617.1) first detected in India     