# Overview of COVID Dataset

This dataset was downloaded from GISAID on April 10, 2023 and includes European samples from the three-week period between March 20 and April 10, 2023. The initial download was filtered to samples flagged as Complete, High coverage, and Collection date complete. Europe was a somewhat arbitrary choice, but I wanted to avoid major barriers to dispersal (oceans, major mountain ranges).

In [4]:
import pandas as pd
import requests
import json
import time
import plotly.express as px
from Bio import Seq, SeqIO, AlignIO

## Add Geographic Coordinates

The GISAID dataset provides Continent/Country/Region information for many of its samples; occasionally, City is provided as well. For the analysis, I needed to convert this location information into latitude and longitude. I did this using Nominatim, an API from OpenStreetMap, and stored the coordinates of the first result returned for each query. This point may not be at the center of the region or country, but because the locations are already reported at a coarse level, I felt that this was the simplest strategy. The results from the API are printed below.

In [9]:
# Load the GISAID metadata export (tab-separated).
metadata = pd.read_table(
    "data/Europe_20230320-20230410/original/sequencing_technology_metadata.tsv"
)

# "Location" is a single " / "-delimited string; expand it into one column
# per level of the hierarchy.
split_location = metadata["Location"].str.split(" / ", expand=True)
metadata[["Continent", "Country", "Region", "City"]] = split_location

# One row per distinct place, so that each place is only geocoded once.
locations = metadata[["Continent", "Country", "Region", "City"]].drop_duplicates()

def get_lat_lon(country, region, city):
    """Geocode a place with the Nominatim search API.

    The non-empty parts of ``country``, ``region`` and ``city`` are combined
    into one free-text query, and the coordinates of the first result are
    returned. The outcome of every lookup is printed.

    Args:
        country: Country name. A lookup is only attempted when this is a
            non-empty string.
        region: Optional region name; ``None``, NaN and ``""`` are skipped.
        city: Optional city name; ``None``, NaN and ``""`` are skipped.

    Returns:
        ``(latitude, longitude)`` as floats, or ``(None, None)`` when the
        country is missing, the request fails, or no result is returned.
    """
    # Missing table cells arrive as None or NaN; keep only real, non-empty
    # strings so that concatenation can never raise.
    parts = [
        part for part in (country, region, city)
        if isinstance(part, str) and part
    ]
    search_term = "+".join(parts)  # same label format as before, for the log

    if not isinstance(country, str) or not country:
        print(search_term, "FAIL")
        return None, None

    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search.php",
            # Let requests URL-encode the query rather than splicing raw text
            # into the URL; spaces are sent as "+" exactly as before.
            params={"q": " ".join(parts), "format": "jsonv2"},
            timeout=30,
        )
        response.raise_for_status()
        # Only the first hit is used; an empty result list raises IndexError.
        location_data = json.loads(response.text)[0]
        coordinates = float(location_data["lat"]), float(location_data["lon"])
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: report the failure and let the caller drop the row.
        print(search_term, "FAIL")
        return None, None
    finally:
        # Pause after every request, including failed ones, to throttle
        # traffic to the public Nominatim server.
        time.sleep(2)

    print(search_term, location_data)
    return coordinates

# Geocode every distinct location. get_lat_lon returns a (lat, lon) pair,
# which result_type="expand" spreads across the two new columns.
locations[["Latitude", "Longitude"]] = locations.apply(
    lambda row: get_lat_lon(row["Country"], row["Region"], row["City"]),
    axis=1,
    result_type="expand",
)

# Keep only the locations that were geocoded (failed lookups have no latitude).
found_locations = locations[locations["Latitude"].notna()]

# Default (inner) merge: samples whose location was not found are dropped.
metadata = metadata.merge(found_locations, on=["Continent", "Country", "Region", "City"])
# Left disabled so the geocoded table is not overwritten on every run.
#metadata.to_csv("data/Europe_20230320-20230410/original/sequencing_technology_metadata_with_latlon.tsv", sep="\t", index=False)

Belgium+Limburg {'place_id': 298536463, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 53142, 'boundingbox': ['50.6954339', '51.3001647', '4.9800804', '5.9111095'], 'lat': '50.9977937', 'lon': '5.445357520264672', 'display_name': 'Limburg, Vlaanderen, België / Belgique / Belgien', 'place_rank': 8, 'category': 'boundary', 'type': 'administrative', 'importance': 0.6644012435370624, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}
Italy+Umbria {'place_id': 297915519, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 42004, 'boundingbox': ['42.3647769', '43.6173443', '11.8920337', '13.2641895'], 'lat': '42.965916', 'lon': '12.490236', 'display_name': 'Umbria, Italia', 'place_rank': 8, 'category': 'boundary', 'type': 'administrative', 'importance': 0.7573426638616688, 'icon': 'https://nominatim.openstreetm

Germany+Saxony {'place_id': 297922329, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 62467, 'boundingbox': ['50.1713271', '51.6851316', '11.872254', '15.0419309'], 'lat': '50.9295798', 'lon': '13.4585052', 'display_name': 'Sachsen, Deutschland', 'place_rank': 8, 'category': 'boundary', 'type': 'administrative', 'importance': 0.6695110203247417, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}
Germany+Baden-Wurttemberg {'place_id': 297930133, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 62611, 'boundingbox': ['47.5324787', '49.7912941', '7.5117461', '10.4955731'], 'lat': '48.53775', 'lon': '9.041169', 'display_name': 'Baden-Württemberg, Deutschland', 'place_rank': 8, 'category': 'boundary', 'type': 'administrative', 'importance': 0.7889406218540862, 'icon': 'https://nominatim.openstreetmap.org/ui

France+Hauts-de-France+Douai {'place_id': 297944210, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 56243, 'boundingbox': ['50.3491934', '50.4105745', '3.0514828', '3.149484'], 'lat': '50.3675677', 'lon': '3.0804641', 'display_name': 'Douai, Nord, Hauts-de-France, France métropolitaine, 59500, France', 'place_rank': 16, 'category': 'boundary', 'type': 'administrative', 'importance': 1.1145653862933091, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}
France+Bretagne+Melesse {'place_id': 298125186, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 398729, 'boundingbox': ['48.1810158', '48.2573885', '-1.7327721', '-1.6445114'], 'lat': '48.2175823', 'lon': '-1.6962403', 'display_name': 'Melesse, Rennes, Ille-et-Vilaine, Bretagne, France métropolitaine, 35520, France', 'place_rank': 16, 'category': 'bound

Ireland+Cork {'place_id': 299556622, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 11797890, 'boundingbox': ['51.8273102', '51.9701415', '-8.6378543', '-8.3551315'], 'lat': '51.897077', 'lon': '-8.4654674', 'display_name': 'Cork, County Cork, Munster, Éire / Ireland', 'place_rank': 14, 'category': 'boundary', 'type': 'administrative', 'importance': 0.8024517456052389, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}
Ireland+Carlow {'place_id': 299471090, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 285977, 'boundingbox': ['52.4635897', '52.9180156', '-7.1079443', '-6.504941'], 'lat': '52.69078865', 'lon': '-6.825145150844913', 'display_name': 'County Carlow, Leinster, Éire / Ireland', 'place_rank': 12, 'category': 'boundary', 'type': 'administrative', 'importance': 0.6893972336827785, 'icon': 'h

Luxembourg {'place_id': 358617682, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 2171347, 'boundingbox': ['49.4478587', '50.1827726', '5.7357006', '6.5312481'], 'lat': '49.8158683', 'lon': '6.1296751', 'display_name': 'Lëtzebuerg', 'place_rank': 4, 'category': 'boundary', 'type': 'administrative', 'importance': 0.7282786757940397, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}
Ireland+Galway {'place_id': 298271376, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 1390623, 'boundingbox': ['53.248517', '53.3197423', '-9.1426901', '-8.9548381'], 'lat': '53.2744122', 'lon': '-9.0490601', 'display_name': 'Cathair na Gaillimhe, County Galway, Connacht, Éire / Ireland', 'place_rank': 14, 'category': 'boundary', 'type': 'administrative', 'importance': 0.7587954165289414, 'icon': 'https://nominatim.openstr

Now that I have the coordinates of the samples, I keep only one sample from each location. This filters the dataset down to 72 samples, which should be within the range that ARGweaver can handle. The samples are plotted below. Note that I did not filter by sequence quality; that should be done before any of these steps.

In [10]:
# Reload the geocoded metadata and keep the first sample found at each
# (Latitude, Longitude) pair.
metadata = pd.read_table(
    "data/Europe_20230320-20230410/original/sequencing_technology_metadata_with_latlon.tsv"
).drop_duplicates(["Latitude", "Longitude"])

# Composite identifier: "<Virus name>|<Accession ID>|<Collection date>".
metadata["ID"] = (
    metadata["Virus name"]
    + "|"
    + metadata["Accession ID"]
    + "|"
    + metadata["Collection date"]
)

metadata.to_csv(
    "data/Europe_20230320-20230410/filtered/sequencing_technology_metadata_filtered.tsv",
    sep="\t",
    index=False,
)

print(f"There are {len(metadata)} unique locations.")

# Plot one point per retained sample on an OpenStreetMap base layer.
fig = px.scatter_mapbox(
    metadata,
    lat="Latitude",
    lon="Longitude",
    hover_name="Virus name",
    hover_data=["Continent", "Country", "Region", "City"],
    zoom=8,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},  # no padding around the map
)
fig.show()

There are 72 unique locations.


In [7]:
# Collect the reference genome followed by every downloaded sequence whose
# FASTA id matches the "ID" of a sample kept in `metadata`.

# Build the id lookup once; a set gives constant-time membership tests
# instead of rebuilding and scanning a list for every FASTA record.
selected_ids = set(metadata["ID"])

records = []
# Context managers make sure both FASTA handles are closed once consumed.
with open("data/reference.fasta") as reference_handle:
    reference_seq = SeqIO.parse(reference_handle, "fasta")
    records.extend(reference_seq)
with open("data/Europe_20230320-20230410/original/nucleotide_sequences.fasta") as sequences_handle:
    fasta_sequences = SeqIO.parse(sequences_handle, "fasta")
    records.extend(fasta for fasta in fasta_sequences if fasta.id in selected_ids)

# Writing the filtered FASTA is left disabled.
#output_file = "data/Europe_20230320-20230410/filtered/nucleotide_sequences_filtered.fasta"
#with open(output_file, 'w') as f:
#    SeqIO.write(records, f, 'fasta')

## Align Sequences

In [8]:
#!mafft data/Europe_20230320-20230410/filtered/nucleotide_sequences_filtered.fasta > data/Europe_20230320-20230410/filtered/nucleotide_sequences_filtered_aligned.fasta


nseq =  73
distance =  ktuples
iterate =  0
cycle =  2
sparsepickup = 0
nguidetree = 2
nthread = 0
sueff_global = 0.100000
generating a scoring matrix for nucleotide (dist=200) ... done
done
done
scoremtx = -1
Gap Penalty = -1.53, +0.00, +0.00

tuplesize = 6, dorp = d


Making a distance matrix ..

There are 4355 ambiguous characters.
    1 / 73
done.

Constructing a UPGMA tree ... 
   70 / 73
done.

Progressive alignment 1/2... 
STEP    67 / 72 f
Reallocating..done. *alloclen = 60894
STEP    69 / 72 f
len1=30056, len2=29679, Switching to the memsave mode


STEP    71 / 72 fm FFT ... rs) DP 00001 / 00193DP 00002 / 00193DP 00003 / 00193DP 00004 / 00193DP 00005 / 00193DP 00006 / 00193DP 00007 / 00193DP 00008 / 00193DP 00009 / 00193DP 00010 / 00193DP 00011 / 00193DP 00012 / 00193DP 00013 / 00193DP 00014 / 00193DP 00015 / 00193DP 00016 / 00193DP 00017 / 00193DP 00018 / 00193DP 00019 / 00193DP 00020 / 00193DP 00021 / 00193DP 00022 / 00193DP 00023 / 00193DP 00024 / 00193DP 00025 / 00193DP 00026 / 00193DP 00027 / 00193DP 00028 / 00193DP 00029 / 00193DP 00030 / 00193DP 00031 / 00193DP 00032 / 00193DP 00033 / 00193DP 00034 / 00193DP 00035 / 00193DP 00036 / 00193DP 00037 / 00193DP 00038 / 00193DP 00039 / 00193DP 00040 / 00193DP 00041 / 00193DP 00042 / 00193DP 00043 / 00193DP 00044 / 00193DP 00045 / 00193DP 00046 / 00193DP 00047 / 00193DP 00048 / 00193DP 00049 / 00193DP 00050 / 00193DP 00051 / 00193DP 00052 / 00193DP 00053 / 00193DP 00054 / 00193DP 00055 / 00193DP 00056 / 00193DP 00057 / 00193DP 00058 / 00193DP 00059 / 00193DP 00060 / 00193DP 00061 

In [11]:
# Print the length of every record in the filtered, aligned FASTA; identical
# values mean all rows of the alignment have the same width.
# The context manager closes the file once all records have been read.
with open("data/Europe_20230320-20230410/filtered/nucleotide_sequences_filtered_aligned.fasta") as aligned_handle:
    aligned_fasta_sequences = SeqIO.parse(aligned_handle, "fasta")
    for seq in aligned_fasta_sequences:
        print(len(seq.seq))

30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075
30075


In [12]:
# Print the length of every record in nextclade.aligned.fasta; identical
# values mean all rows of the alignment have the same width.
# The context manager closes the file once all records have been read.
with open("data/Europe_20230320-20230410/filtered/nextclade.aligned.fasta") as aligned_handle:
    aligned_fasta_sequences = SeqIO.parse(aligned_handle, "fasta")
    for seq in aligned_fasta_sequences:
        print(len(seq.seq))

29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
29903
