In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
import json
import requests
import srsly
from pathlib import Path

In [None]:
# Code for computing variant proportions from the CoVariants GitHub data
# Data source: CoVariants cluster tables (https://github.com/hodcroftlab/covariants/tree/master/cluster_tables), see covariants.org

In [None]:
# Set path to target data in data folder
cwd = Path.cwd()
# The raw variant files sit in data/raw-variant-data, two directory levels above the
# working directory the notebook is run from
variant_data_path = cwd.parent.parent / "data" / "raw-variant-data"

In [None]:
# Show the directory two levels above the working directory (the base the data path is built from)
cwd.parent.parent

In [None]:
# Variant dictionary: maps a cleaned variant name to the parsed contents of its JSON file
variant_data_dict = {}
# NOTE(review): temp_dict is not used by any cell visible here; kept in case other cells rely on it
temp_dict = {}

# Read every JSON file in the raw-data folder and add it to the dictionary.
# Restricting to *.json skips stray entries that srsly.read_json cannot parse, and
# sorting makes the load order (and the row order of anything built from this
# dictionary) the same on every run instead of depending on the file system.
for p in sorted(variant_data_path.glob('*.json')):
    variant_string = p.stem
    # keep the part of the file stem before the first underscore, with dots swapped for underscores
    variant_name = variant_string.split('_')[0].replace('.','_')
    variant_dict = srsly.read_json(p)
    variant_data_dict[variant_name] = variant_dict

In [None]:
# Confirm that every variant file produced a key in the dictionary
keysList = [*variant_data_dict]
print(keysList)
# Note: at this point the names are Nextstrain clade names combined with WHO VOC naming

In [None]:
# Create one long dataframe from the nested variant dictionary.
# Every per-country table is tagged with its country and variant name, and all tables
# are concatenated in a single call: growing a dataframe with pd.concat inside the loop
# re-copies all accumulated rows on each iteration and concatenates onto an empty frame.
country_frames = [
    pd.DataFrame(records).assign(country=country_name, nextclade_variant=variant)
    for variant, per_country in variant_data_dict.items()
    for country_name, records in per_country.items()
]

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was loaded.
# The index is deliberately not reset, so each table keeps its own row labels.
df = pd.concat(country_frames) if country_frames else pd.DataFrame()

# create new variant proportion column
df['variant_prop'] = df['cluster_sequences']/df['total_sequences']

In [None]:
# Let's create a new column that adds the more common variant names

In [None]:

# Define the function to categorise names
def variant_rename(variant_name):
    """Translate a clade-style variant name into its common variant label.

    Parameters
    ----------
    variant_name : str
        Variant name such as ``'21K_Omicron'``.

    Returns
    -------
    str
        The common label for the listed names; ``'Omicron_BA5'`` for any
        name that is not listed.
    """
    common_names = {
        '20H_Beta_V2': 'Beta',
        '20I_Alpha_V1': 'Alpha',
        '21A_Delta': 'Delta',
        '21I_Delta': 'Delta',
        '21J_Delta': 'Delta',
        '21K_Omicron': 'Omicron_BA1',
        '21L_Omicron': 'Omicron_BA2',
        '22A_Omicron': 'Omicron_BA4',
    }
    # NOTE(review): every unlisted name is labelled 'Omicron_BA5' - confirm that the
    # input data can only contain one further variant besides the ones listed above
    return common_names.get(variant_name, 'Omicron_BA5')

# Build the common-name column by passing every nextclade_variant value through variant_rename
df['variant'] = df['nextclade_variant'].map(variant_rename)

In [None]:
# Inspect the first 20 rows of df to check the columns added so far
df.head(20)

In [None]:
# Countries to keep for the analysis
countries = ['Malaysia', 'Philippines', 'Vietnam']

# Restrict the full dataframe to rows whose country is in the list above
df_analysis = df[df['country'].isin(countries)]

In [None]:
# identify week first time variant appears - i.e. prop > 0
threshold = 0.0
# head(2) keeps the first two qualifying rows of every (country, variant) group, in the
# dataframe's existing row order. NOTE(review): this assumes rows are already in
# chronological order - confirm, otherwise sort by the week column first
emergence_df = df_analysis.loc[df_analysis['variant_prop'] > threshold].groupby(['country','variant']).head(2)

# save as a csv; create the output folder first because to_csv does not create missing directories
emergence_path = cwd.parent.parent / Path("data/variant-data/variant-emergence.csv")
emergence_path.parent.mkdir(parents=True, exist_ok=True)
emergence_df.to_csv(emergence_path)

In [None]:
# identify week first time variant proportion goes above a threshold and there is reasonable sized sampling
threshold = 0.1  # variant proportion must be strictly greater than this
number = 50      # total sequences must be strictly greater than this

above_threshold = df_analysis['variant_prop'] > threshold
enough_samples = df_analysis['total_sequences'] > number
# head(2) keeps the first two qualifying rows of every (country, variant) group, in the
# dataframe's existing row order. NOTE(review): this assumes rows are already in
# chronological order - confirm, otherwise sort by the week column first
prev_df = df_analysis.loc[above_threshold & enough_samples].groupby(['country','variant']).head(2)

# save as a csv; create the output folder first because to_csv does not create missing directories
prevalence_path = cwd.parent.parent / Path("data/variant-data/variant-prevalence.csv")
prevalence_path.parent.mkdir(parents=True, exist_ok=True)
prev_df.to_csv(prevalence_path)

In [None]:
# Display the prevalence table that was written to variant-prevalence.csv
prev_df