In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import json
import requests
import srsly
import pathlib

In [None]:
# Code for loading variant proportions from the CoVariants GitHub repository
# Data source: CoVariants cluster tables, https://github.com/hodcroftlab/covariants/tree/master/cluster_tables (covariants.org)

In [2]:
# Folder that holds the variant data files; the path is relative, so it is
# resolved against the directory the notebook is run from
path = pathlib.Path('../data/variant-data')

In [9]:
# Dictionary that collects every variant's parsed JSON, keyed by variant name
variant_data_dict = {}
# NOTE(review): temp_dict is not used in the visible cells; kept in case a
# later cell relies on it - confirm and remove if it is dead.
temp_dict = {}

# Read only the *.json files, in sorted order: stray entries in the folder
# (hidden files, checkpoint directories) are skipped instead of being passed to
# the JSON reader, and the key order no longer depends on the order in which
# the filesystem happens to list the directory.
for p in sorted(path.glob('*.json')):
    # The variant name is the part of the file stem before the first
    # underscore, with dots replaced by underscores
    variant_name = p.stem.split('_')[0].replace('.', '_')
    variant_data_dict[variant_name] = srsly.read_json(p)

In [11]:
# Confirm that every variant file ended up as a key in the dictionary.
# Note: at this point the keys are Nextstrain clade names combined with the
# WHO VOC naming.
keysList = [*variant_data_dict]
print(keysList)

['20H_Beta_V2', '20I_Alpha_V1', '21A_Delta', '21I_Delta', '21J_Delta', '21K_Omicron', '21L_Omicron', '22A_Omicron', '22B_Omicron']


In [18]:
# Flatten the nested variant -> country dictionary into one long DataFrame.
# The per-country frames are collected first and concatenated in a single call;
# concatenating inside the loop re-copies all previously added rows on every
# iteration and starts from an empty frame, which recent pandas warns about.
country_frames = [
    pd.DataFrame(records).assign(country=country_name, nextclade_variant=variant)
    for variant, per_country in variant_data_dict.items()
    for country_name, records in per_country.items()
]
# Row labels are kept as they come from each per-country frame, so they
# restart for every country/variant combination.
df = pd.concat(country_frames)

# Proportion of the sequenced samples that belong to the variant
df['variant_prop'] = df['cluster_sequences'] / df['total_sequences']

In [14]:
# Let's create a new column that holds the more common variant names

In [31]:

# Define the function to categorise names
def variant_rename(variant_name):
    """Translate a Nextclade-style variant key into a shorter common label.

    Parameters
    ----------
    variant_name : str
        Variant key as stored in the ``nextclade_variant`` column,
        e.g. ``'21K_Omicron'``.

    Returns
    -------
    str
        The common label for a recognised key (e.g. ``'Omicron_BA1'``).
        A key that is not in the lookup table is returned unchanged, so an
        unexpected variant stays identifiable instead of being silently
        labelled as ``'Omicron_BA5'``.
    """
    common_names = {
        '20H_Beta_V2': 'Beta',
        '20I_Alpha_V1': 'Alpha',
        '21A_Delta': 'Delta_1',
        '21I_Delta': 'Delta_2',
        '21J_Delta': 'Delta_3',
        '21K_Omicron': 'Omicron_BA1',
        '21L_Omicron': 'Omicron_BA2',
        '22A_Omicron': 'Omicron_BA4',
        '22B_Omicron': 'Omicron_BA5',
    }
    # Fall back to the input itself rather than guessing a label
    return common_names.get(variant_name, variant_name)

# Build the 'variant' column by passing each value of the
# 'nextclade_variant' column through variant_rename
df['variant'] = df['nextclade_variant'].map(variant_rename)

In [32]:
# Display the combined frame (the rendered output is truncated to its first and last rows)
df

Unnamed: 0,week,total_sequences,cluster_sequences,country,nextclade_variant,variant_prop,variant
0,2020-04-27,162,0,South Africa,20H_Beta_V2,0.0,Beta
1,2020-05-11,108,0,South Africa,20H_Beta_V2,0.0,Beta
2,2020-05-25,120,0,South Africa,20H_Beta_V2,0.0,Beta
3,2020-06-08,190,0,South Africa,20H_Beta_V2,0.0,Beta
4,2020-06-22,340,0,South Africa,20H_Beta_V2,0.0,Beta
...,...,...,...,...,...,...,...
84,2023-09-25,140,0,Turkey,22B_Omicron,0.0,Omicron_BA5
85,2023-10-09,144,0,Turkey,22B_Omicron,0.0,Omicron_BA5
86,2023-10-23,92,0,Turkey,22B_Omicron,0.0,Omicron_BA5
87,2023-11-06,133,0,Turkey,22B_Omicron,0.0,Omicron_BA5


In [33]:
# Restrict the data to the countries of interest
countries = ['Malaysia', 'Philippines', 'Vietnam']

in_selected_countries = df['country'].isin(countries)
df_analysis = df[in_selected_countries]

In [34]:
# For every country/variant pair, keep the first two rows whose proportion is
# strictly above the threshold (rows are taken in their existing order)
threshold = 0.01

above_threshold = df_analysis['variant_prop'].gt(threshold)
emergence_df = df_analysis[above_threshold].groupby(['country', 'variant']).head(2)

In [38]:
# Show the emergence rows for Malaysia only
emergence_df[emergence_df['country'].eq('Malaysia')]

Unnamed: 0,week,total_sequences,cluster_sequences,country,nextclade_variant,variant_prop,variant
13,2020-11-23,41,1,Malaysia,20H_Beta_V2,0.02439,Beta
18,2021-02-01,87,2,Malaysia,20H_Beta_V2,0.022989,Beta
15,2020-12-21,36,1,Malaysia,20I_Alpha_V1,0.027778,Alpha
18,2021-02-01,87,1,Malaysia,20I_Alpha_V1,0.011494,Alpha
25,2021-05-10,80,4,Malaysia,21A_Delta,0.05,Delta_1
26,2021-05-24,132,4,Malaysia,21A_Delta,0.030303,Delta_1
16,2021-01-04,145,2,Malaysia,21I_Delta,0.013793,Delta_2
18,2021-02-01,87,1,Malaysia,21I_Delta,0.011494,Delta_2
16,2021-01-04,145,3,Malaysia,21J_Delta,0.02069,Delta_3
20,2021-03-01,57,7,Malaysia,21J_Delta,0.122807,Delta_3


In [None]:
## some old code 
# Combine variant dictionaries into single nested dictionary
variant_data_dict = {'Beta_20H':Beta_20H,
                    'Alpha_20I':Alpha_20I,
                     'Delta_21A':Delta_21A,
                     'Delta_21I':Delta_21I,
                     'Delta_21J': Delta_21J,
                     'Omicron_21K_BA1': Omicron_21K_BA1,
                     'Omicron_21L_BA2': Omicron_21L_BA2,
                     'Omicron_22A_BA4': Omicron_22A_BA4,
                     'Omicron_22B_BA5':Omicron_22B_BA5
                    