# Analysis of presence of variants in phylogeny

## Setup

In [1]:
import pandas as pd
from ete3 import Tree

In [2]:
mydb = '/FastData/czirion/Crypto_Desjardins/results_2024-10-22/02.Dataset/database.db'
tree_path = '/FastData/czirion/Crypto_Desjardins/fungal_pop/data/CryptoDiversity_Desjardins_Tree.tre'

## Get variant presence

In [3]:
vars_presence_raw = pd.read_csv('/FastData/czirion/Crypto_Desjardins/fungal_pop/data/vars_presence.csv')

Separate unique and non unique variants

In [74]:
vars_num_strains = vars_presence_raw.groupby('var_id')['strain'].nunique().reset_index(name='num_strains')
unique_vars = vars_num_strains[vars_num_strains['num_strains'] == 1]
non_unique_vars = vars_num_strains[vars_num_strains['num_strains'] > 1]
len(unique_vars), len(non_unique_vars), len(vars_num_strains)

(356496, 844318, 1200814)

In [79]:
unique_vars_presence = vars_presence_raw[vars_presence_raw['var_id'].isin(unique_vars['var_id'])].copy()
unique_vars_presence['status'] = "unique"
vars_presence = vars_presence_raw[vars_presence_raw['var_id'].isin(non_unique_vars['var_id'])].copy()

Group the variants present in the same strains to simplify the analysis

In [80]:
grouped_vars = vars_presence.groupby('var_id')['strain'].apply(tuple).reset_index()
grouped_vars = grouped_vars.groupby('strain')['var_id'].apply(list).reset_index()
grouped_vars['grouped_var_id'] = ['group_' + str(i) for i in range(len(grouped_vars))]
grouped_vars

Unnamed: 0,strain,var_id,grouped_var_id
0,"(125.91, AD1-83a)",[var_VNI_201289],group_0
1,"(125.91, Bt161)",[var_VNI_190507],group_1
2,"(125.91, Bt2)",[var_VNI_26831],group_2
3,"(125.91, D17-1)",[var_VNI_28905],group_3
4,"(125.91, Gbc39-2, Gbc39-1)",[var_VNI_28981],group_4
...,...,...,...
255122,"(Ze90-1, PMHc1049.THER1.STOR, Muc387-2, Bt46, ...","[var_VNBII_304219, var_VNBII_304220, var_VNBII...",group_255122
255123,"(Ze90-1, PMHc1049.THER1.STOR, Muc387-2, Bt46, ...","[var_VNBII_304233, var_VNBII_304237, var_VNBII...",group_255123
255124,"(Ze90-1, PMHc1049.THER1.STOR, NRHc5008.ENR)","[var_VNBII_318521, var_VNBII_318523, var_VNBII...",group_255124
255125,"(Ze90-1, PMHc1049.THER1.STOR, PMHc1009.ENR, Bt...",[var_VNBII_336105],group_255125


Recreate the original dataframe with the grouped variant IDs

In [81]:
var_id_to_grouped_var_id = {}
for _, row in grouped_vars.iterrows():
    for var_id in row['var_id']:
        var_id_to_grouped_var_id[var_id] = row['grouped_var_id']
group_var_presence = vars_presence.copy()
group_var_presence['grouped_var_id'] = group_var_presence['var_id'].map(var_id_to_grouped_var_id)

group_var_presence

Unnamed: 0,var_id,strain,grouped_var_id
1,var_VNBII_288,PMHc1052.ENR.STOR,group_231054
2,var_VNBII_466,PMHc1052.ENR.STOR,group_244616
3,var_VNBII_502,PMHc1052.ENR.STOR,group_227880
4,var_VNBII_504,PMHc1052.ENR.STOR,group_240741
5,var_VNBII_680,PMHc1052.ENR.STOR,group_240731
...,...,...,...
19001933,var_VNBI_218816,Bt96,group_153543
19001934,var_VNBI_218817,Bt96,group_153543
19001935,var_VNBI_218819,Bt96,group_153678
19001936,var_VNBI_218821,Bt96,group_153656


Make the absence/presence matrix of each group of variants in each strain


In [82]:
groups_presence = group_var_presence.copy()
groups_presence.drop(columns=['var_id'], inplace=True)
groups_presence.drop_duplicates(inplace=True)
groups_presence['presence'] = "1"
groups_presence


Unnamed: 0,strain,grouped_var_id,presence
1,PMHc1052.ENR.STOR,group_231054,1
2,PMHc1052.ENR.STOR,group_244616,1
3,PMHc1052.ENR.STOR,group_227880,1
4,PMHc1052.ENR.STOR,group_240741,1
5,PMHc1052.ENR.STOR,group_240731,1
...,...,...,...
19001927,Bt96,group_153542,1
19001928,Bt96,group_153543,1
19001929,Bt96,group_153541,1
19001936,Bt96,group_153656,1


In [83]:
vars_matrix = groups_presence.pivot(index='grouped_var_id', columns='strain', values='presence').fillna("0")
vars_matrix['H99'] = "0"
vars_matrix

strain,125.91,8-1,A1-35-8,A1-84-14,A2-102-5,A3-1-1,A3-38-20,A4-1-12,A4-34-6,A5-35-17,...,Tu406-1,Tu416-1,Tu422-1,Ug2459,Ug2462,Ug2463,WM148,WM626,Ze90-1,H99
grouped_var_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
group_0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
group_1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
group_10,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
group_100,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
group_1000,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
group_99995,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
group_99996,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
group_99997,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
group_99998,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Identify the phylogenetic status of each group of variants

In [25]:
t = Tree(tree_path)

Add the presence/absence matrix to the phylogeny

In [27]:
for var in vars_matrix.index:
    for leaf in t:
        leaf.add_features(**{var: vars_matrix.loc[var, leaf.name]})        

Check the phylogenetic status of each group of variants

In [28]:
status_dict = {}
for var in vars_matrix.index:
    status = t.check_monophyly(values="1", target_attr=var)
    status_dict[var] = status[1]

In [29]:
status_df = pd.DataFrame.from_dict(status_dict, orient='index', columns=['status']) 
status_df   

Unnamed: 0,status
group_0,polyphyletic
group_1,polyphyletic
group_10,polyphyletic
group_100,polyphyletic
group_1000,polyphyletic
...,...
group_99995,polyphyletic
group_99996,polyphyletic
group_99997,polyphyletic
group_99998,polyphyletic


Add the status to the original presence dataframe

In [84]:
variant_status = group_var_presence.merge(status_df, left_on='grouped_var_id', right_index=True)
variant_status.drop(columns=['grouped_var_id'], inplace=True)
variant_status = pd.concat([variant_status, unique_vars_presence])
variant_status.to_csv('/FastData/czirion/Crypto_Desjardins/fungal_pop/data/variant_status.csv')
variant_status

Unnamed: 0,var_id,strain,status
1,var_VNBII_288,PMHc1052.ENR.STOR,polyphyletic
1947,var_VNBII_14675,PMHc1052.ENR.STOR,polyphyletic
10765,var_VNBII_62267,PMHc1052.ENR.STOR,polyphyletic
10770,var_VNBII_62294,PMHc1052.ENR.STOR,polyphyletic
10819,var_VNBII_62553,PMHc1052.ENR.STOR,polyphyletic
...,...,...,...
18998833,var_VNBI_195687,Bt96,unique
18998844,var_VNBI_195913,Bt96,unique
18998848,var_VNBI_195946,Bt96,unique
18998849,var_VNBI_195948,Bt96,unique


Number of variants with each status

In [65]:
unique_vars = variant_status[variant_status['status'] == 'unique']['var_id'].nunique()
mono_vars = variant_status[variant_status['status'] == 'monophyletic']['var_id'].nunique()
para_vars = variant_status[variant_status['status'] == 'paraphyletic']['var_id'].nunique()
poly_vars = variant_status[variant_status['status'] == 'polyphyletic']['var_id'].nunique()
print(f"Unique variants: {unique_vars}")
print(f"Monophyletic variants: {mono_vars}")
print(f"Paraphyletic variants: {para_vars}")
print(f"Polyphyletic variants: {poly_vars}")
print(f"Total non-unique variants: {mono_vars + para_vars + poly_vars}")
print(f"Total variants: {mono_vars + para_vars + poly_vars + unique_vars}")


Unique variants: 356496
Monophyletic variants: 175688
Paraphyletic variants: 14305
Polyphyletic variants: 654325
Total non-unique variants: 844318
Total variants: 1200814


## Testing

In [None]:
t =  Tree("((((((a, e), i), o),h), u), ((f, g), j));")
print(t)
print(t.check_monophyly(values=["a", "e", "i", "o", "u"], target_attr="name", ))
print(t.check_monophyly(values=["a", "e", "i", "o"], target_attr="name"))
print(t.check_monophyly(values=["i", "o"], target_attr="name"))



                  /-a
               /-|
            /-|   \-e
           |  |
         /-|   \-i
        |  |
      /-|   \-o
     |  |
   /-|   \-h
  |  |
  |   \-u
--|
  |      /-f
  |   /-|
   \-|   \-g
     |
      \-j
(False, 'polyphyletic', {Tree node 'h' (0x73aeac2e47f)})
(True, 'monophyletic', set())
(False, 'paraphyletic', {Tree node 'e' (-0x7ffff8c4fb3fe5af), Tree node 'a' (0x73b04c01a4e)})


In [None]:
t =  Tree("((((((4, e), i), o),h), u), ((3, 4), (i, june)));")
# we annotate the tree using external data
colors = {"a":"red", "e":"green", "i":"yellow",
          "o":"black", "u":"purple", "4":"green",
          "3":"yellow", "1":"white", "5":"red",
          "june":"yellow"}
for leaf in t:
    leaf.add_features(color=colors.get(leaf.name, "none"))
print(t.get_ascii(attributes=["name", "color"], show_internal=False))

print("Green-yellow clusters:")
# And obtain clusters exclusively green and yellow
for node in t.get_monophyletic(values=["green", "yellow"], target_attr="color"):
   print(node.get_ascii(attributes=["color", "name"], show_internal=False))

In [None]:
import pandas as pd
import numpy as np

# Create a mock dataframe
mock_data = {
    'var_id': [f'var_{i}' for i in range(1, 11)],
    'a': np.random.choice([0, 1], size=10),
    'b': np.random.choice([0, 1], size=10),
    'c': np.random.choice([0, 1], size=10),
    'd': np.random.choice([0, 1], size=10),
    'e': np.random.choice([0, 1], size=10),
    'f': np.random.choice([0, 1], size=10),
    'g': np.random.choice([0, 1], size=10),
    'h': np.random.choice([0, 1], size=10),
    'i': np.random.choice([0, 1], size=10),
    'j': np.random.choice([0, 1], size=10)
}

mock_df = pd.DataFrame(mock_data)
# Convert integers to strings in the mock dataframe
mock_df = mock_df.astype(str)
mock_df.set_index('var_id', inplace=True)
print(mock_df)


In [None]:
t =  Tree("((((((a, b), c), d),e), f), ((g, h), (i, j)));")


for var in mock_df.index:
    for leaf in t:
        leaf.add_features(**{var: mock_df.loc[var, leaf.name]})
    status = t.check_monophyly(values="1", target_attr=var)
    status_dict = {var: status[1]}

    print(status_dict)        
        
