In [1]:
#!pip install dash
import dash
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import pandas as pd

Run Treecluster with different Threshold

| Use Case               | Recommended Threshold Range      |
|------------------------|----------------------------------|
| Outbreak detection     | 0.015–0.03                       |
| Species delimitation   | 0.02–0.05                        |
| Deep lineage grouping  | 0.05–0.1+                        |


To analyze the clusters later on and check the sublineages

In [2]:
#Run This Script in Terminal
'''Run this script in the terminal with the following command for help:
python TreeCluster.py -h
'''
#!TreeCluster.py -h

'''Example of how to run the script from the command line:
python TreeCluster.py -i /path/to/input/tree.nwk -o /path/to/output/cluster.txt -t 0.01
'''
#!TreeCluster.py -i /Users/MiladM-Dev/../tree.nwk -o /Users/MiladM-Dev/../trial0.01.txt -t 0.01

'Example of how to run the script from the command line:\npython TreeCluster.py -i /path/to/input/tree.nwk -o /path/to/output/cluster.txt -t 0.01\n'

Read files to prepare it for merging

In [3]:
metadata_tsv_file = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/metadata.tsv', sep='\t')

#seq_index_tsv_file view
#metadata_tsv_file

In [4]:
cluster001 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.001.txt', sep='\t')
cluster015 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.015.txt', sep='\t')
cluster05 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.05.txt', sep='\t')
cluster03 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.03.txt', sep='\t')
cluster01 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.01.txt', sep='\t')
cluster1 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.1.txt', sep='\t')

cluster01= cluster01.rename(columns={'SequenceName': 'ID'})
cluster05= cluster05.rename(columns={'SequenceName': 'ID'})
cluster03= cluster03.rename(columns={'SequenceName': 'ID'})
cluster001= cluster001.rename(columns={'SequenceName': 'ID'})
cluster015= cluster015.rename(columns={'SequenceName': 'ID'})
cluster1= cluster1.rename(columns={'SequenceName': 'ID'})

### describe the clusters data

In [5]:
'''number of clusters'''
cluster001['ClusterNumber'].nunique()#

714

In [6]:

cluster001.describe().loc[["count", "min", "max"]]
#cluster1.unique()


Unnamed: 0,ClusterNumber
count,3463.0
min,-1.0
max,713.0


In [7]:
cluster_counts01 = cluster01['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts01.reset_index(inplace=True)
cluster_counts01.columns = ['ClusterNumber01', 'Count']

cluster_counts1 = cluster1['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts1.reset_index(inplace=True)
cluster_counts1.columns = ['ClusterNumber1', 'Count']

cluster_counts03 = cluster03['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts03.reset_index(inplace=True)  
cluster_counts03.columns = ['ClusterNumber03', 'Count']

cluster_counts05 = cluster05['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts05.reset_index(inplace=True)
cluster_counts05.columns = ['ClusterNumber05', 'Count']

cluster_counts015 = cluster015['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts015.reset_index(inplace=True)
cluster_counts015.columns = ['ClusterNumber015', 'Count']

cluster_counts001 = cluster001['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts001.reset_index(inplace=True)
cluster_counts001.columns = ['ClusterNumber001', 'Count']

cluster_counts_combined = pd.concat([cluster_counts001, cluster_counts01, cluster_counts01, cluster_counts015, cluster_counts03, cluster_counts05,  cluster_counts1], axis=1)
cluster_counts_combined.to_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/cluster_counts_combined.csv', index=False)  


#### Create metadata CSV include:
1. Aligned Sequence
2. Location
3. Cluster number
4. IDs
5. years of sampling

Add Sequence:

In [8]:
"""Convert Fasta to DataFrame by Biopython"""
from Bio import SeqIO

fasta_file = '/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/aligned.fasta'
records = [{"ID": record.id, "seq": str(record.seq)} for record in SeqIO.parse(fasta_file, "fasta")]

# Convert to DataFrame
df_fasta = pd.DataFrame(records)

metadata_tsv_file = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/metadata.tsv', sep='\t')
metadata_tsv_file.rename(columns={'SequenceName': 'ID'}, inplace=True)

"""de_filtered but I lost the last version used for this sequence"""
df_de = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/de_filtered.csv')
df_de.rename(columns={'ID_xsl_prefix': 'ID'}, inplace=True)
df_de_filtered = df_de[['ID', 'BLand']]
df_de_filtered


Unnamed: 0,ID,BLand
0,00-00022,Nordrhein-Westfalen
1,00-00035,Nordrhein-Westfalen
2,00-00043,Bayern
3,00-00049,Bayern
4,00-00061,Bayern
...,...,...
3643,24-01146,Nordrhein-Westfalen
3644,24-01149,Nordrhein-Westfalen
3645,24-01150,Hessen
3646,24-01151,Hessen


In [9]:
#merge

merge_1 = pd.merge(cluster01, df_fasta , on='ID')
merge_2 = pd.merge(merge_1, metadata_tsv_file, on='ID')
merge_3 = pd.merge(merge_2, df_de_filtered, on='ID', how='left')
merge_3 = merge_3.rename(columns={'ID': 'SequenceName'})
merge_3.to_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/merged_metadata.csv', index=False)

Add: Genotypes and dates

In [10]:
### Split DataFrame by Genotypes

In this section, we will split the DataFrame into subsets based on the genotypes present in the data.

SyntaxError: invalid syntax (3013603512.py, line 3)

In [None]:
D8 = merge_3[merge_3['type'] == 'D8']
B3 = merge_3[merge_3['type'] == 'B3']
B3['SequenceName'] = B3['SequenceName'] + '_B3_' + B3['ClusterNumber'].astype(str)
D8['SequenceName'] = D8['SequenceName'] + '_D8_' + D8['ClusterNumber'].astype(str)
D8.to_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/metadata_D8.csv', index=False)
B3.to_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/metadata_B3.csv', index=False)
B3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  B3['SequenceName'] = B3['SequenceName'] + '_B3_' + B3['ClusterNumber'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  D8['SequenceName'] = D8['SequenceName'] + '_D8_' + D8['ClusterNumber'].astype(str)


Unnamed: 0,SequenceName,ClusterNumber,seq,type,date,BLand
461,12-00245_B3_-1,-1,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2012-06-21,Bayern
462,06-00057_B3_10,10,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2006-02-20,Baden-Württemberg
463,06-00058_B3_10,10,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2006-02-20,Baden-Württemberg
464,07-00450_B3_10,10,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2007-06-27,Baden-Württemberg
465,06-00054_B3_10,10,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2006-02-17,Baden-Württemberg
...,...,...,...,...,...,...
2794,23-00184_B3_103,103,---GTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2023-03-20,Bayern
2795,18-00040_B3_103,103,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2018-01-10,Baden-Württemberg
2796,11-00756_B3_103,103,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2011-08-02,Bayern
2797,11-00742_B3_103,103,AAGGTCAGTTCCACATTGGCATCTGAACTCGGTATCACTGCCGAGG...,B3,2011-07-21,Bayern


In [2]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

D8 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/metadata_D8.csv')
B3 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/metadata_B3.csv')

# Filter sequences based on ClusterNumber
B3_notclustered = B3[B3['ClusterNumber'] == -1]
D8_notclustered = D8[D8['ClusterNumber'] == -1]

# Filter sequences based on date range
B3_20_24 = B3[(B3['date'] >= '2020-01-01') & (B3['date'] <= '2024-12-31')]
D8_20_24 = D8[(D8['date'] >= '2020-01-01') & (D8['date'] <= '2024-12-31')]

# Filter sequences based on ClusterNumber, including only those with ClusterNumber > 0, and get representative sequences
B3_rep = B3[B3['ClusterNumber'] > 0]
B3_rep = B3_rep.drop_duplicates(subset=['ClusterNumber'], keep='first')
D8_rep = D8[D8['ClusterNumber'] > 0]
D8_rep = D8_rep.drop_duplicates(subset=['ClusterNumber'], keep='first')

B3_rep_notclustered = pd.concat([B3_notclustered, B3_rep], ignore_index=True)
D8_rep_notclustered = pd.concat([D8_notclustered, D8_rep], ignore_index=True)


# Create SeqRecord objects for each sequence in the metadata
def create_seq_record(df):
    records = []
    for _, row in df.iterrows():
        seq = Seq(row['seq'])
        record = SeqRecord(seq, id=row['SequenceName'], description="")
        records.append(record)
    return records

create_seq_record_B3 = create_seq_record(B3)
create_seq_record_B3_20_24 = create_seq_record(B3_20_24)
create_seq_record_D8_20_24 = create_seq_record(D8_20_24)
create_seq_record_B3_notclustered = create_seq_record(B3_notclustered)
create_seq_record_D8_notclustered = create_seq_record(D8_notclustered)
create_seq_record_D8 = create_seq_record(D8)
create_seq_record_D8_rep = create_seq_record(D8_rep)
create_seq_record_B3_rep = create_seq_record(B3_rep)
create_seq_record_B3_rep_notclustered = create_seq_record(B3_rep_notclustered)
create_seq_record_D8_rep_notclustered = create_seq_record(D8_rep_notclustered)

# Write the sequences to a FASTA file
output_fasta_file = '/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/aligned_'
SeqIO.write(create_seq_record_B3, output_fasta_file +'B3.fasta', "fasta")
SeqIO.write(create_seq_record_D8, output_fasta_file +'D8.fasta', "fasta")
SeqIO.write(create_seq_record_B3_20_24, output_fasta_file +'B3_20_24.fasta', "fasta")
SeqIO.write(create_seq_record_D8_20_24, output_fasta_file +'D8_20_24.fasta', "fasta")
SeqIO.write(create_seq_record_B3_notclustered, output_fasta_file +'B3_notclustered.fasta', "fasta")
SeqIO.write(create_seq_record_D8_notclustered, output_fasta_file +'D8_notclustered.fasta', "fasta")
SeqIO.write(create_seq_record_D8_rep, output_fasta_file +'D8_rep.fasta', "fasta")
SeqIO.write(create_seq_record_B3_rep, output_fasta_file +'B3_rep.fasta', "fasta")
SeqIO.write(create_seq_record_B3_rep_notclustered, output_fasta_file +'B3_rep_notclustered.fasta', "fasta")
SeqIO.write(create_seq_record_D8_rep_notclustered, output_fasta_file +'D8_rep_notclustered.fasta', "fasta")

134

In [17]:
# Split into a dictionary of DataFrames keyed by cluster number
B3_by_cluster = {cluster: group for cluster, group in B3.groupby('ClusterNumber')}
D8_by_cluster = {cluster: group for cluster, group in D8.groupby('ClusterNumber')}
# For example, DataFrame for cluster 0
#df_cluster_0 = dfs_by_cluster[0]

# To view all:
for cluster_id, df_cluster in B3_by_cluster.items():
    print(f"Cluster {cluster_id}:")
    create_seq_record(df_cluster)
    df_cluster.to_csv(f'/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/B3_cluster_{cluster_id}.csv', index=False)
    SeqIO.write(create_seq_record(df_cluster), f'/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/B3_cluster_{cluster_id}.fasta', "fasta")

for cluster_id, df_cluster in D8_by_cluster.items():
    print(f"Cluster {cluster_id}:")
    create_seq_record(df_cluster)
    df_cluster.to_csv(f'/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/D8_cluster_{cluster_id}.csv', index=False)
    SeqIO.write(create_seq_record(df_cluster), f'/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/data/D8_cluster_{cluster_id}.fasta', "fasta")

    

Cluster -1:
Cluster 10:
Cluster 14:
Cluster 15:
Cluster 19:
Cluster 20:
Cluster 21:
Cluster 22:
Cluster 23:
Cluster 24:
Cluster 25:
Cluster 26:
Cluster 27:
Cluster 40:
Cluster 41:
Cluster 103:
Cluster -1:
Cluster 17:
Cluster 18:
Cluster 35:
Cluster 36:
Cluster 37:
Cluster 38:
Cluster 39:
Cluster 47:
Cluster 48:
Cluster 49:
Cluster 50:
Cluster 51:
Cluster 52:
Cluster 53:
Cluster 54:
Cluster 55:
Cluster 56:
Cluster 57:
Cluster 58:
Cluster 59:
Cluster 72:
Cluster 73:
Cluster 74:
Cluster 75:
Cluster 76:
Cluster 77:
Cluster 78:
Cluster 82:
Cluster 83:
Cluster 84:
Cluster 85:
Cluster 86:
Cluster 87:
Cluster 88:
Cluster 89:
Cluster 90:
Cluster 91:
Cluster 92:
Cluster 93:
Cluster 96:
Cluster 97:
Cluster 98:
Cluster 99:
Cluster 100:
Cluster 101:
Cluster 102:
Cluster 104:
Cluster 105:
Cluster 106:
Cluster 109:
Cluster 112:
Cluster 113:
Cluster 114:
Cluster 115:
Cluster 116:
Cluster 117:
Cluster 118:
Cluster 119:
Cluster 120:
Cluster 121:
Cluster 122:
Cluster 123:
Cluster 124:
Cluster 125:
Cluste