In [1]:
#!pip install dash
import dash
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import pandas as pd

Run Treecluster with different Threshold

| Use Case               | Recommended Threshold Range      |
|------------------------|----------------------------------|
| Outbreak detection     | 0.015–0.03                       |
| Species delimitation   | 0.02–0.05                        |
| Deep lineage grouping  | 0.05–0.1+                        |


To analyze the clusters later on and check the sublineages

In [2]:
#Run This Script in Terminal
'''Run this script in the terminal with the following command for help:
python TreeCluster.py -h
'''
#!TreeCluster.py -h

'''Example of how to run the script from the command line:
python TreeCluster.py -i /path/to/input/tree.nwk -o /path/to/output/cluster.txt -t 0.01
'''
#!TreeCluster.py -i /Users/MiladM-Dev/../tree.nwk -o /Users/MiladM-Dev/../trial0.01.txt -t 0.01

'Example of how to run the script from the command line:\npython TreeCluster.py -i /path/to/input/tree.nwk -o /path/to/output/cluster.txt -t 0.01\n'

Read files to prepare it for merging

In [3]:
metadata_tsv_file = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/metadata.tsv', sep='\t')

#seq_index_tsv_file view
#metadata_tsv_file

In [4]:
cluster015 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.015.txt', sep='\t')
cluster05 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.05.txt', sep='\t')
cluster03 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.03.txt', sep='\t')
cluster01 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.01.txt', sep='\t')
cluster1 = pd.read_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/trial0.1.txt', sep='\t')

In [5]:
#cluster015['ClusterNumber'].value_counts()



### describe the clusters data

In [6]:
'''number of clusters'''
cluster015['ClusterNumber'].nunique()#

76

In [7]:

cluster1.describe().loc[["count", "min", "max"]]
#cluster1.unique()


Unnamed: 0,ClusterNumber
count,3463.0
min,1.0
max,4.0


In [8]:
cluster01.describe().loc[["count", "min", "max"]]

Unnamed: 0,ClusterNumber
count,3463.0
min,-1.0
max,142.0


In [9]:
cluster03.describe().loc[["count", "min", "max"]]

Unnamed: 0,ClusterNumber
count,3463.0
min,-1.0
max,19.0


In [10]:
cluster05.describe().loc[["count", "min", "max"]]

Unnamed: 0,ClusterNumber
count,3463.0
min,-1.0
max,10.0


In [11]:
cluster015.describe().loc[["count", "min", "max"]]

Unnamed: 0,ClusterNumber
count,3463.0
min,-1.0
max,75.0


In [12]:
cluster_counts01 = cluster01['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts01.reset_index(inplace=True)
cluster_counts01.columns = ['ClusterNumber01', 'Count']

cluster_counts1 = cluster1['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts1.reset_index(inplace=True)
cluster_counts1.columns = ['ClusterNumber1', 'Count']

cluster_counts03 = cluster03['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts03.reset_index(inplace=True)  
cluster_counts03.columns = ['ClusterNumber03', 'Count']

cluster_counts05 = cluster05['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts05.reset_index(inplace=True)
cluster_counts05.columns = ['ClusterNumber05', 'Count']

cluster_counts015 = cluster015['ClusterNumber'].value_counts().to_frame(name='Count')
cluster_counts015.reset_index(inplace=True)
cluster_counts015.columns = ['ClusterNumber015', 'Count']

cluster_counts_combined = pd.concat([cluster_counts01, cluster_counts015, cluster_counts03, cluster_counts05,  cluster_counts1], axis=1)
cluster_counts_combined.to_csv('/Users/MiladM-Dev/Documents/1PhD/project-1-N450/project-1.1-gendata/Nextstrain/trial-1/results/clusters/cluster_counts_combined.csv', index=False)  


In [13]:
#rename
seq_index_tsv_file= seq_index_tsv_file.rename(columns={'strain': 'ID'})
metadata_tsv_file= metadata_tsv_file.rename(columns={'name': 'ID'})

NameError: name 'seq_index_tsv_file' is not defined

In [None]:
#merge

merged_df = pd.merge(metadata_tsv_file, seq_index_tsv_file , on='ID')
merged_df = merged_df.drop(['N', 'other_IUPAC', '-', '?', 'invalid_nucleotides'], axis=1)
merged_df

In [None]:
# Initialize the Dash app
app = dash.Dash(__name__)

# 1. Pie chart: Count and percentage of types
type_counts = merged_df['type'].value_counts().reset_index()
type_counts.columns = ['type', 'count']
type_counts['percentage'] = (type_counts['count'] / type_counts['count'].sum()) * 100

pie_chart = go.Figure(data=[go.Pie(
    labels=type_counts['type'],
    values=type_counts['count'],
    hoverinfo='label+percent+value',
    textinfo='label+percent+value'
)])

# Dash layout with pie chart and table
app.layout = html.Div([
    html.H1("Data Visualizations"),
    
    # Pie Chart
    dcc.Graph(
        id='pie-chart',

        figure=pie_chart.update_layout(width=650, height=650)
    ), 

    # Table displaying the counts and percentages
    html.H2("Pie Chart Data Table"),
    dash_table.DataTable(
        id='pie-table',
        columns=[
            {"name": "Type", "id": "type"},
            {"name": "Count", "id": "count"},
            {"name": "Percentage", "id": "percentage", "type": "numeric", "format": {"specifier": ".2f"}}
        ],
        data=type_counts.to_dict('records'),
        style_table={'width': '50%', 'margin': 'auto'},
        style_header={'fontWeight': 'bold', 'textAlign': 'center'},
        style_cell={'textAlign': 'center'},
    )
])

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)