# In [22]:
import os
import matplotlib.pyplot as plt
from Bio import AlignIO
from Bio.AlignIO.PhylipIO import RelaxedPhylipWriter
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

import pandas as pd
from plotly import graph_objects as go
from plotly.subplots import make_subplots



# Ordered alphabet of state symbols: a symbol's position in this list is its
# state index (digits first, then upper-case letters, then punctuation).
states = list(
    "0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "!\"#$%&'()*+"
    ",/:;<=>@[\\]^_{|}~"
)

# Lookup table from symbol to state index.
idx_for_state = {symbol: position for position, symbol in enumerate(states)}
# '-' and '?' are not part of the alphabet; both are mapped to -1.
idx_for_state.update({'-': -1, '?': -1})

def determine_num_states(data_type, model):
    """Count the states used by every alignment in one alignment directory.

    Every regular file in ``alignments/<data_type>/<model>`` is read as a
    relaxed-PHYLIP alignment. Symbols are translated through the module-level
    ``idx_for_state`` table and the result is incremented by one, so a symbol
    with index ``k`` counts as ``k + 1`` states and the symbols mapped to -1
    count as 0.

    Files that cannot be read as an alignment are reported by printing their
    name and are skipped.

    Args:
        data_type: Name of the sub-directory below ``alignments``.
        model: Name of the sub-directory below ``alignments/<data_type>``.

    Returns:
        A tuple ``(row_num_states, column_num_states, align_num_states)`` of
        dicts keyed by file name:

        * ``row_num_states``: list with the largest count of each row.
        * ``column_num_states``: list with the largest count of each column.
        * ``align_num_states``: largest count in the whole alignment.

    Raises:
        KeyError: If a sequence contains a symbol missing from
            ``idx_for_state``.
    """
    directory = "alignments/" + data_type + "/" + model
    align_num_states = {}
    row_num_states = {}
    column_num_states = {}
    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            try:
                align = AlignIO.read(entry.path, "phylip-relaxed")
            except Exception:
                # Best effort: name the unreadable file and move on.
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                print(entry.name)
                continue
            rows = [str(record.seq) for record in align]
            row_maxima = [
                max(idx_for_state[char] + 1 for char in row) for row in rows
            ]
            row_num_states[entry.name] = row_maxima
            align_num_states[entry.name] = max(row_maxima)
            # Transpose the rows to walk the alignment column by column
            # (all rows of an alignment share one length).
            column_num_states[entry.name] = [
                max(idx_for_state[char] + 1 for char in column)
                for column in zip(*rows)
            ]
    return (row_num_states, column_num_states, align_num_states)
            
            
def write_num_states(data_type, model):
    """Write the per-alignment maximum state count to a CSV file.

    The counts are the third element returned by
    ``determine_num_states(data_type, model)``. They are written to
    ``temp/<data_type>/max_states.csv`` with the header ``name,states`` and
    one ``<file name>,<count>`` line per alignment.

    Note that the output path does not contain ``model``, so runs for
    different models of the same data type write to the same file.

    Args:
        data_type: Forwarded to ``determine_num_states``; also selects the
            output directory below ``temp``.
        model: Forwarded to ``determine_num_states``.
    """
    (_, _, num_states) = determine_num_states(data_type, model)
    # The context manager closes (and therefore flushes) the file on exit, so
    # the CSV is complete on disk before anything reads it back.
    with open("temp/" + data_type + "/max_states.csv", 'w') as outfile:
        outfile.write("name,states\n")
        for (name, num) in num_states.items():
            outfile.write(name + "," + str(num) + "\n")
        
def write_avg_col_states(data_type, model):
    """Write the per-alignment average column state count to a CSV file.

    For every alignment, the per-column counts (second element returned by
    ``determine_num_states(data_type, model)``) are averaged. The averages
    are written to ``temp/<data_type>/avg_col_states.csv`` with the header
    ``name,avg_states`` and one ``<file name>,<average>`` line per alignment.

    Note that the output path does not contain ``model``, so runs for
    different models of the same data type write to the same file.

    Args:
        data_type: Forwarded to ``determine_num_states``; also selects the
            output directory below ``temp``.
        model: Forwarded to ``determine_num_states``.
    """
    (_, col_num_states, _) = determine_num_states(data_type, model)
    # The context manager closes (and therefore flushes) the file on exit, so
    # the CSV is complete on disk before anything reads it back.
    with open("temp/" + data_type + "/avg_col_states.csv", 'w') as outfile:
        outfile.write("name,avg_states\n")
        for (name, col_states) in col_num_states.items():
            avg = sum(col_states) / len(col_states)
            outfile.write(name + "," + str(avg) + "\n")
        
def x_values(df, col, eliminate_outlier):
    """Return the values of ``df[col]`` that lie inside a closed range.

    With ``eliminate_outlier`` set, the range runs from the 0.1 quantile to
    the 0.9 quantile of the column; otherwise it runs from the column's
    minimum to its maximum. Rows outside the range are dropped; the
    remaining values keep their original index.
    """
    series = df[col]
    if eliminate_outlier:
        bounds = (series.quantile(0.1), series.quantile(0.9))
    else:
        bounds = (series.min(), series.max())
    inside = series.between(*bounds)
    return series[inside]
        
def read_and_plot_num_states(data_type, model):
    """Plot histograms of the state counts stored in the two CSV summaries.

    Reads ``temp/<data_type>/max_states.csv`` and
    ``temp/<data_type>/avg_col_states.csv``, attaches each alignment's
    ``avg_states`` value to its row of the first table (matched on ``name``),
    and adds one histogram trace per column to a single subplot:
    ``states`` as "n_max" and ``avg_states`` as "n_avg". Both columns are
    filtered through ``x_values(..., True)`` before plotting.

    Args:
        data_type: Selects the directory below ``temp`` to read from.
        model: Not used by this function.

    Returns:
        The plotly figure holding both histogram traces.

    Raises:
        KeyError: If a name in ``max_states.csv`` has no row in
            ``avg_col_states.csv``.
    """
    df = pd.read_csv("temp/" + data_type + "/max_states.csv")
    df_avg = pd.read_csv("temp/" + data_type + "/avg_col_states.csv")

    # Plain dict lookup (rather than Series.map) keeps the KeyError for a
    # name that is missing from the averages file.
    avg_by_name = dict(zip(df_avg["name"], df_avg["avg_states"]))
    df["avg_states"] = [avg_by_name[name] for name in df["name"]]

    fig = make_subplots(
        rows=1,
        cols=1,
        subplot_titles=["Number of values for characteristics"]
    )

    # add_trace(row=, col=) replaces the deprecated append_trace.
    for column, trace_name in (("states", "n_max"), ("avg_states", "n_avg")):
        fig.add_trace(
            go.Histogram(
                x=x_values(df, column, True),
                name=trace_name,
                showlegend=True
            ),
            row=1,
            col=1
        )
    return fig


# Notebook driver: select the data set, regenerate both CSV summaries, then
# build the histogram figure from them.
data_type, model = "lang", "multi"

# Both files must be written before read_and_plot_num_states reads them.
write_avg_col_states(data_type, model)
write_num_states(data_type, model)

figure = read_and_plot_num_states(data_type, model)
# Kept as the final expression so its value is the styled figure.
figure.update_layout(template="plotly_white", height=500)