In [1]:
import pandas as pd
import json
from pathlib import Path
from utils import sanitize_names
from taxonomy import get_lineage
import pprint
import numpy as np

In [7]:
def create_virus_metadata(virus_dir: Path):
    """Read ``virus.json`` from ``virus_dir`` and run it through ``sanitize_names``.

    Args:
        virus_dir: Directory that contains a ``virus.json`` file.

    Returns:
        The result of ``sanitize_names(<parsed JSON>, virus=True)``.
    """
    with virus_dir.joinpath('virus.json').open() as handle:
        raw_records = json.load(handle)
        return sanitize_names(raw_records, virus=True)


def create_host_metadata(host_dir: Path):
    """Read ``host.json`` from ``host_dir`` and run it through ``sanitize_names``.

    Args:
        host_dir: Directory that contains a ``host.json`` file.

    Returns:
        The result of ``sanitize_names(<parsed JSON>, virus=False)``.
    """
    with host_dir.joinpath('host.json').open() as handle:
        raw_records = json.load(handle)
        return sanitize_names(raw_records, virus=False)


def create_dict_by_rank(data, rank_to_search):
    """Index lineage mappings by the taxon name found at ``rank_to_search``.

    For every record in ``data`` the ``'lineage_names'`` entry is replaced
    (in place) by the list of values returned by
    ``get_lineage(record, 'lineage_names')``.  Records whose
    ``'lineage_ranks'`` list contains ``rank_to_search`` contribute one entry:
    the name at that rank maps to a ``{rank: name}`` dict built by zipping the
    record's ranks with its names.  Later records overwrite earlier ones that
    share the same name.

    Args:
        data: Mapping whose values are dicts with ``'lineage_ranks'`` and
            ``'lineage_names'`` entries. NOTE: the values are mutated.
        rank_to_search: Rank label to look for in ``'lineage_ranks'``.

    Returns:
        dict mapping taxon name at ``rank_to_search`` -> ``{rank: name}``.
    """
    by_rank_name = {}
    for record in data.values():
        # get_lineage is used as a mapping here; only its values are kept.
        names = list(get_lineage(record, 'lineage_names').values())
        record['lineage_names'] = names
        ranks = record['lineage_ranks']
        if rank_to_search not in ranks:
            continue
        position = ranks.index(rank_to_search)
        rank_to_name = dict(zip(ranks, names))
        by_rank_name[names[position]] = rank_to_name
    return by_rank_name


# TODO (from the original author): this could be made more flexible.
def test(col, master_virus_dict, tax_level, map_dict=None):
    """Build cumulative top-N hit flags for one prediction column.

    The column is sorted in descending order and truncated to its first 10
    entries.  For every N up to that length, ``top<N>`` is 1 when the true
    host's name at ``tax_level`` occurs among the first N index labels and 0
    otherwise.  The column name, the kept index, the true lineage and the
    resulting dict are printed along the way.

    Args:
        col: pandas Series; its ``name`` is used as the key into
            ``master_virus_dict`` and its index holds the predicted labels.
        master_virus_dict: Mapping whose ``[col.name]['host']`` entry is
            passed to ``get_lineage``.
        tax_level: Key looked up in the lineage (and in ``map_dict`` entries).
        map_dict: Optional mapping ``label -> {rank: name}``; when given, the
            index labels are replaced by ``map_dict[label][tax_level]`` before
            the comparison.

    Returns:
        dict ``{'top1': 0|1, 'top2': 0|1, ...}``.
    """
    top_hits = col.sort_values(ascending=False)[:10]
    print(top_hits.name)
    print(top_hits.index)
    true_lineage = get_lineage(master_virus_dict[top_hits.name]['host'], 'lineage_names')
    print(true_lineage)

    if map_dict is not None:
        top_hits.index = [map_dict[label][tax_level] for label in top_hits.index]

    labels = top_hits.index
    flags = {}
    for position in range(len(labels)):
        hit = true_lineage[tax_level] in labels[:position + 1]
        flags[f'top{position + 1}'] = 1 if hit else 0
    print(flags)
    return flags


def distribution(col, master_virus_dict, tax_level, map_dict=None):
    """Report whether the single best prediction matches the true host taxon.

    The column is sorted in descending order and only its first entry is
    kept.  The result is keyed by the true host's name at ``tax_level`` and
    holds 1 when that name equals the (optionally remapped) top label, else 0.
    An empty column yields an empty dict.  Intermediate values are printed.

    Args:
        col: pandas Series; its ``name`` is used as the key into
            ``master_virus_dict`` and its index holds the predicted labels.
        master_virus_dict: Mapping whose ``[col.name]['host']`` entry is
            passed to ``get_lineage``.
        tax_level: Key looked up in the lineage (and in ``map_dict`` entries).
        map_dict: Optional mapping ``label -> {rank: name}`` used to translate
            the index labels to ``tax_level`` before comparing.

    Returns:
        dict ``{<true name at tax_level>: 0|1}``.
    """
    top_hit = col.sort_values(ascending=False)[:1]
    print(top_hit.name)
    print(top_hit.index)
    true_lineage = get_lineage(master_virus_dict[top_hit.name]['host'], 'lineage_names')
    print(true_lineage)

    if map_dict is not None:
        top_hit.index = [map_dict[label][tax_level] for label in top_hit.index]

    labels = top_hit.index
    outcome = {}
    for position in range(len(labels)):
        matched = true_lineage[tax_level] in labels[:position + 1]
        outcome[true_lineage[tax_level]] = 1 if matched else 0
    print(outcome)
    return outcome


def distribution_nocheck(col, master_virus_dict, tax_level, map_dict=None):
    """Record the single best predicted taxon without checking correctness.

    The column is sorted in descending order and only its first entry is
    kept.  With ``map_dict`` given, the top label is translated through
    ``map_dict[label][tax_level]`` and returned as ``{<predicted name>: 1}``.

    NOTE(review): when ``map_dict`` is None the function does not skip the
    check - it returns ``{<true name at tax_level>: 0|1}`` exactly like
    ``distribution`` does.  Confirm whether that branch is intended.

    Args:
        col: pandas Series; its ``name`` is used as the key into
            ``master_virus_dict`` and its index holds the predicted labels.
        master_virus_dict: Mapping whose ``[col.name]['host']`` entry is
            passed to ``get_lineage``.
        tax_level: Key looked up in the lineage (and in ``map_dict`` entries).
        map_dict: Optional mapping ``label -> {rank: name}``.

    Returns:
        dict with at most one entry (empty for an empty column).
    """
    top_hit = col.sort_values(ascending=False)[:1]
    print(top_hit.name)
    print(top_hit.index)
    true_lineage = get_lineage(master_virus_dict[top_hit.name]['host'], 'lineage_names')
    print(true_lineage)

    outcome = {}
    if map_dict is None:
        labels = top_hit.index
        for position in range(len(labels)):
            matched = true_lineage[tax_level] in labels[:position + 1]
            outcome[true_lineage[tax_level]] = 1 if matched else 0
    else:
        top_hit.index = [map_dict[label][tax_level] for label in top_hit.index]
        for _ in top_hit.index:
            # The key is always the first (best) remapped label.
            outcome[top_hit.index[0]] = 1
    print(outcome)
    return outcome


In [8]:
# Load the sanitized virus / host metadata from hard-coded local-drive paths.
meta_vir = create_virus_metadata(Path('X:/edwards2016/virus/'))
meta_host = create_host_metadata(Path('X:/edwards2016/host/'))
# NOTE(review): the species-level frame read here is replaced by the family-level
# read on the next line, so only the family predictions are used below - confirm.
df = pd.read_csv('../phdna_proper_opt/species_noise_04-07-2023/predictions.csv', index_col=0)
df = pd.read_csv('../phdna_proper_opt/fam_noise_05-07-2023.best_classifier/predictions.csv', index_col=0)
df.fillna(0, inplace=True)
# set type of float for all columns
# df = df.astype(float)
print(len(df.columns))
# get top 5 rows from each column - each column is sorted descending value
# top_5 = df.apply(lambda x: x.sort_values(ascending=False))
# Lookup: family name -> {rank: name}; used below to translate predicted labels to 'class'.
map_dict = create_dict_by_rank(meta_host, 'family')
print(map_dict)
pprint.pprint(meta_vir)
# Each apply() runs the helper once per column of df; the helpers use the column
# name as the key into meta_vir.
# NOTE(review): df was read from the family-level predictions file while tax_level is
# 'species' and no map_dict is passed - confirm the index labels are comparable.
top_5 = df.apply(test, args=(meta_vir, 'species', None))
dist = df.apply(distribution, args=(meta_vir, 'class', map_dict))
dist_nocheck = df.apply(distribution_nocheck, args=(meta_vir, 'class', map_dict))
# Expand the per-column dicts into frames and transpose: rows are the dict keys
# (top1..topN, or class names), columns are the original df columns.
df_top_5 = pd.DataFrame(top_5.to_list(), index=top_5.index).T
df_dist = pd.DataFrame(dist.to_list(), index=dist.index).T
df_dist_nocheck = pd.DataFrame(dist_nocheck.to_list(), index=dist_nocheck.index).T
# Row sums give the number of hits per top-N; correct_percent is a fraction here (not x100).
df_top_5['correct'] = df_top_5.sum(axis=1)
df_top_5['correct_percent'] = df_top_5['correct'] / len(df.columns)
# 'all': number of non-null cells per class row, i.e. columns whose true class is this row.
df_dist['all'] = df_dist.count(axis=1)
# 'Poprawni gospodarze' (Polish: "correct hosts"): share of each class among all true hosts, in %.
df_dist['Poprawni gospodarze'] = (df_dist['all'] / df_dist['all'].sum(axis=0)) * 100
# 'all_predicted': number of non-null cells per row of the no-check frame (top prediction per column).
df_dist_nocheck_counts = df_dist_nocheck.count(axis=1).to_frame()
df_dist_nocheck_counts.columns = ['all_predicted']
# Column-wise concat aligns on the row index; rows present in only one frame get NaN.
df_dist = pd.concat([df_dist, df_dist_nocheck_counts], axis=1)
# 'Przewidywani gospodarze' ("predicted hosts"): share of each class among all top predictions, in %.
df_dist['Przewidywani gospodarze'] = (df_dist['all_predicted'] / df_dist['all_predicted'].sum(axis=0)) * 100
# 'correct': per row, count of cells equal to 1 outside the four helper columns listed here.
df_dist['correct'] = df_dist.loc[:, ~df_dist.columns.isin(["all", 'all_predicted', 'Poprawni gospodarze', 'Przewidywani gospodarze'])].apply(lambda row: sum(row == 1), axis=1)
# "Correctly predicted hosts": correct hits per class as % of the total of 'all'.
df_dist['Poprawnie przewidywani gospodarze'] = (df_dist['correct'] / df_dist['all'].sum(axis=0)) * 100
print(df_dist['Poprawnie przewidywani gospodarze'].sum())
# "Correctly predicted hosts within correct classes": correct hits as % of that row's own 'all'.
df_dist['Poprawnie przewidywani gospodarze w poprawnych klasach'] = (df_dist['correct'] / df_dist['all']) * 100
# Ratios between predicted and true shares (plain, log2, and inverse).
df_dist['ratio_all_allpred'] = df_dist['Przewidywani gospodarze'] / df_dist['Poprawni gospodarze']
df_dist['ratio_all_allpred_log'] = np.log2(df_dist['Przewidywani gospodarze'] / df_dist['Poprawni gospodarze'])
df_dist['ratio_allpred_all'] = df_dist['Poprawni gospodarze'] / df_dist['Przewidywani gospodarze']
def revise_values(row):
    """Pick a fallback value for the predicted/true ratio of one row.

    Args:
        row: Mapping-like row with the keys ``'Przewidywani gospodarze'``,
            ``'Poprawni gospodarze'`` and ``'ratio_all_allpred'``.

    Returns:
        ``-1`` when the predicted share is missing; the predicted share itself
        when only the true share is missing; otherwise the stored ratio.
    """
    predicted_share = row['Przewidywani gospodarze']
    if pd.isna(predicted_share):
        return -1
    if pd.isna(row['Poprawni gospodarze']):
        return predicted_share
    return row['ratio_all_allpred']
# revise_values is currently not applied; the raw ratio column is printed instead.
# df_dist['ratio_all_allpred'] = df_dist.apply(revise_values, axis=1)
# Text dump of the top-N hit fractions and of the predicted/true share ratio per class.
print(df_top_5['correct_percent'])
print(df_dist['ratio_all_allpred'])
# print(top_5)
# print(df)
# df.set_index('index', inplace=True)

820
{'Xanthomonadaceae': {'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacteria', 'order': 'Xanthomonadales', 'family': 'Xanthomonadaceae', 'genus': 'Xanthomonas', 'species': 'Xanthomonas euvesicatoria'}, 'Acholeplasmataceae': {'superkingdom': 'Bacteria', 'phylum': 'Tenericutes', 'class': 'Mollicutes', 'order': 'Acholeplasmatales', 'family': 'Acholeplasmataceae', 'genus': 'Candidatus Phytoplasma', 'species': 'Candidatus Phytoplasma australiense'}, 'Enterobacteriaceae': {'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacteria', 'order': 'Enterobacteriales', 'family': 'Enterobacteriaceae', 'genus': 'Candidatus Blochmannia', 'species': 'Candidatus Blochmannia pennsylvanicus'}, 'Erythrobacteraceae': {'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Alphaproteobacteria', 'order': 'Sphingomonadales', 'family': 'Erythrobacteraceae', 'genus': 'Erythrobacter', 'species': 'Erythrobacter litoralis'}, 'Listeriaceae': {'

In [30]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# NOTE(review): `crashes` and `tips` are not referenced again in the visible notebook;
# both load library example datasets (may need network access) - confirm they can be removed.
crashes = sns.load_dataset("car_crashes").sort_values("total", ascending=False)
tips = px.data.tips()
# Move the index of df_dist into a regular 'class' column so it can serve as melt id / colour.
# NOTE(review): df_dist is reassigned, so re-running only this cell resets the index a second time.
df_dist = df_dist.reset_index().rename(columns={'index': 'class'})
# Long format: one row per (class, distribution type) holding the percentage value.
df_melted = df_dist.melt(id_vars='class', value_vars=['Poprawni gospodarze', 'Przewidywani gospodarze', 'Poprawnie przewidywani gospodarze'], var_name='dist_type', value_name='percentage')
df_melted_class = df_dist.melt(id_vars='class', value_vars=['Poprawnie przewidywani gospodarze w poprawnych klasach'], var_name='dist_type', value_name='percentage')
# Figure 1: horizontal bars per distribution type, coloured by class; axis labels are in Polish.
fig = px.bar(df_melted, x="percentage", y="dist_type", color='class', orientation='h', text='percentage', text_auto='.3f',
             height=700,
             width=1400,
            #  title='Dystrybucja klas gospodarza w przewidywaniach',
             template='seaborn',
             labels={"percentage":"% przewidywań", "dist_type":"Typ dystrybucji", "class":"Klasy"}
             )
fig.update_traces(textposition='inside')
fig.update_layout(
    font_family="Roboto",
    font_size=14
)
# Static export; the target directory is assumed to exist already.
fig.write_image('../phdna_proper_opt/distribution/distribution.png', scale=2.5)
fig.show()
# Figure 2: same layout for the within-class accuracy column only.
fig = px.bar(df_melted_class, x="percentage", y="dist_type", color='class', orientation='h', text='percentage', text_auto='.3f',
             height=700,
             width=1400,
            #  title='Dystrybucja klas gospodarza w przewidywaniach',
             template='seaborn',
             labels={"percentage":"% przewidywań", "dist_type":"Typ dystrybucji", "class":"Klasy"}
             )
fig.update_traces(textposition='inside')
fig.update_layout(
    font_family="Roboto",
    font_size=14
)
fig.write_image('../phdna_proper_opt/distribution/distribution_inclass.png', scale=2.5)
fig.show()
# Replace missing counts with 0 before they are cast to int for the axis labels.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on a
# column selection is a chained in-place update, which pandas warns about and
# which does not modify df_dist when Copy-on-Write is enabled.
df_dist["all"] = df_dist["all"].fillna(0)
df_dist["all_predicted"] = df_dist["all_predicted"].fillna(0)
# Axis label of the form "<class> (<true count> / <predicted count>)".
df_dist["class_num"] = df_dist.apply(lambda row: f"{row['class']} ({str(int(row['all']))} / {str(int(row['all_predicted']))})", axis=1)
# Long-format frames for the plain and the log2 fold-change bar charts.
df_melted_ratio = df_dist.melt(id_vars='class_num', value_vars=['ratio_all_allpred'], var_name='dist_type', value_name='fold_change')
df_melted_ratio_log = df_dist.melt(id_vars='class', value_vars=['ratio_all_allpred_log'], var_name='dist_type', value_name='fold_change')
def color_fold(row):
    """Choose a bar colour for a plain fold-change value (neutral point 1).

    Args:
        row: Mapping-like row with a ``'fold_change'`` entry.

    Returns:
        ``'#C44F51'`` when the value is below 1, ``'#8C8C8C'`` when it equals
        1, and ``'#55A868'`` in every other case (including NaN).
    """
    change = row['fold_change']
    if change < 1:
        return '#C44F51'
    return '#8C8C8C' if change == 1 else '#55A868'
def color_log_fold(row):
    """Choose a bar colour for a log fold-change value (neutral point 0).

    Args:
        row: Mapping-like row with a ``'fold_change'`` entry.

    Returns:
        ``'#C44F51'`` when the value is below 0, ``'#8C8C8C'`` when it equals
        0, and ``'#55A868'`` in every other case (including NaN).
    """
    change = row['fold_change']
    if change < 0:
        return '#C44F51'
    return '#8C8C8C' if change == 0 else '#55A868'
# Per-row bar colours: red below the neutral point, grey at it, green otherwise.
# df_melted_ratio['color'] = df_melted_ratio.apply(lambda x: '#C44F51' if x['fold_change'] < 1 else '#55A868', axis=1) 
df_melted_ratio['color'] = df_melted_ratio.apply(color_fold, axis=1) 
df_melted_ratio_log['color'] = df_melted_ratio_log.apply(color_log_fold, axis=1) 
# Constant column; only referenced by the commented-out reference-line trace further down.
df_melted_ratio['base'] = 1
# Figure: plain fold change per class, labelled "<class> (<true> / <predicted>)".
fig = px.bar(df_melted_ratio, y="class_num", x="fold_change", orientation='h', text='fold_change', text_auto='.3f',
             height=600,
             width=1000,
            #  title='Proporcja dystrybucji klas gospodarza w przewidywaniach (przewidziane przez program / prawdziwe przewidywania)',
             template='seaborn',
             labels={"fold_change":"Krotność zmiany", "dist_type":"Typ dystrybucji", "class_num":""}
             )
# Apply the precomputed per-row colours to the bars.
fig.update_traces(marker_color=df_melted_ratio['color'])
fig.update_traces(textposition='outside')
fig.update_layout(
    font_family="Roboto",
    font_size=14
)
# Exported both as PNG and as SVG.
fig.write_image('../phdna_proper_opt/distribution/dist_foldchange.png', scale=2.5)
fig.write_image('../phdna_proper_opt/distribution/dist_foldchange_svgver.svg', scale=2.5)
fig.show()
# Figure: log2 fold change per class.
fig = px.bar(df_melted_ratio_log, y="class", x="fold_change", orientation='h', text='fold_change', text_auto='.3f',
             height=600,
             width=1200,
            #  title='Proporcja dystrybucji klas gospodarza w przewidywaniach (przewidziane przez program / prawdziwe przewidywania)',
             template='seaborn',
             labels={"fold_change":"Krotność zmiany (log2)", "dist_type":"Typ dystrybucji", "class":""}
             )
fig.update_traces(marker_color=df_melted_ratio_log['color'])
fig.update_traces(textposition='outside')
fig.update_layout(
    font_family="Roboto",
    font_size=14
)
fig.write_image('../phdna_proper_opt/distribution/dist_foldchange_log.png', scale=2.5)
# fig.add_trace(
#     go.Scatter(
#         y=df_melted_ratio['base'],
#         x=df_melted_ratio['class'],
#         mode='lines'
#     )
# )
fig.show()


In [52]:
# Dump the whole host metadata mapping for inspection (produces a very long output).
pprint.pprint(meta_host)

{'NC_000117': {'lineage_names': ['Bacteria',
                                 'Chlamydiae',
                                 'Chlamydiia',
                                 'Chlamydiales',
                                 'Chlamydiaceae',
                                 'Chlamydia',
                                 'Chlamydia trachomatis'],
               'lineage_ranks': ['superkingdom',
                                 'phylum',
                                 'class',
                                 'order',
                                 'family',
                                 'genus',
                                 'species'],
               'organism_name': 'Chlamydia trachomatis D/UW-3/CX',
               'taxid': '272561',
               'version': 'NC_000117.1'},
 'NC_000853': {'lineage_names': ['Bacteria',
                                 'Thermotogae',
                                 'Thermotogae',
                                 'Thermotogales',
                 

In [4]:
from collections import defaultdict
import numpy as np
import json


# Read the PHIST predictions and keep at most the first 10 rows per phage
# (rows are taken in file order; no re-sorting happens here).
df_phist = pd.read_csv('../phdna_proper_opt/phist/predictions.csv')
# .copy() gives an independent frame, so the column assignments below do not
# operate on a frame derived from another one.
df_phist = df_phist.groupby('phage').head(10).copy()
# Strip everything from the first '.' onwards from both id columns
# (e.g. a trailing version suffix).
for col in ['phage', 'host']:
    df_phist[col] = df_phist[col].str.split('.').str[0]

# phage id -> list of single-element [host id] lists, in row order.
phist_dict = defaultdict(list)
for k, v in zip(df_phist['phage'],  df_phist['host']):
    # pd.isna recognises every missing-value marker; the previous identity test
    # (`v is np.nan`) only matched the one np.nan singleton object.
    if pd.isna(v):
        # Phage without a host: store an explicit empty list.
        phist_dict[k] = []
        continue
    phist_dict[k].append([v])
with open("../phdna_proper_opt/comparison_src/phist_top10.json", 'w') as f:
    json.dump(dict(phist_dict), f)
pprint.pprint(phist_dict)

defaultdict(<class 'list'>,
            {'NC_000866': [['NC_010658'],
                           ['NC_007613'],
                           ['NC_008258'],
                           ['NC_004337'],
                           ['NC_004741'],
                           ['NC_017328'],
                           ['NC_007384'],
                           ['NC_012892'],
                           ['NC_012971'],
                           ['NC_012947']],
             'NC_000871': [['NC_017595']],
             'NC_000872': [['NC_017563']],
             'NC_000896': [['NC_008530']],
             'NC_000902': [['NC_002695']],
             'NC_000924': [['NC_002655']],
             'NC_000929': [['NC_016902']],
             'NC_001271': [['NC_012892'], ['NC_012971'], ['NC_012947']],
             'NC_001330': [],
             'NC_001331': [['NC_023019']],
             'NC_001332': [],
             'NC_001341': [['NC_008787'],
                           ['NC_017280'],
                           ['NC_0

In [5]:
import os


# Reload the sanitized virus / host metadata (same hard-coded local-drive paths as
# in the earlier cell); parse_json below reads the module-level `meta_host`.
meta_vir = create_virus_metadata(Path('X:/edwards2016/virus/'))
meta_host = create_host_metadata(Path('X:/edwards2016/host/'))


def parse_json(path, tax, host_metadata=None, top_n=10):
    """Translate a tool's ranked host ids into taxon names at one rank.

    The JSON file must hold an object mapping a virus id to a list of hits,
    where each hit is a sequence whose first element is a host id.  Only the
    first ``top_n`` hits per virus are kept, and each host id is replaced by
    the entry of its ``'lineage_names'`` list that corresponds to ``tax``
    (species = last entry, genus = second to last, family = third to last).

    Args:
        path: Path of the JSON file to read (opened as UTF-8).
        tax: One of ``'species'``, ``'genus'`` or ``'family'``.
        host_metadata: Mapping ``host id -> {'lineage_names': [...]}``.
            Defaults to the module-level ``meta_host``.
        top_n: Maximum number of hits kept per virus (default 10).

    Returns:
        dict mapping virus id -> list of taxon names (at most ``top_n``).

    Raises:
        KeyError: If ``tax`` is not a supported rank, or a host id is missing
            from the host metadata.
    """
    tax_lookup = {
        'species': -1,
        'genus': -2,
        'family': -3
    }
    rank_position = tax_lookup[tax]
    hosts = meta_host if host_metadata is None else host_metadata
    # Context manager so the file handle is closed deterministically.
    with open(path, encoding="utf8") as handle:
        data = json.load(handle)
    return {
        virus_id: [hosts[hit[0]]['lineage_names'][rank_position] for hit in hits[:top_n]]
        for virus_id, hits in data.items()
    }


# def check_preds(col, master_virus_dict, tax_level, map_dict=None):
#     # sorted_col = col.sort_values(ascending=False)[:5]
#     # print(sorted_col.name)
#     # print(sorted_col.index)
#     lineage = get_lineage(master_virus_dict[sorted_col.name]['host'], 'lineage_names')
#     print(lineage)
#     if map_dict is None:
#         sub_df = {f'top{i + 1}': 1 if lineage[tax_level] in sorted_col.index[:i + 1] else 0 for i, name in enumerate(sorted_col.index)}
#         print(sub_df)
#         return sub_df
    
#     sorted_col.index = [map_dict[x][tax_level] for x in sorted_col.index]
#     sub_df = {f'top{i + 1}': 1 if lineage[tax_level] in sorted_col.index[:i + 1] else 0 for i, name in enumerate(sorted_col.index)}
#     print(sub_df)
#     return sub_df

# files = [f for f in Path('../phdna_proper_opt/comparison_src/').iterdir()]
# print(files)
# # master = {}
# # for f in files:
# #     print(f.stem)
# #     master[f.stem] = parse_json(f.as_posix())
# master = {f.stem: parse_json(f.as_posix()) for f in files}
# print(master.keys())
# piler_df = pd.read_json('../phdna_proper_opt/comparison/pilercr-2mismatches.json')

In [9]:
import plotly.express as px


def make_preds_df(tool, tax_level, translate):
    """Build the cumulative top-1..top-10 hit table for one comparison tool.

    Reads the module-level ``master`` (tool -> virus -> ranked taxon names),
    ``meta_vir`` and ``df``.  For each virus, position N holds 1 when the true
    host's name at ``tax_level`` occurs among the first N predictions.  Lists
    shorter than 10 are padded with their last flag (or with zeros when the
    virus has no predictions at all).

    Args:
        tool: Key into ``master``.
        tax_level: Key looked up in the true lineage and in ``translate``.
        translate: Mapping from rank name to the label stored in ``'tax'``.

    Returns:
        DataFrame indexed ``top1``..``top10`` with one column per virus plus
        ``'correct'`` (row sum), ``'correct_percent'`` (row sum divided by the
        number of columns of the global ``df``, times 100), ``'tool'`` and
        ``'tax'``.
    """
    flags_by_virus = {}
    for virus_id, ranked_hosts in master[tool].items():
        true_lineage = get_lineage(meta_vir[virus_id]['host'], 'lineage_names')
        hits = []
        for depth in range(1, len(ranked_hosts) + 1):
            found = true_lineage[tax_level] in ranked_hosts[:depth]
            hits.append(1 if found else 0)
        missing = 10 - len(hits)
        if missing > 0:
            # Carry the last cumulative flag forward; no predictions means all zeros.
            filler = hits[-1] if hits else 0
            hits.extend([filler] * missing)
        flags_by_virus[virus_id] = hits

    row_labels = [f'top{rank + 1}' for rank in range(10)]
    out_df = pd.DataFrame(flags_by_virus, index=row_labels)
    out_df['correct'] = out_df.sum(axis=1)
    out_df['correct_percent'] = out_df['correct'] / len(df.columns) * 100
    out_df['tool'] = tool
    out_df['tax'] = translate[tax_level]
    return out_df

# Polish display labels for the taxonomic ranks (used as facet titles later).
translate_labels = {
    'species': 'gatunek',
    'genus': 'rodzaj',
    'family': 'rodzina'
}
# phastDNA prediction tables, one per taxonomic rank.
phastdna_files = {
    'species': '../phdna_proper_opt/species_noise_04-07-2023/predictions.csv',
    'genus': '../phdna_proper_opt/genus_noise_05-07-2023.best_classifier/predictions.csv',
    'family': '../phdna_proper_opt/fam_noise_05-07-2023.best_classifier/predictions.csv'
}
# pprint.pprint(master['wish'])
# pprint.pprint(master['pilercr-2mismatches'])
# df_p = pd.DataFrame.from_dict(master['pilercr-2mismatches'])
# pprint.pprint(master['pilercr-2mismatches'])
# df = pd.read_csv('../phdna_proper_opt/species_noise_04-07-2023/predictions.csv', index_col=0)
# df = pd.read_csv('../phdna_proper_opt/genus_noise_05-07-2023.best_classifier/predictions.csv', index_col=0)
# df = pd.read_csv('../phdna_proper_opt/fam_noise_05-07-2023.best_classifier/predictions.csv', index_col=0)
# NOTE(review): this relies on a `df` left over from an earlier cell, and `df` is
# re-read inside the loop below without fillna - confirm this line is still needed.
df.fillna(0, inplace=True)
# Every file in comparison_src is treated as one tool's JSON predictions (tool name = file stem).
files = [f for f in Path('../phdna_proper_opt/comparison_src/').iterdir()]
df_tools_concat = []
for tax in ['species', 'genus', 'family']:
    # `master` and `df` are module-level names that make_preds_df reads, so they
    # must be rebound before it is called for this rank.
    master = {f.stem: parse_json(f.as_posix(), tax) for f in files}
    df = pd.read_csv(phastdna_files[tax], index_col=0)
    # phastDNA's own top-N table, computed directly on the raw index labels (no map_dict).
    top_5 = df.apply(test, args=(meta_vir, tax, None))
    df_top_5 = pd.DataFrame(top_5.to_list(), index=top_5.index).T
    df_top_5['correct'] = df_top_5.sum(axis=1)
    df_top_5['correct_percent'] = df_top_5['correct'] / len(df.columns) * 100
    df_top_5['tool'] = 'phastDNA'
    df_top_5['tax'] = translate_labels[tax]

    # One table per external tool, then phastDNA appended, stacked into one long frame.
    dfs_tools = [make_preds_df(tool, tax, translate_labels) for tool in master.keys()]
    dfs_tools.append(df_top_5)
    df_tools = pd.concat(dfs_tools)
    # Moves the top1..topN labels into an 'index' column (used as the x axis when plotting).
    df_tools.reset_index(inplace=True)
    df_tools = df_tools.sort_values(['tool', 'correct_percent'], ascending=True)
    df_tools_concat.append(df_tools)
# All ranks and tools in one frame; the 'tax' column distinguishes the ranks.
df_comparison_absolute = pd.concat(df_tools_concat)
    # fig = px.line(df_tools, 
    #             x='index', 
    #             y='correct_percent', 
    #             color='tool', 
    #             markers=True, 
    #             template='seaborn',
    #             title="Najlepsze poprawne przewidywania dla gatunku",
    #             height=500,
    #             width=700,
    #             labels={
    #                     "index": "N najlepszych poprawnych przewidywań",
    #                     "correct_percent": "% poprawnych przewidywań",
    #                     "tool": "Narzędzie"
    #                 })
    # fig.update_layout(
    #     font_family="Roboto"
    # )
    # fig.show()
    # fig.write_image('../phdna_proper_opt/comparison_out/comparison_topN_species.png', scale=2)
# pprint.pprint(piler_preds)
# df_piler = pd.DataFrame(piler_preds, index=[f'top_{x + 1}' for x in range(5)])
# piler_preds = {k: [0 if get_lineage(meta_vir[k]['host'], 'lineage_names')['species'] in val for val in v]  for k, v in master['pilercr-2mismatches']}


NC_019416
Index(['Corynebacterium resistens', 'Corynebacterium aurimucosum',
       'Bifidobacterium breve', 'Deinococcus proteolyticus',
       'Rothia mucilaginosa', 'Propionibacterium acnes',
       'Bifidobacterium longum', 'Coraliomargarita akajimensis',
       'Mobiluncus curtisii', 'Arthrobacter arilaitensis'],
      dtype='object')
{'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacteria', 'order': 'Xanthomonadales', 'family': 'Xanthomonadaceae', 'genus': 'Stenotrophomonas', 'species': 'Stenotrophomonas maltophilia'}
{'top1': 0, 'top2': 0, 'top3': 0, 'top4': 0, 'top5': 0, 'top6': 0, 'top7': 0, 'top8': 0, 'top9': 0, 'top10': 0}
NC_011165
Index(['Candidatus Solibacter usitatus', 'Methylococcus capsulatus',
       'Candidatus Koribacter versatilis', 'Candidatus Nitrospira defluvii',
       'Dechloromonas aromatica', 'Nitrobacter hamburgensis',
       'Hyphomonas neptunium', 'Nitrobacter winogradskyi',
       'Azotobacter vinelandii', 'Desulfarculus ba

  df_tools.reset_index(inplace=True)


NC_019416
Index(['Coraliomargarita', 'Bifidobacterium', 'Pseudomonas', 'Arthrobacter',
       'Corynebacterium', 'Propionibacterium', 'Marinobacter', 'Rothia',
       'Prevotella', 'Desulfovibrio'],
      dtype='object')
{'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacteria', 'order': 'Xanthomonadales', 'family': 'Xanthomonadaceae', 'genus': 'Stenotrophomonas', 'species': 'Stenotrophomonas maltophilia'}
{'top1': 0, 'top2': 0, 'top3': 0, 'top4': 0, 'top5': 0, 'top6': 0, 'top7': 0, 'top8': 0, 'top9': 0, 'top10': 0}
NC_011165
Index(['Candidatus Solibacter', 'Mesorhizobium', 'Nitrospira', 'Pseudomonas',
       'Agrobacterium', 'Dechloromonas', 'Nitrobacter',
       'Candidatus Koribacter', 'Azotobacter', 'Methylococcus'],
      dtype='object')
{'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacteria', 'order': 'Pseudomonadales', 'family': 'Pseudomonadaceae', 'genus': 'Pseudomonas', 'species': 'Pseudomonas aeruginosa'}
{'top1': 

  df_tools.reset_index(inplace=True)


NC_019416
Index(['Micrococcaceae', 'Puniceicoccaceae', 'Pseudomonadaceae',
       'Actinomycetaceae', 'UNCLASSIFIED_Chroococcales', 'Corynebacteriaceae',
       'Ferrimonadaceae', 'Alteromonadaceae', 'UNCLASSIFIED_Oscillatoriales',
       'Alcanivoracaceae'],
      dtype='object')
{'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacteria', 'order': 'Xanthomonadales', 'family': 'Xanthomonadaceae', 'genus': 'Stenotrophomonas', 'species': 'Stenotrophomonas maltophilia'}
{'top1': 0, 'top2': 0, 'top3': 0, 'top4': 0, 'top5': 0, 'top6': 0, 'top7': 0, 'top8': 0, 'top9': 0, 'top10': 0}
NC_011165
Index(['Pseudomonadaceae', 'Phyllobacteriaceae', 'Rhizobiaceae',
       'Solibacteraceae', 'Bradyrhizobiaceae', 'Methylococcaceae',
       'Desulfovibrionaceae', 'Rhodocyclaceae',
       'UNCLASSIFIED_UNCLASSIFIED_UNCLASSIFIED_Acidobacteria',
       'Rhodospirillaceae'],
      dtype='object')
{'superkingdom': 'Bacteria', 'phylum': 'Proteobacteria', 'class': 'Gammaproteobacte

  df_tools.reset_index(inplace=True)


In [10]:
import plotly.graph_objects as go
import plotly.subplots as sp
 
# fig = go.Figure()
# NOTE(review): this subplot figure is replaced by the px.line figure assigned to
# `fig` below, so it has no effect on the output - confirm it can be dropped.
fig = sp.make_subplots(rows=3, cols=1)
# fig = go.Figure()

# Get the unique tools from the dataframe
# NOTE(review): `tools` is only referenced by commented-out code in this cell.
tools = df_comparison_absolute['tool'].unique()

# One line per tool, one facet row per taxonomic rank ('tax' column).
fig = px.line(df_comparison_absolute, x='index', y='correct_percent', 
                 color='tool', facet_row='tax', 
                #  labels={'% poprawnych przewidywań'}, 
                #  title="Najlepsze poprawne przewidywania",
                 markers=True, template='seaborn')
# for index, tax in enumerate(df_comparison_absolute['tax'].unique(), start=1):
#     fastDNA_data = df_comparison_absolute[(df_comparison_absolute['tool'] == 'fastDNA') & (df_comparison_absolute['tax'] == tax)]

#     fig.add_trace(
#     go.Line(
#         x=fastDNA_data['index'],
#         y=fastDNA_data['correct_percent'],
#         mode='lines+markers',
#         name='fastDNA',
#         marker = dict(symbol="diamond", size=5, line=dict(width=2, color="DarkSlateGrey")),
#     ), row=index, col=1)
# Loop through all the tools
# for index, tax in enumerate(df_comparison_absolute['tax'].unique(), start=1):
#     for tool in df_comparison_absolute[df_comparison_absolute['tax'] == tax]['tool'].unique():
#         # fig_inner = go.Figure()
#         filtered_df = df_comparison_absolute[(df_comparison_absolute['tax'] == tax) & (df_comparison_absolute['tool'] == tool)]
        
#         if tool == "phastDNA":  # replace with the actual tool name for which you want to change the markers
#             marker = dict(symbol="diamond", size=5, line=dict(width=2, color="DarkSlateGrey"))
#         else:
#             marker = dict()
        
#         fig.add_trace(
#             go.Scatter(
#                 x=filtered_df['index'], 
#                 y=filtered_df['correct_percent'],
#                 mode = 'lines+markers',
#                 name = tool,
#                 marker = marker
#             ),
#         )
#     # fig.add_trace()
#     # fig.append_trace(fig_inner, row=index, col=1)
#     # if index == 1:
#     #     fig.show()
# for subplot in range(1, 4):  # there are 3 subplots
#     fig.update_yaxes(title_text="% poprawnych przewidywań", row=subplot, col=1)
# # Update layout
# Give the phastDNA traces a distinct diamond marker; other traces are left unchanged.
fig.for_each_trace(
    lambda trace: trace.update(marker=dict(symbol="diamond", size=5, line=dict(width=2, color="DarkSlateGrey"))) 
                    if trace.name == 'phastDNA' else None
)
fig.update_layout(
    template='seaborn', 
    # title="Najlepsze poprawne przewidywania", 
    height=1100, 
    width=750, 
    font_family="Roboto",
    font_size=14,
    xaxis_title="N najlepszych poprawnych przewidywań",
    yaxis_title="% poprawnych przewidywań",
)
# Apply the same y-axis title to every facet row.
fig.update_yaxes(title_text="% poprawnych przewidywań")
# fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# Facet annotations look like "tax=<label>"; keep only the part after the last '='.
for a in fig.layout.annotations:
    a.text = a.text.split("=")[-1]
fig.update_layout(legend_title_text='')

fig.show()
fig.write_image('../phdna_proper_opt/comparison_out/comparison_all_top10.png', scale=2.5)
# df_tips = px.data.tips()

In [25]:
import matplotlib.pyplot as plt
import matplotlib
import itertools
import seaborn as sns
import matplotlib.backends.backend_pdf as bpdf
import patchworklib as pw
import math
from matplotlib.font_manager import fontManager, FontProperties
from matplotlib import rcParams


# Load the first 41 data rows of the "species_noise_2" sheet of the optimisation summary.
df_summary_species = pd.read_excel('../phdna_proper_opt/summary/PhastDNA2023_Summary.xlsx', sheet_name="species_noise_2", nrows=41)
df_summary_species = df_summary_species.rename(columns={"Unnamed: 0": "iteration"})
df_summary_species.drop(columns=["-log10 lr"], inplace=True)
# Hyper-parameter name -> (lower, upper) bounds; the bounds drive the axis limits below.
# The two commented-out entries are not plotted.
hypers = {
    #'epochs': (75, 500), 200
    #'vector_size': (80, 300), 180
    'frag_len': (200, 2000),
    'lr': (0.005, 0.75),
    'lr_update': (50, 500),
    'maxn': (6, 11),
    'minn': (4, 9),
    'noise': (100, 20000)
}
# Keep only the plotted hyper-parameters plus the score column (kept last on purpose:
# later code uses columns[:-1] as the list of hyper-parameters).
df_summary_species = df_summary_species.loc[:, [*hypers.keys(), 'accordance']]
# NOTE(review): "taksnomiczna" looks like a typo of "taksonomiczna"; the string is used as a
# column key and legend title further down, so it must be changed consistently if fixed.
df_summary_species.rename(columns={"accordance": "Zgodność taksnomiczna"}, inplace=True)

# Continuous colour map used for the score hue in the scatter plots.
cp = sns.color_palette("crest", as_cmap=True)
plt.rc('font', size=12)
plt.rc('axes', titlesize=14)
plt.rc('axes', labelsize=12)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# Register the Roboto font from a user-specific Windows font directory (hard-coded path).
font_dir = ['C:/Users/Maciej/AppData/Local/Microsoft/Windows/Fonts/']
fontManager.addfont(font_dir[0] + "Roboto-Regular.ttf")
prop = FontProperties(fname=font_dir[0] + "Roboto-Regular.ttf")
sns.set(font=prop.get_name())
# for font in font_manager.findSystemFonts(font_dir):
#     font_manager.fontManager.addfont(font)
#     print(font)

# Set font family globally
# rcParams['font.family'] = 'Roboto'
# font_manager.fontManager.addfont("C:/Users/Maciej/AppData/Local/Microsoft/Windows/Fonts/Roboto-Regular.ttf")
# prop = font_manager.FontProperties(fname="C:/Users/Maciej/AppData/Local/Microsoft/Windows/Fonts/Roboto-Regular.ttf")
# plt.rcParams['font.family'] = 'sans-serif'
# plt.rcParams['font.sans-serif'] = prop.get_name()
# The font family is set again after sns.set(rc=...) is called.
sns.set(rc={'figure.figsize':(6,3)})
plt.rcParams["font.family"] = "Roboto"
print(df_summary_species.columns[:-1])
# Every unordered pair of hyper-parameter columns (the last column, the score, is excluded).
combs = list(itertools.combinations(df_summary_species.columns[:-1], 2))
# fig, axs = plt.subplots(len(combs), figsize=(len(combs), len(combs)))
# Write one PDF page per hyper-parameter pair: a 2-D KDE of the sampled points with a
# scatter overlay coloured by the score column.
plots = []
with bpdf.PdfPages('../phdna_proper_opt/summary/species_noise_2-opt_test.pdf') as pdf:
    for i, item in enumerate(combs):
        # NOTE(review): prints the full list of pairs on every iteration - looks like leftover debugging.
        print(combs)
        # print(i, x, y)
        x = item[0]
        y = item[1]
        # ax = axs[i // 2, i % 2]
        # g = sns.JointGrid(data=df, x=x, y=y, space=0,
        #                 xlim=hypers[x],
        #                 ylim=hypers[y])
        # x_best = list(df.loc[df['target'] >= 70, x].iloc)
        # y_best = list(df.loc[df['target'] >= 70, y].iloc)
        # best_value = list(df.loc[df['target'] >= 70, 'target'])
        # # print(best_value)
        # g.ax_joint.scatter(x=x_best, y=y_best, c='red', alpha=0.6)
        # # g.plot_joint(sns.scatterplot, x=x_best, y=y_best, color='red', alpha=0.6, marker='+')
        # for (x_val, y_val, best) in zip(x_best, y_best, best_value):
        #     g.ax_joint.annotate(f'{best}', (x_val, y_val),
        #                         xytext=(5, 10), textcoords='offset points',
        #                         fontsize=10, color='green', ha='left', va='top')
        # sns.kdeplot(data=df, x=x, y=y, fill=True, cmap="rocket", ax=g.ax_joint, alpha=0.5, levels=45, thresh=0.05)
        # g.plot_marginals(sns.histplot, color="#03051A", alpha=1)
        # g.set_axis_labels(xlabel=x, ylabel=y, fontsize=12)
        # cp = sns.color_palette("Spectral", as_cmap=True)
        # sns.scatterplot(data=df, x=x, y=y, hue='target', alpha=0.6, palette=cp)
        
        # df_summary_species['above_66'] = df_summary_species['accordance'] >= 0.66
        # markers = {False: 'o', True: 'v'}
        # sns.scatterplot(data=df_summary_species, x=x, y=y, hue='accordance', alpha=1, style="above_66", markers=markers, palette=cp)
        # above_70 = df_summary_species[df_summary_species['accordance'] > 0.66]

        # for j, point in above_70.iterrows():
        #     ax.text(point[x]+0.1, point[y]+0.1, str(point['target']))
        # ax = pw.Brick(figsize=(6,3))
        # Density background; kdeplot returns the Axes that everything else is drawn on.
        ax = sns.kdeplot(data=df_summary_species, x=x, y=y, fill=True, cmap="rocket", alpha=0.5, levels=45, thresh=0.05, warn_singular=False)
        # print(all([float(e) for e in hypers[x]]))
        # print([float(e) for e in hypers[x]])
        # Axis limits = configured bounds plus padding. When the bounds are not all floats the
        # padding is ceil(bound / 10) on each side; when both are floats the upper bound gets
        # +bound/10 while the lower bound gets -bound*10.
        # NOTE(review): the `* 10` on the lower float bound is asymmetric with the `/ 10`
        # used everywhere else - confirm whether `/ 10` was intended.
        if not all([isinstance(e, float) for e in hypers[x]]):
            ax.set_xlim((hypers[x][0] - math.ceil(hypers[x][0] / 10), hypers[x][1] + math.ceil(hypers[x][1] / 10)))
        else:
            ax.set_xlim((hypers[x][0] - (hypers[x][0] * 10), hypers[x][1] + (hypers[x][1] / 10)))
        if not all([isinstance(e, float) for e in hypers[y]]):
            ax.set_ylim((hypers[y][0] - math.ceil(hypers[y][0] / 10), hypers[y][1] + math.ceil(hypers[y][1] / 10)))
        else:
            ax.set_ylim((hypers[y][0] - (hypers[y][0] * 10), hypers[y][1] + (hypers[y][1] / 10)))
        # ax.set_ylim(hypers[y])
        ax.set_xlabel(x)
        ax.set_ylabel(y)


        # Boolean helper column (Polish: "above 66") selecting the marker style; it is
        # re-assigned on every iteration.
        df_summary_species['Powyżej 66'] = df_summary_species['Zgodność taksnomiczna'] >= 0.66
        markers = {False: 'o', True: 'v'}
        sns.scatterplot(data=df_summary_species, x=x, y=y, hue="Zgodność taksnomiczna", alpha=1, style='Powyżej 66', markers=markers, palette=cp, ax=ax)
        # NOTE(review): `above_70` is not used by any active code in this cell, and it
        # filters with `> 0.66` while the helper column above uses `>= 0.66`.
        above_70 = df_summary_species[df_summary_species["Zgodność taksnomiczna"] > 0.66]

        # Only the first page keeps a legend; its last three text entries are overwritten
        # with Polish labels (above / below 0.66) and an empty separator.
        if i == 0:
            leg = ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
            leg.get_texts()[-1].set_text("Powyżej 0.66 zgodności")
            leg.get_texts()[-2].set_text("Poniżej 0.66 zgodności")
            leg.get_texts()[-3].set_text("")
            print([t for t in leg.get_texts()])
        else:
            # Pages without a legend use a narrower figure size; the setting is applied via
            # sns.set, and the font family is re-applied right after it.
            sns.set(rc={'figure.figsize':(4,3)})
            plt.rcParams["font.family"] = "Roboto"
            ax.get_legend().remove()
        ax.set_title(f"{i}: {x} - {y}")
        plots.append(ax)
        # axs[i].set_xlim(hypers[x])
        # axs[i].set_ylim(hypers[y])
        # axs[i].set_xlabel(x)
        # axs[i].set_ylabel(y)
        # plt.show()
        # axs[i].imshow(g.ax_joint.collections[0].get_array().reshape(45, 45).T, origin='lower', extent=g.ax_joint.get_xlim() + g.ax_joint.get_ylim(), cmap="rocket", alpha=0.5)
        # axs[i].hist2d(x_best, y_best, bins=20, alpha=0.6, cmap="rocket")
        # axs[i].set_xlim(hypers[x])
        # axs[i].set_ylim(hypers[y])
        # axs[i].set_xlabel(x)
        # axs[i].set_ylabel(y)
    # plt.tight_layout()
        # plots.append(pw.Brick(figsize=(1,2)))
        # Save the current figure as one PDF page, then close it so the next pair starts clean.
        plt.tight_layout()
        pdf.savefig()
        plt.close()
        # plt.show()
# patch = plots[0] | plots[1] | plots[2] | plots[3] 
# patch.savefig()

Index(['frag_len', 'lr', 'lr_update', 'maxn', 'minn', 'noise'], dtype='object')
[('frag_len', 'lr'), ('frag_len', 'lr_update'), ('frag_len', 'maxn'), ('frag_len', 'minn'), ('frag_len', 'noise'), ('lr', 'lr_update'), ('lr', 'maxn'), ('lr', 'minn'), ('lr', 'noise'), ('lr_update', 'maxn'), ('lr_update', 'minn'), ('lr_update', 'noise'), ('maxn', 'minn'), ('maxn', 'noise'), ('minn', 'noise')]
[Text(0, 0, 'Zgodność taksnomiczna'), Text(0, 0, '0.40'), Text(0, 0, '0.45'), Text(0, 0, '0.50'), Text(0, 0, '0.55'), Text(0, 0, '0.60'), Text(0, 0, '0.65'), Text(0, 0, ''), Text(0, 0, 'Poniżej 0.66 zgodności'), Text(0, 0, 'Powyżej 0.66 zgodności')]
[('frag_len', 'lr'), ('frag_len', 'lr_update'), ('frag_len', 'maxn'), ('frag_len', 'minn'), ('frag_len', 'noise'), ('lr', 'lr_update'), ('lr', 'maxn'), ('lr', 'minn'), ('lr', 'noise'), ('lr_update', 'maxn'), ('lr_update', 'minn'), ('lr_update', 'noise'), ('maxn', 'minn'), ('maxn', 'noise'), ('minn', 'noise')]
[('frag_len', 'lr'), ('frag_len', 'lr_update'), 

In [6]:
import os
import pickle
import pathlib

# Location of the pickled classifier to inspect.
# Previously inspected:
#   '../phdna_proper_opt/fam_noise_05-07-2023.best_classifier/classifier.pkl'
CLASSIFIER_PKL = '../phdna_proper_opt/testo/best_classifier/classifier.pkl'

# The pickle presumably stores PosixPath objects (written on a POSIX machine —
# TODO confirm); those cannot be instantiated on Windows, so PosixPath is
# temporarily aliased to WindowsPath while unpickling. The alias is applied
# only on Windows and is always undone afterwards, so the rest of the
# notebook keeps seeing the real pathlib classes.
posix_path_cls = pathlib.PosixPath
if os.name == 'nt':
    pathlib.PosixPath = pathlib.WindowsPath
try:
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # classifier files that come from a trusted source.
    with open(CLASSIFIER_PKL, 'rb') as f:
        classifier = pickle.load(f)
finally:
    # Restore the original class even if loading fails.
    pathlib.PosixPath = posix_path_cls

print(classifier.dir)
print(classifier.fastdna_exe)
print(classifier.model)
print(classifier.name)

# Scratch code (disabled): re-save the classifier with a corrected model path.
# path = pathlib.Path('../phdna_proper_opt/fam_noise_05-07-2023.best_classifier_path_correct/classifier.pkl')
# path.parent.mkdir(exist_ok=True, parents=True)
# classifier.model = pathlib.PosixPath("/model.n8-7.lr0.49120-162.28460.d100.no2399.fl1213.e40.losoftmax.sa100.bin")
# print(classifier.model)

# # save pickle to file
# with open(path, 'wb') as f:
#     pickle.dump(classifier, f)

\home\mmichalczyk\phdna_tests\load_test\best_classifier
\home\mmichalczyk\phastDNA\fastDNA\fastdna
\home\mmichalczyk\phdna_tests\load_test\best_classifier\model.n6-6.lr0.54126-97.47427.d100.no2468.fl1139.e1.losoftmax.sa100.bin
model.n6-6.lr0.54126-97.47427.d100.no2468.fl1139.e1.losoftmax.sa100


In [None]:
import pandas as pd
import pygwalker as pyg

# Read the predictions table from disk and hand it to pygwalker's walk().
predictions_csv = '../phdna_tests/testo_load_testo/predictions.csv'
df = pd.read_csv(predictions_csv)
gwalker = pyg.walk(df)

In [43]:
# Show only the first few host records: pretty-printing the whole metadata
# mapping floods the cell output and bloats the saved notebook.
HOST_PREVIEW_SIZE = 3
pprint.pprint(dict(list(meta_host.items())[:HOST_PREVIEW_SIZE]))

# One row per host accession, columns taken from the metadata fields.
df_host = pd.DataFrame.from_dict(meta_host, orient='index')

# Expand each 'lineage_names' list into one column per rank, keyed by organism
# name. Assumes every lineage has exactly these seven ranks in this order
# (the column assignment below raises otherwise).
rank_columns = ['Superkingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
df_host_final = pd.DataFrame(
    df_host['lineage_names'].tolist(),
    index=df_host['organism_name'],
).reset_index()
df_host_final.columns = ['Organism Name', *rank_columns]

# Number of hosts per family (displayed as the cell result).
df_host_final['Family'].value_counts()

{'NC_000117': {'lineage_names': ['Bacteria',
                                 'Chlamydiae',
                                 'Chlamydiia',
                                 'Chlamydiales',
                                 'Chlamydiaceae',
                                 'Chlamydia',
                                 'Chlamydia trachomatis'],
               'lineage_ranks': ['superkingdom',
                                 'phylum',
                                 'class',
                                 'order',
                                 'family',
                                 'genus',
                                 'species'],
               'organism_name': 'Chlamydia trachomatis D/UW-3/CX',
               'taxid': '272561',
               'version': 'NC_000117.1'},
 'NC_000853': {'lineage_names': ['Bacteria',
                                 'Thermotogae',
                                 'Thermotogae',
                                 'Thermotogales',
                 

Enterobacteriaceae     239
Streptococcaceae       129
Burkholderiaceae       116
Bacillaceae             95
Chlamydiaceae           92
                      ... 
Sanguibacteraceae        1
Thermomonosporaceae      1
Kofleriaceae             1
Nakamurellaceae          1
Hahellaceae              1
Name: Family, Length: 247, dtype: int64

In [10]:
# Load the prediction matrix; the unnamed first CSV column holds the host
# names, every other column is one virus.
df_result = (
    pd.read_csv("../phdna_proper_opt/species_noise_04-07-2023/predictions.csv")
    .rename(columns={"Unnamed: 0": "Host"})
)

# Long format: one (Host, Virus, Score) row per matrix entry.
df_result_melted = df_result.melt(id_vars=["Host"], var_name="Virus", value_name="Score")

# Per-virus ranking: viruses in ascending order, scores descending within each
# virus, NaN scores last. A single multi-key sort replaces the former
# groupby(...).apply(sort_values), which ran a Python callback per group and
# operated on the grouping column (deprecated in recent pandas).
df_melted_sorted = (
    df_result_melted
    .sort_values(by=["Virus", "Score"], ascending=[True, False])
    .reset_index(drop=True)
)

# NOTE(review): this prints the unsorted long table, not df_melted_sorted —
# confirm that is the intended preview.
print(df_result_melted)


                                                 Host      Virus     Score
0                           Corynebacterium resistens  NC_019416  0.002441
1                         Corynebacterium aurimucosum  NC_019416  0.002354
2                               Bifidobacterium breve  NC_019416  0.001873
3                           Deinococcus proteolyticus  NC_019416  0.001395
4                                 Rothia mucilaginosa  NC_019416  0.001024
...                                               ...        ...       ...
1101255     Blattabacterium sp. Periplaneta americana  NC_021774       NaN
1101256  Blattabacterium sp. Mastotermes darwiniensis  NC_021774       NaN
1101257                         Beutenbergia cavernae  NC_021774       NaN
1101258                  Pseudonocardia dioxanivorans  NC_021774       NaN
1101259                   Blattabacterium punctulatus  NC_021774       NaN

[1101260 rows x 3 columns]
