In [None]:
import os
import clustering
import alignment
import fake_alignment
import cluster_visualization
import template_generating

%matplotlib inline

In [None]:
from _cluster_file_functions import extract_clusters, record_clusters
from self_stopped_genetic_algorithm import selfstop_genetic_algorithm

In [None]:
# Pipeline configuration: input/output locations and sequence-format settings.
SEQUENCE_SOURCE_DIR = 'data1/'  # Folder with sequences
SEQUENCE_SOURCE_NAME = 'patient_traces_test'  # Name of the sequences file without extension ('.txt' is appended when reading)
OUT_DIR_CLUSTERS = 'data1/Clusters/'  # Base folder for clusters (a SEQUENCE_SOURCE_NAME sub-folder is appended in a later cell)
OUT_DIR_VISUALIZATION = 'data1/Paths'  # Folder with graphs
OUTCOMES_FILE = None  # Passed to cluster_visualization.Graph; presumably None means "no outcomes file" - TODO confirm
NODE_PARAMETERS_DIR = '_nodes_parameters/clinical_pathways_params'  # Location of the node settings (names_dict.txt is required!!!)
EXPERT_TEMPLATE_FILE = 'data1/expert_templates.txt'  # Manual templates
GENETIC_TEMPLATE_FILE = 'data1/genetic_templates.txt'  # Empty file at start
ALPHABET = 'AFNIED'  # All possible states in any order
SEQUENCE_SEPARATOR = '\t'  # Separator used inside the sequences file
encoding = 'cp1251'  # Text encoding passed to extract_clusters when reading the sequences file

In [None]:
# Load the sequences and their case identifiers from the source file.
sequence_source_path = os.path.join(SEQUENCE_SOURCE_DIR,
                                    '{}.txt'.format(SEQUENCE_SOURCE_NAME))
loaded_vectors, loaded_cases = extract_clusters(
    sequence_source_path,
    eval_=False,
    header=True,
    encoding=encoding,
    sep=SEQUENCE_SEPARATOR,
)
# Keep only the first entry of each returned collection.
vectors, cases = loaded_vectors[0], loaded_cases[0]
print(len(vectors), 'sequences')

In [None]:
# Put this dataset's clusters into their own sub-folder.
# The suffix is appended only when it is not already there, so re-running
# this cell does not keep nesting 'name/name/...' onto the path.
# NOTE(review): a base folder that itself already ends with
# SEQUENCE_SOURCE_NAME + '/' would be left unchanged - confirm that is acceptable.
_dataset_subdir = SEQUENCE_SOURCE_NAME + '/'
if not OUT_DIR_CLUSTERS.endswith(_dataset_subdir):
    OUT_DIR_CLUSTERS += _dataset_subdir

#### Distance matrix

In [None]:
# Path of the similarity-matrix file for this dataset.
distance_matrix_file = OUT_DIR_CLUSTERS + SEQUENCE_SOURCE_NAME + '_similarity_matrix.txt'

# NOTE(review): whether the matrix is loaded from this file or computed and
# saved to it is decided inside clustering.get_similarity_matrix - confirm there.
lev_sim_matrix = clustering.get_similarity_matrix(distance_matrix_file, vectors)

In [None]:
# Minimum and maximum number of clusters for clustering (less than 3 is not allowed).
# Both values are passed to clustering.cluster_in_range; MIN_NUM_CLUSTER is also
# passed to clustering.visualize_cluster_metrics.
MIN_NUM_CLUSTER = 3
MAX_NUM_CLUSTER = 10

In [None]:
# Run the clustering over the configured range of cluster counts and keep
# the two quality metrics it returns.
var_metric, sill_metric = clustering.cluster_in_range(
    MIN_NUM_CLUSTER, MAX_NUM_CLUSTER, lev_sim_matrix, vectors, cases, OUT_DIR_CLUSTERS)

# Visualize both metrics; the labels are given in the same order as the values.
metric_values = [var_metric, sill_metric]
metric_labels = ['variance', 'silhouette']
clustering.visualize_cluster_metrics(
    MIN_NUM_CLUSTER, OUT_DIR_CLUSTERS, metric_values, metric_labels)


In [None]:
# Choose a number of clusters using metrics above.
# The value selects which 'KMeans<N>.txt' file is read in the next cell.
num_of_clusters = 6

In [None]:
# Cluster file that corresponds to the chosen number of clusters.
cluster_source = '%sKMeans%s.txt' % (OUT_DIR_CLUSTERS, num_of_clusters)
clusters, cluster_cases = extract_clusters(cluster_source, sep='\t', eval_=False)

#### Genetic templates

In [None]:
# Folder for the generated templates, named after the cluster file without its extension.
# os.path.splitext strips only the final extension; splitting on the first '.'
# would cut the name short whenever the path contains another dot.
cluster_source_stem = os.path.splitext(os.path.basename(cluster_source))[0]
template_dir = os.path.join(SEQUENCE_SOURCE_DIR, 'Template_generating',
                            SEQUENCE_SOURCE_NAME, cluster_source_stem)

In [None]:
# Settings for the genetic template search, kept together for easy tuning.
# NOTE(review): if the algorithm is randomized, results will differ between
# runs unless a seed is fixed - confirm in self_stopped_genetic_algorithm.
genetic_settings = dict(
    alphabet=ALPHABET,
    mutation_num=10,
    increment=10,
    animation_and_plot=False,
)
selfstop_genetic_algorithm(clusters, template_dir, **genetic_settings)

In [None]:
# Only the file name of GENETIC_TEMPLATE_FILE is passed; its folder is given
# separately as SEQUENCE_SOURCE_DIR.
genetic_template_name = os.path.basename(GENETIC_TEMPLATE_FILE)
template_generating.choose_best_templates(
    template_dir, SEQUENCE_SOURCE_DIR, cluster_source, genetic_template_name)



In [None]:
# Use the genetic templates for the alignment below.
template_file = GENETIC_TEMPLATE_FILE

In [None]:
# Alignment output file, placed next to the cluster file.
# os.path.splitext removes only the trailing extension, so dots elsewhere
# in the path (folder or dataset name) do not truncate it.
file_out = os.path.splitext(cluster_source)[0] + '_full_alignment.txt'

#### Draw clusters with graphviz

In [None]:
# Align every cluster against the templates in template_file, using file_out
# as the output path, then read file_out back in.
# this_clusters, this_cases and num_state are read as globals by draw_all_clusters.
num_state = alignment.align_all(clusters, cluster_cases, file_out, template_file)
this_clusters, this_cases = extract_clusters(file_out, sep='\t', eval_=False)

In [None]:
def draw_all_clusters(add_name='', clusters=None, cases=None,
                      alignment_file=None, states_count=None):
    """Build a graph for every cluster and export each one via ``to_gv``.

    Parameters
    ----------
    add_name : str
        Suffix appended to the graph name derived from ``alignment_file``.
    clusters, cases : sequence, optional
        Clusters and their matching cases (same order). Default to the
        notebook globals ``this_clusters`` / ``this_cases``.
    alignment_file : str, optional
        Path used to derive the graph name. Defaults to the global ``file_out``.
    states_count : optional
        Value passed to ``Graph`` as its state count. Defaults to the
        global ``num_state``.

    Returns
    -------
    list
        Values returned by ``graph.to_gv`` for every cluster that could be
        exported; clusters for which ``to_gv`` raises ``AssertionError`` are
        reported as empty and skipped.
    """
    # Fall back to the notebook globals at call time (not at definition time)
    # so that a plain draw_all_clusters() keeps working exactly as before.
    if clusters is None:
        clusters = this_clusters
    if cases is None:
        cases = this_cases
    if alignment_file is None:
        alignment_file = file_out
    if states_count is None:
        states_count = num_state

    graphs_paths = []
    for i, cluster in enumerate(clusters):
        graph_name = cluster_visualization.get_file_name(alignment_file) + add_name
        graph = cluster_visualization.Graph(cluster,
                                            cases[i],
                                            graph_name,
                                            i,
                                            states_count,
                                            NODE_PARAMETERS_DIR,
                                            OUTCOMES_FILE,
                                            add_figures=False)
        graph.collect_edges()

        try:
            path_to_graph = graph.to_gv(OUT_DIR_VISUALIZATION, 0.0, 0.5)
            graphs_paths.append(path_to_graph)
        except AssertionError:
            # to_gv signals an empty cluster with an assertion; skip it.
            print('Cluster #{} is empty'.format(i))
    return graphs_paths

In [None]:
# Draw the clusters aligned with the genetic templates; keeps the returned graph paths.
graphs_paths = draw_all_clusters()

#### Clusters review

In [None]:
import ipywidgets as widgets
from IPython.display import Image
from ipywidgets import interact

In [None]:
# All graphs are looked up in the folder that holds the first generated graph.
path_with_graphs = os.path.dirname(graphs_paths[0])

In [None]:
def show_chains_with_clusters(path_with_graphs):
    """Show an interactive viewer for the PNG graphs stored in a folder.

    A dropdown selects the image and a slider controls its display width.

    Parameters
    ----------
    path_with_graphs : str
        Folder that is scanned for files ending in ``.png``.

    Notes
    -----
    When the folder holds no such files, a message is printed and no widget
    is created.
    """

    def display_chain(x, width):
        # Height is fixed; only the width follows the slider.
        return Image(os.path.join(path_with_graphs, x), width=width, height=300)

    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # dropdown (and its initial selection) stable between runs.
    png_graphs = sorted(f for f in os.listdir(path_with_graphs) if f.endswith('.png'))

    if not png_graphs:
        # Nothing to select: building the dropdown would fail on png_graphs[0].
        print('No PNG graphs found in {}'.format(path_with_graphs))
        return

    x_widget = widgets.Dropdown(
        options=png_graphs,
        value=png_graphs[0],
        description='Кластер №',
        disabled=False,
    )

    width_slider = widgets.IntSlider(min=100,
                                     max=1000,
                                     step=1,
                                     value=700,
                                     description='Размер:')

    interact(display_chain,
             x=x_widget,
             width=width_slider,
             )

In [None]:
# Browse the PNG files found in the folder of the genetic-template graphs.
show_chains_with_clusters(path_with_graphs)

#### Statistics 

In [None]:
import pandas as pd
from stat_calculator import cluster_report

In [None]:
# Statistics for the genetic-template alignment (cluster file + alignment file).
cluster_report(cluster_source, file_out)

#### Expert templates

In [None]:
# Repeat the alignment with the expert (manual) templates.
template_file_spec = EXPERT_TEMPLATE_FILE
# os.path.splitext removes only the trailing extension, so dots elsewhere
# in the path do not truncate the output name.
file_out_spec = os.path.splitext(cluster_source)[0] + '_full_alignment_specialist.txt'

# NOTE: this overwrites num_state, this_clusters and this_cases from the
# genetic-template alignment; draw_all_clusters reads them as globals.
num_state = alignment.align_all(clusters, cluster_cases, file_out_spec, template_file_spec)
this_clusters, this_cases = extract_clusters(file_out_spec, sep='\t', eval_=False)

# The '_spec' suffix keeps these graph names distinct from the genetic-template ones.
graphs_paths_spec = draw_all_clusters(add_name='_spec')


In [None]:
# Browse the PNG files in the folder that holds the first expert-template graph.
path_with_graphs_spec = os.path.dirname(graphs_paths_spec[0])
show_chains_with_clusters(path_with_graphs_spec)

In [None]:
# Statistics for the expert-template alignment (cluster file + alignment file).
cluster_report(cluster_source, file_out_spec)

#### Fake alignment (Cyclic)

In [None]:
# Output file for the cyclic ("fake") alignment, placed next to the cluster file.
# os.path.splitext removes only the trailing extension, so dots elsewhere
# in the path do not truncate the name.
file_alignment = os.path.splitext(cluster_source)[0] + '_fake_alignment.txt'

# Align every sequence; one CyclicAlignment instance is created per cluster
# and shared by all sequences of that cluster.
aligned_clusters = []
for cluster in clusters:
    cyclic_alignment = fake_alignment.CyclicAlignment()
    aligned_clusters.append([cyclic_alignment.align(sequence) for sequence in cluster])

record_clusters(file_alignment, aligned_clusters, cluster_cases, sep='\t')

# Draw one graph per aligned cluster and collect what to_gv returns.
fake_graphs_append = []
for i, cluster in enumerate(aligned_clusters):
    graph = cluster_visualization.Graph(cluster, cluster_cases[i],
                                        cluster_visualization.get_file_name(file_alignment), i, 1,
                                        NODE_PARAMETERS_DIR, OUTCOMES_FILE)
    graph.collect_edges()
    try:
        fake_graphs_append.append(graph.to_gv(OUT_DIR_VISUALIZATION, 0.0, 0.3))
    except AssertionError:
        # Same handling as draw_all_clusters: an empty cluster is reported and skipped.
        print('Cluster #{} is empty'.format(i))

In [None]:
# Browse the PNG files in the folder that holds the first cyclic-alignment graph.
path_with_fake_graphs = os.path.dirname(fake_graphs_append[0])
show_chains_with_clusters(path_with_fake_graphs)

In [None]:
# Statistics for the cyclic alignment (cluster file + alignment file).
cluster_report(cluster_source, file_alignment)

#### Clusters to dataframe and vice versa 

In [None]:
from _cluster_file_functions import clusters_to_pd, pd_to_clusters

In [None]:
# Convert the in-memory clusters and their cases with clusters_to_pd;
# the bare name on the last line displays the result.
cluster_df = clusters_to_pd(clusters, cluster_cases)
cluster_df

In [None]:
# Same conversion, this time given the cluster file path instead of in-memory lists.
# This overwrites cluster_df from the previous cell.
cluster_df = clusters_to_pd(cluster_file=cluster_source)
cluster_df

In [None]:
# Convert cluster_df back with pd_to_clusters and display both returned values.
clusters_from_df, cases_from_df = pd_to_clusters(cluster_df)
clusters_from_df, cases_from_df