In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Remove every pandas display cap (column count, row count, cell width) so frames render in full.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', None,
)

In [2]:
def calculate_distance_stat(dataframe, list_sample=False):
    """Print the sample count and the mean/min/max distance of a distance matrix.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Distance matrix labelled by sample id on both axes. Missing cells
        (NaN) are ignored by the statistics.
    list_sample : list-like or False, optional
        Sample ids used to restrict both the rows and the columns before the
        statistics are computed. ``False`` (the default) or ``None`` keeps the
        whole matrix.

    Returns
    -------
    None
        The summary line is printed (in Spanish), not returned.
    """
    # Identity checks instead of ``!= False``: comparing a NumPy array, Index or
    # Series with False is element-wise and cannot be used as an ``if`` condition.
    if list_sample is not False and list_sample is not None:
        dataframe = dataframe.loc[list_sample, list_sample]
    # One value per matrix cell; skipna guards against NaN cells that survive stack().
    stacked_df = dataframe.stack()
    mean_distance = stacked_df.mean(skipna=True)
    min_distance = stacked_df.min(skipna=True)
    max_distance = stacked_df.max(skipna=True)
    print("Este cluster tiene %s muestras, con una distancia media de %.2f, rango [%.0f - %.0f]" % (len(dataframe.columns), mean_distance, min_distance, max_distance))

In [3]:
def pairwise_to_cluster(pw,threshold = 20):
    """Group samples into single-linkage clusters from a table of pairwise distances.

    Two samples land in the same group when they are connected by a chain of
    pairs whose distance is ``<= threshold``; two members of one group can
    therefore be further apart than the threshold. Samples that only occur in
    pairs above the threshold each become a group of their own.

    Parameters
    ----------
    pw : pandas.DataFrame
        Long-format table read by position: the first two columns hold the
        sample ids of a pair and the third column their distance. Rows whose
        two ids are equal are ignored.
    threshold : number, default 20
        Largest distance at which a pair still links two samples.

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``group`` (0-based group number) and
        ``id`` (sample id as ``str``). Groups are numbered in the order they
        are first formed while walking the linking pairs from the smallest
        distance upwards; when two groups merge the lower number is kept and
        the later groups shift down by one. Members are listed in the order
        they joined, so the result is reproducible between runs.
    """
    columns = pw.columns.tolist()
    first_id, second_id, distance = columns[0], columns[1], columns[2]

    distinct_pairs = pw[pw[first_id] != pw[second_id]]
    linked = distinct_pairs[distinct_pairs[distance] <= threshold].sort_values(by=[distance])
    unlinked = distinct_pairs[distinct_pairs[distance] > threshold]

    clusters = []  # position in this list == group number
    owner = {}     # sample id -> the list in `clusters` that currently holds it

    def position_of(cluster):
        # Identity search: the current group number of a cluster list.
        return next(i for i, candidate in enumerate(clusters) if candidate is cluster)

    # Ids are converted to str once, so membership tests and stored members
    # always use the same type (non-string ids previously never matched).
    for sample_1, sample_2 in zip(linked[first_id].astype(str), linked[second_id].astype(str)):
        if sample_1 == sample_2:
            continue
        cluster_1 = owner.get(sample_1)
        cluster_2 = owner.get(sample_2)
        if cluster_1 is None and cluster_2 is None:
            # Neither sample seen yet: open a new group at the end.
            fresh = [sample_1, sample_2]
            clusters.append(fresh)
            owner[sample_1] = owner[sample_2] = fresh
        elif cluster_2 is None:
            cluster_1.append(sample_2)
            owner[sample_2] = cluster_1
        elif cluster_1 is None:
            cluster_2.append(sample_1)
            owner[sample_1] = cluster_2
        elif cluster_1 is not cluster_2:
            # The pair bridges two groups: fold the later one into the earlier
            # one; deleting it renumbers every group that followed.
            keep, absorb = sorted((cluster_1, cluster_2), key=position_of)
            keep.extend(absorb)
            for member in absorb:
                owner[member] = keep
            del clusters[position_of(absorb)]

    # Samples that never linked to anything become single-member groups,
    # in the order they appear in the input table.
    for pair in zip(unlinked[first_id].astype(str), unlinked[second_id].astype(str)):
        for sample in pair:
            if sample not in owner:
                singleton = [sample]
                clusters.append(singleton)
                owner[sample] = singleton

    # Build the long table directly instead of stacking a padded ragged frame,
    # which depended on stack() dropping the padding and broke on empty input.
    records = [(number, sample) for number, members in enumerate(clusters) for sample in members]
    return pd.DataFrame(records, columns=['group', 'id'])

# READ DISTANCE MATRIX

In [4]:
# Load the distance matrix; the first CSV column becomes the row labels.
dfdist = pd.read_csv(
    '/processing_Data/antibioticos/sgarciacobos/projects/virulence_kpOXA48_NL_ES/ANALYSIS/snippy/clean.core.aln.distance.csv',
    sep=",",
    index_col=0,
)

In [5]:
dfdist

Unnamed: 0,544651_Kpn,544663_Kpn,544680_Kpn,544689_Kpn,544719_Kpn,544723_Kpn,544732_Kpn,544758_Kpn,544762_Kpn,544780_Kpn,544789_Kpn,544798_Kpn,544803_Kpn,544832_Kpn,544892_Kpn,544900_Kpn,544974_Kpn,544989_Kpn,545011_Kpn,545048_Kpn,545068_Kpn,545130_Kpn,545142_Kpn,545163_Kpn,545164_Kpn,545169_Kpn,545271_Kpn,545330_Kpn,545376_Kpn,545403_Kpn,545424_Kpn,545426_Kpn,545457_Kpn,545473_Kpn,545555_Kpn,545567_Kpn,545624_Kpn,545661_Kpn,545734_Kpn,545781_Kpn,545786_Kpn,545804_Kpn,545814_Kpn,545831_Kpn,545833_Kpn,545836_Kpn,545855_Kpn,545889_Kpn,545899_Kpn,545906_Kpn,545907_Kpn,545916_Kpn,545925_Kpn,545933_Kpn,545967_Kpn,545980_Kpn,545991_Kpn,546017_Kpn,546019_Kpn,546023_Kpn,546073_Kpn,546076_Kpn,546083_Kpn,546095_Kpn,546103_Kpn,546156_Kpn,546197_Kpn,546234_Kpn,ISC01,ISC02,ISC03,ISC04,ISC05,ISC06,ISC07,ISC08,ISC09,ISC12,ISC13,ISC14,ISC15,ISC16,ISC17,ISC18,ISC19,ISC20,ISC21,ISC22,ISC23,ISC25,ISC26,ISC27,ISC28,ISC29,ISC30,ISC31,ISC32,ISC33,ISC34,ISC35,ISC36,ISC37,ISC38,ISC39,ISC40,ISC41,ISC42,ISC43,ISC44,ISC45,ISC46,ISC47,ISC48,ISC49,ISC50,ISC51,ISC52,ISC53,ISC54,ISC55,ISC56,ISC57,Reference
544651_Kpn,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544663_Kpn,3515.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544680_Kpn,3224.0,3387.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544689_Kpn,3326.0,3335.0,3439.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544719_Kpn,3239.0,3449.0,2696.0,3387.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544723_Kpn,3522.0,27.0,3394.0,3342.0,3456.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544732_Kpn,3174.0,3461.0,2888.0,3340.0,3161.0,3468.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544758_Kpn,3209.0,3464.0,2983.0,3316.0,3034.0,3471.0,3115.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544762_Kpn,3471.0,3656.0,3516.0,3373.0,3568.0,3663.0,3518.0,3553.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
544780_Kpn,3237.0,3451.0,2680.0,3375.0,182.0,3458.0,3171.0,3016.0,3564.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## MAKE PAIRWISE
### (optional)

In [6]:
# Melt the distance matrix into one row per (sample_1, sample_2, distance) pair.
# dropna() keeps empty matrix cells out of the table even on pandas versions whose
# stack() no longer discards NaN, and rename_axis() names the two id columns
# directly instead of relying on the default 'level_0'/'level_1' labels, which
# only exist when the matrix axes are unnamed.
pairwise = (
    dfdist.stack()
    .dropna()
    .rename_axis(['sample_1', 'sample_2'])
    .reset_index(name='distance')
)

In [7]:
pairwise.head()

Unnamed: 0,sample_1,sample_2,distance
0,544663_Kpn,544651_Kpn,3515.0
1,544680_Kpn,544651_Kpn,3224.0
2,544680_Kpn,544663_Kpn,3387.0
3,544689_Kpn,544651_Kpn,3326.0
4,544689_Kpn,544663_Kpn,3335.0


In [8]:
#matrix_to_matrix = dfdist.stack().reset_index().groupby(['level_0', 'level_1']).mean().unstack()
#matrix_to_matrix.iloc[0,0]

# MATRIZ COMPLETA

In [9]:
calculate_distance_stat(dfdist)

Este cluster tiene 123 muestras, con una distancia media de 2958.52, rango [0 - 3670]


# ESTABLECER CLUSTERS

In [11]:
# Group the samples, linking every pair whose distance is <= 15.
# NOTE(review): the summary is later written to 'stats_cluster_5SNP.tsv' although the
# threshold used here is 15 - confirm which value is intended.
clusters = pairwise_to_cluster(pairwise,threshold = 15)

In [12]:
clusters

Unnamed: 0,group,id
0,0,545048_Kpn
1,0,545142_Kpn
2,0,ISC56
3,0,544989_Kpn
4,0,ISC40
5,0,544803_Kpn
6,0,545164_Kpn
7,0,544789_Kpn
8,0,545567_Kpn
9,0,546095_Kpn


In [13]:
# Collapse the long (group, id) table into one row per group holding the list of its sample ids.
cluster_summary = (
    clusters
    .groupby('group')['id']
    .apply(list)
    .reset_index(name='samples')
)

In [14]:
cluster_summary

Unnamed: 0,group,samples
0,0,"[545048_Kpn, 545142_Kpn, ISC56, 544989_Kpn, ISC40, 544803_Kpn, 545164_Kpn, 544789_Kpn, 545567_Kpn, 546095_Kpn]"
1,1,"[546076_Kpn, 545933_Kpn, 546083_Kpn, 545068_Kpn, 545925_Kpn, 545833_Kpn, 545836_Kpn, 544798_Kpn, 545967_Kpn, 546234_Kpn]"
2,2,"[ISC01, ISC41, ISC03, ISC08, ISC13, ISC29, ISC02, ISC32, ISC57, ISC52, ISC44, ISC47, ISC37, ISC39]"
3,3,"[ISC04, ISC35, ISC15, ISC28, ISC23, ISC09, ISC27, ISC25, ISC36, ISC07, 544689_Kpn, ISC34, ISC17, ISC12, ISC30, ISC20, ISC21]"
4,4,"[ISC19, ISC18, ISC45, ISC55, ISC46, ISC51, ISC05]"
5,5,"[545889_Kpn, 545781_Kpn, 545011_Kpn, 545907_Kpn]"
6,6,"[545403_Kpn, 545426_Kpn, 545473_Kpn, 545130_Kpn]"
7,7,"[545555_Kpn, ISC42, ISC48, ISC06, 545271_Kpn, 545786_Kpn, ISC16]"
8,8,"[ISC54, ISC38]"
9,9,"[ISC22, ISC50, ISC33, ISC49]"


# POR GRUPOS

In [15]:
def calculate_N(row):
    """Return how many entries the row's ``samples`` field holds."""
    members = row.samples
    return len(members)

In [16]:
def calculate_mean_distance(row, df):
    """Return ``(mean, min, max)`` of the distances between the samples of one cluster.

    Parameters
    ----------
    row : object with ``N`` and ``samples`` attributes
        ``samples`` is the list of sample ids of the cluster and ``N`` its size.
    df : pandas.DataFrame
        Distance matrix labelled by sample id on both axes; NaN cells are ignored.

    Returns
    -------
    tuple
        ``(mean rounded to 2 decimals, min, max)``. For a cluster with a single
        sample (``N <= 1``) all three values are float NaN, so every row yields
        a 3-tuple of numbers and the expanded columns stay numeric. A bare
        ``'NaN'`` string would be broadcast as text into those columns.
    """
    if row.N > 1:
        members = row.samples
        # Restrict the matrix to the cluster and flatten it to one value per cell.
        within = df.loc[members, members].stack()
        return (
            round(within.mean(skipna=True), 2),
            within.min(skipna=True),
            within.max(skipna=True),
        )
    missing = float("nan")
    return missing, missing, missing

In [17]:
# Add the cluster size (number of sample ids per row) as column 'N'.
cluster_summary = cluster_summary.assign(
    N=cluster_summary.apply(calculate_N, axis=1)
)

In [18]:
# Largest clusters first.
cluster_summary = cluster_summary.sort_values('N', ascending=False)

In [16]:
# Add per-cluster mean/min/max distance columns computed from the distance matrix.
# NOTE(review): the identical statement is executed again further down, after the groups
# are renumbered; this earlier copy looks redundant - confirm before removing it.
cluster_summary[['mean', 'min', 'max']] = cluster_summary.apply(lambda x: calculate_mean_distance(x, dfdist), axis=1, result_type="expand")

In [37]:
# Row labels in ascending order, each shifted by one.
sorted_index = [label + 1 for label in sorted(cluster_summary.index.to_list())]

In [42]:
# Overwrite the group numbers positionally with the values in sorted_index.
cluster_summary = cluster_summary.assign(group=sorted_index)

In [43]:
# Show the summary with a fresh 0..n-1 row index (display only; cluster_summary itself is unchanged).
cluster_summary.reset_index(drop=True)

Unnamed: 0,group,samples,N
0,1,"[ISC04, ISC35, ISC15, ISC28, ISC23, ISC09, ISC27, ISC25, ISC36, ISC07, 544689_Kpn, ISC34, ISC17, ISC12, ISC30, ISC20, ISC21]",17
1,2,"[ISC01, ISC41, ISC03, ISC08, ISC13, ISC29, ISC02, ISC32, ISC57, ISC52, ISC44, ISC47, ISC37, ISC39]",14
2,3,"[545048_Kpn, 545142_Kpn, ISC56, 544989_Kpn, ISC40, 544803_Kpn, 545164_Kpn, 544789_Kpn, 545567_Kpn, 546095_Kpn]",10
3,4,"[546076_Kpn, 545933_Kpn, 546083_Kpn, 545068_Kpn, 545925_Kpn, 545833_Kpn, 545836_Kpn, 544798_Kpn, 545967_Kpn, 546234_Kpn]",10
4,5,"[ISC19, ISC18, ISC45, ISC55, ISC46, ISC51, ISC05]",7
5,6,"[545555_Kpn, ISC42, ISC48, ISC06, 545271_Kpn, 545786_Kpn, ISC16]",7
6,7,"[544723_Kpn, 545855_Kpn, 544974_Kpn, 546017_Kpn]",4
7,8,"[545889_Kpn, 545781_Kpn, 545011_Kpn, 545907_Kpn]",4
8,9,"[545403_Kpn, 545426_Kpn, 545473_Kpn, 545130_Kpn]",4
9,10,"[ISC22, ISC50, ISC33, ISC49]",4


In [50]:
# Per-cluster distance statistics: each row's result is expanded into the three new columns.
cluster_summary[['mean', 'min', 'max']] = cluster_summary.apply(
    calculate_mean_distance,
    axis=1,
    result_type="expand",
    args=(dfdist,),
)

In [51]:
cluster_summary

Unnamed: 0,group,samples,N,mean,min,max
3,1,"[ISC04, ISC35, ISC15, ISC28, ISC23, ISC09, ISC27, ISC25, ISC36, ISC07, 544689_Kpn, ISC34, ISC17, ISC12, ISC30, ISC20, ISC21]",17,5.96,0.0,13.0
2,2,"[ISC01, ISC41, ISC03, ISC08, ISC13, ISC29, ISC02, ISC32, ISC57, ISC52, ISC44, ISC47, ISC37, ISC39]",14,3.49,0.0,12.0
0,3,"[545048_Kpn, 545142_Kpn, ISC56, 544989_Kpn, ISC40, 544803_Kpn, 545164_Kpn, 544789_Kpn, 545567_Kpn, 546095_Kpn]",10,14.47,0.0,21.0
1,4,"[546076_Kpn, 545933_Kpn, 546083_Kpn, 545068_Kpn, 545925_Kpn, 545833_Kpn, 545836_Kpn, 544798_Kpn, 545967_Kpn, 546234_Kpn]",10,8.0,0.0,14.0
4,5,"[ISC19, ISC18, ISC45, ISC55, ISC46, ISC51, ISC05]",7,5.9,0.0,10.0
7,6,"[545555_Kpn, ISC42, ISC48, ISC06, 545271_Kpn, 545786_Kpn, ISC16]",7,12.48,1.0,22.0
12,7,"[544723_Kpn, 545855_Kpn, 544974_Kpn, 546017_Kpn]",4,11.67,8.0,14.0
5,8,"[545889_Kpn, 545781_Kpn, 545011_Kpn, 545907_Kpn]",4,6.33,0.0,12.0
6,9,"[545403_Kpn, 545426_Kpn, 545473_Kpn, 545130_Kpn]",4,7.5,0.0,15.0
9,10,"[ISC22, ISC50, ISC33, ISC49]",4,5.0,4.0,6.0


In [47]:
# Back to long format: one row per (group, sample id) with a fresh row index.
final_cluster = (
    cluster_summary.loc[:, ["group", "samples"]]
    .explode("samples")
    .reset_index(drop=True)
)

In [48]:
final_cluster

Unnamed: 0,group,samples
0,1,ISC04
1,1,ISC35
2,1,ISC15
3,1,ISC28
4,1,ISC23
5,1,ISC09
6,1,ISC27
7,1,ISC25
8,1,ISC36
9,1,ISC07


In [18]:
# Write the per-cluster summary as a tab-separated file, without the row index.
# NOTE(review): the file name says '5SNP' while the clusters in this notebook were built with
# threshold = 15 - confirm the intended file name before overwriting earlier results.
cluster_summary.to_csv('/processing_Data/antibioticos/sgarciacobos/projects/virulence_kpOXA48_NL_ES/ANALYSIS/snippy/stats_cluster_5SNP.tsv', sep='\t', index=False)