In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from snmcseq_utils import plot_tsne_labels

In [41]:
# --- input tables (tab-separated) ---
tsne_f = './data/tsne/tsne_perp30_binc_mCH_human_combined_100000_summary_nmcc_v2.tsv'
cluster_f = './data/cluster/cluster_MB_v1_MB_EA_MB_EB/clusters_v1_binc_mCH_louvain.tsv'
cluster_annotation_f = './data/cluster/cluster_MB_v1_MB_EA_MB_EB/clusters_v1_binc_mCH_louvain_annotation.tsv'
metadata_f = './data/metadata/metadata_human_combined_updated.tsv'

# --- output files written by the cells below ---
output_f = './data/browser/human_MB_v1_MB_EA_MB_EB/tsne_points_ordered.csv'
output_gmch = './data/browser/human_MB_v1_MB_EA_MB_EB/mch/global_mCH.txt'
output_gmcg = './data/browser/human_MB_v1_MB_EA_MB_EB/mcg/global_mCG.txt'

# Per-sample tables are indexed by their sample column; the annotation
# table is indexed by cluster_ID so it can be joined on that key later.
df_tsne = pd.read_table(tsne_f, index_col='sample')
df_cluster = pd.read_table(cluster_f, index_col='sample')
df_cluster_anno = pd.read_table(cluster_annotation_f, index_col='cluster_ID')

# Only the biosample label and the two global methylation levels are kept
# from the metadata table.
df_meta = pd.read_table(metadata_f, index_col='Sample')
df_meta = df_meta[['Biosample', 'mCG/CG', 'mCH/CH']]

# One shape per line, in load order.
print(df_tsne.shape, df_cluster.shape, df_cluster_anno.shape, df_meta.shape, sep='\n')

(6435, 2)
(6435, 1)
(32, 1)
(6435, 3)


In [42]:
# Join tSNE coordinates, cluster assignment and metadata on the sample
# index, then attach the cluster annotation through the cluster_ID column.
df = (
    df_tsne
    .merge(df_cluster, left_index=True, right_index=True)
    .merge(df_meta, left_index=True, right_index=True)
    .merge(df_cluster_anno, left_on='cluster_ID', right_index=True)
)

# tsne_points_ordered.csv
df = df.rename_axis('samp')

# The three cluster label columns are all copies of cluster_ID.
df['cluster_final'] = df['cluster_ID']
df['cluster_name'] = df['cluster_ID']
df['cluster_label'] = df['cluster_ID']

# Integer that follows the 'cluster_' prefix of each cluster_ID.
df['cluster_ordered'] = [
    int(label[len('cluster_'):])
    for label in df['cluster_ID']
]

# Column is filled entirely with NaN.
df['cluster_ortholog'] = np.nan

def rename_col_names(cols):
    """Return a list of column names with 'Biosample' lower-cased.

    Args:
        cols: iterable of column names.

    Returns:
        A new list, in the same order as `cols`, in which every entry equal
        to 'Biosample' is replaced by 'biosample'; all other entries are
        passed through unchanged.
    """
    return ['biosample' if name == 'Biosample' else name for name in cols]


# Normalise column names ('Biosample' -> 'biosample') before selecting.
df.columns = rename_col_names(df.columns)

# Columns written to the output file, in output order.
required_columns = [
    'tsne_x',
    'tsne_y',
    'cluster_final',
    'cluster_name',
    'cluster_label',
    'cluster_ordered',
    'cluster_ortholog',
    'biosample',
    'cluster_annotation',
]

# Keep only those columns and order rows by the numeric cluster index.
df = df[required_columns].sort_values('cluster_ordered')

# save tsne_points_ordered (tab-separated, missing values written as 'NA')
df.to_csv(output_f, sep='\t', na_rep='NA', header=True, index=True)

# view
print(df.shape)
df.head()

(6435, 9)


Unnamed: 0_level_0,tsne_x,tsne_y,cluster_final,cluster_name,cluster_label,cluster_ordered,cluster_ortholog,biosample,cluster_annotation
samp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
160729_MB_v1_hs_25yr_MFG_pool_86_AD010_indexed,-12.506811,-52.198963,cluster_1,cluster_1,cluster_1,1,,MB_v1,hL2/3
171030_MB_EB_hs_25yr_BA10_FCL1_NA_E12_AD002_indexed,-23.449356,-47.344868,cluster_1,cluster_1,cluster_1,1,,MB_EB,hL2/3
171030_MB_EB_hs_25yr_BA10_FCL1_NA_E10_AD010_indexed,-21.853146,-50.612858,cluster_1,cluster_1,cluster_1,1,,MB_EB,hL2/3
171030_MB_EB_hs_25yr_BA10_FCL1_NA_E10_AD008_indexed,-29.088329,-62.679546,cluster_1,cluster_1,cluster_1,1,,MB_EB,hL2/3
171030_MB_EB_hs_25yr_BA10_FCL1_NA_E10_AD006_indexed,-18.936016,-70.780205,cluster_1,cluster_1,cluster_1,1,,MB_EB,hL2/3


In [43]:
# global mCH and mCG
#
# For each context, write a tab-separated, header-less table with the columns
# gene_name, Sample, <level>, n<level>, where gene_name is a constant label
# ('global_mCH' / 'global_mCG') and 'Sample' is the former index of df_meta.

df_mCH = df_meta[['mCH/CH']]
df_mCH = df_mCH.reset_index()
df_mCH['gene_name'] = 'global_mCH'
# The 'nmCH/CH' column is a plain copy of 'mCH/CH'; nothing is rescaled here.
df_mCH['nmCH/CH'] = df_mCH['mCH/CH']
df_mCH = df_mCH[['gene_name', 'Sample', 'mCH/CH', 'nmCH/CH']]
df_mCH.to_csv(output_gmch, sep='\t', na_rep='NA', header=False, index=False)
# Report the mCH table that was just written. df_mCG is not assigned until
# the statements below, so referencing it here fails on a fresh kernel.
print(df_mCH.shape)

df_mCG = df_meta[['mCG/CG']]
df_mCG = df_mCG.reset_index()
df_mCG['gene_name'] = 'global_mCG'
# The 'nmCG/CG' column is a plain copy of 'mCG/CG'; nothing is rescaled here.
df_mCG['nmCG/CG'] = df_mCG['mCG/CG']
df_mCG = df_mCG[['gene_name', 'Sample', 'mCG/CG', 'nmCG/CG']]
df_mCG.to_csv(output_gmcg, sep='\t', na_rep='NA', header=False, index=False)
print(df_mCG.shape)

(6435, 4)
(6435, 4)
