In [1]:
import os
import pandas as pd 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, cross_val_score
# from sklearn.manifold import TSNE
import matplotlib  as mpl
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
%matplotlib inline
import os, shutil, glob
from random import randint
import seaborn as sns; sns.set_style("white")
import umap as umap

# import plotnine as gg
# from cytominer_eval import evaluate

# Display the working directory: the './output/...' paths configured in the
# next cell are relative to it.
os.getcwd()

2023-08-30 09:13:10.369575: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'/scratch2-shared/david/colopaint3D/notebooks'

In [2]:
# NOTE(review): figformat, dpi and statarg are presumably export/statistics
# settings; the plotting helpers visible in this notebook hard-code '.png' and
# do not read them — confirm whether they are used elsewhere.
figformat = 'png'
dpi = 300
statarg = 'single'

# Directory the PCA/UMAP figures are written to (relative to the cwd).
OutputDir = './output/3_PCAUMAP_MeanMedian'
# exist_ok=True makes the cell idempotent without a separate exists() check.
os.makedirs(OutputDir, exist_ok=True)

# Directory the '<name>.parquet' feature tables are read from.
FeatureDir = './output/1_FeaturesImages'

# Each base filename is combined with each statistic suffix by the driver loop.
statmet = ['Median', 'Mean']
filenames = ['spheroidspher010-DLD1-L5', 'spheroidspher010-HCT15-L2', 'spheroidspher010-HT29-L4']

In [4]:
### Helpers for the Median/Mean feature tables: loading, label encoding, PCA and UMAP plots

def readData(filename):
    """Load one feature table and split it into a feature matrix and labels.

    Reads ``{FeatureDir}/{filename}.parquet``, drops bookkeeping columns
    (any column whose name contains one of the substrings listed below), and
    returns the remaining non-metadata columns as a NumPy array together with
    the compound labels produced by ``oneHot``.

    Parameters
    ----------
    filename : str
        Name of the parquet file inside ``FeatureDir``, without extension.

    Returns
    -------
    dataNpy : numpy.ndarray
        One row per table row; all kept columns whose name contains neither
        'Metadata_' nor 'onehot'.
    dataLabel : pandas.DataFrame
        The frame returned by ``oneHot`` ('Metadata_Cmpd' and 'onehot').
    """
    training_data = pd.read_parquet(f'{FeatureDir}/{filename}.parquet').reset_index(drop=True)

    # Column-name fragments that mark non-feature columns. They are joined into
    # a single regex alternation, which removes exactly the columns the former
    # chain of one-pattern-at-a-time filters removed.
    excluded_fragments = (
        'Location',
        'ImageNumber_',
        'Parent',
        'Children',
        '_ObjectNumber',
        '_Object_Number',
        '_Y',
        '_X',
    )
    is_excluded = training_data.columns.str.contains('|'.join(excluded_fragments), case=True)
    # .copy() detaches the result so later column writes cannot hit a view.
    training_data = training_data.loc[:, ~is_excluded].copy()

    # Numeric feature matrix: everything that is neither metadata nor a label.
    is_label_or_meta = training_data.columns.str.contains('Metadata_|onehot')
    dataNpy = training_data.loc[:, ~is_label_or_meta].reset_index(drop=True).to_numpy()

    dataLabel = oneHot(training_data)

    return dataNpy, dataLabel

def oneHot(training_data):
    """Build the label table: compound name plus an integer code per compound.

    Despite the name, this is an integer label encoding, not a one-hot matrix:
    each distinct value of ``training_data['Metadata_Cmpd']`` is numbered
    0, 1, 2, ... in order of first appearance.

    The input frame is not modified; only the returned frame carries the
    'onehot' column.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must contain a 'Metadata_Cmpd' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Metadata_Cmpd' and 'onehot', indexed like ``training_data``.
    """
    compounds = training_data['Metadata_Cmpd']

    # unique() preserves first-appearance order, so codes are stable for a
    # given row order.
    code_of = {cmpd: code for code, cmpd in enumerate(compounds.unique())}
    # -1 is the fallback code for any value that cannot be found in the mapping.
    codes = compounds.apply(lambda cmpd: code_of.get(cmpd, -1))

    return pd.DataFrame({'Metadata_Cmpd': compounds, 'onehot': codes})

def makePCA(dataN, dataPCA, name='', n_components=2):
    """Fit a PCA on ``dataN``, plot PC 1 vs PC 2, and return the scores.

    Two panels are drawn side by side: all rows, and the rows whose
    'Metadata_Cmpd' is neither 'dmso' nor 'sorb'. The figure is saved to
    ``{OutputDir}/{name}_pca.png`` and shown.

    Parameters
    ----------
    dataN : array-like
        Feature matrix passed to ``PCA.fit`` / ``PCA.transform``.
    dataPCA : pandas.DataFrame
        Label table with a 'Metadata_Cmpd' column and one row per row of
        ``dataN``. It is copied, not modified.
    name : str
        Used in the panel titles and the output file name.
    n_components : int
        Forwarded to ``PCA``. Must yield at least two components because the
        plot uses the first two.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataPCA`` with one added column per component
        ('pc1', 'pc2', ...).

    Raises
    ------
    ValueError
        If the fitted PCA produced fewer than two components.
    """
    def _style(ax):
        # Shared look for both panels. Labels are set *before* plotting, as in
        # the original statement order.
        # NOTE(review): seaborn appears to fill in only axis labels that are
        # still empty, so this ordering keeps 'PC 1'/'PC 2' — confirm.
        ax.set_xlabel('PC 1', fontsize = 10)
        ax.set_ylabel('PC 2', fontsize = 10)
        ax.spines['top'].set_color('w')
        ax.spines['right'].set_color('w')
        ax.spines['left'].set_color('grey')
        ax.spines['bottom'].set_color('grey')
        ax.set_facecolor('w')

    pca_model = PCA(n_components=n_components).fit(dataN)
    pcaOut = pca_model.transform(dataN)
    if pcaOut.shape[1] < 2:
        raise ValueError(
            f'makePCA plots PC 1 against PC 2 but n_components={n_components!r} '
            f'produced only {pcaOut.shape[1]} component(s)'
        )

    # Work on a copy so the caller's label table is left untouched.
    dataPCA = dataPCA.copy()
    for idx in range(pcaOut.shape[1]):
        dataPCA[f'pc{idx + 1}'] = pcaOut[:,idx]

    # First 11 colours of the 12-colour Set3 palette.
    cmap = sns.color_palette("Set3", n_colors=12)[:11]
    hue = dataPCA['Metadata_Cmpd']
    noInert = dataPCA[~dataPCA['Metadata_Cmpd'].isin(['dmso', 'sorb'])]

    fig = plt.figure(figsize=[14, 5])

    # Left panel: every row; its legend is removed.
    ax_all = fig.add_subplot(121)
    _style(ax_all)
    sns.scatterplot(x="pc1", y="pc2",
                    palette=cmap, hue=hue,
                    marker='.',
                    data=dataPCA, ax=ax_all).set(title=f'PCA {name} All')
    ax_all.get_legend().remove()

    # Right panel: rows other than 'dmso'/'sorb'.
    # NOTE(review): hue is still the full-table column, not noInert's —
    # presumably to keep colours identical across panels; confirm.
    ax_sub = fig.add_subplot(122)
    _style(ax_sub)
    sns.scatterplot(x="pc1", y="pc2",
                    palette=cmap, hue=hue,
                    marker='.',
                    data=noInert, ax=ax_sub).set(title=f'PCA {name} no Inert')
    ax_sub.legend(loc='upper left', bbox_to_anchor=(1, 1))

    fig.savefig(f'{OutputDir}/{name}_pca.png')
    plt.show()
    # Close this figure explicitly so repeated calls in a loop do not pile up.
    plt.close(fig)
    return dataPCA

def makeUMAP(dataN, dataUMAP, name='', nn = 100, is_supervised=True, n_components=2, min_dist=0.2, spread= 5, n_epochs=None, metric='cosine', random_state=None):
    """Fit a UMAP embedding of ``dataN``, plot it, and return the coordinates.

    Two panels are drawn side by side: all rows, and the rows whose
    'Metadata_Cmpd' is neither 'dmso' nor 'sorb'. The figure is saved to
    ``{OutputDir}/{name}_umap{nn}nn_{Supervised|Unsupervised}.png`` and shown.

    Parameters
    ----------
    dataN : array-like
        Feature matrix passed to ``UMAP.fit_transform``.
    dataUMAP : pandas.DataFrame
        Label table with 'Metadata_Cmpd' and (for the supervised mode)
        'onehot' columns, one row per row of ``dataN``. It is copied, not
        modified.
    name : str
        Used in the panel titles and the output file name.
    nn : int
        Forwarded to ``UMAP`` as ``n_neighbors``.
    is_supervised : bool
        If true, ``dataUMAP['onehot']`` is passed as ``y`` to ``fit_transform``.
    n_components : int
        Forwarded to ``UMAP``. Must be at least 2 because the plot uses the
        first two dimensions.
    min_dist, spread, n_epochs, metric
        Forwarded unchanged to ``UMAP``.
    random_state : int or None
        Forwarded to ``UMAP``; pass an integer for a repeatable embedding.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataUMAP`` with one added column per embedding dimension
        ('umap1', 'umap2', ...).

    Raises
    ------
    ValueError
        If the embedding has fewer than two dimensions.
    """
    def _style(ax):
        # Shared look for both panels. Labels are set *before* plotting, as in
        # the original statement order.
        # NOTE(review): seaborn appears to fill in only axis labels that are
        # still empty, so this ordering keeps 'UMAP 1'/'UMAP 2' — confirm.
        ax.set_xlabel('UMAP 1', fontsize = 10)
        ax.set_ylabel('UMAP 2', fontsize = 10)
        ax.spines['top'].set_color('w')
        ax.spines['right'].set_color('w')
        ax.spines['left'].set_color('grey')
        ax.spines['bottom'].set_color('grey')
        ax.set_facecolor('w')

    umap_model = umap.UMAP(n_neighbors=nn
                        , n_components=n_components
                        , min_dist=min_dist
                        , spread= spread
                        , n_epochs=n_epochs
                        , metric=metric
                        , random_state=random_state
                        )
    if is_supervised:
        umapOut = umap_model.fit_transform(dataN, y=dataUMAP['onehot'])
        isSup = 'Supervised'
    else:
        umapOut = umap_model.fit_transform(dataN)
        isSup = 'Unsupervised'
    if umapOut.shape[1] < 2:
        raise ValueError(
            f'makeUMAP plots UMAP 1 against UMAP 2 but n_components={n_components!r} '
            f'produced only {umapOut.shape[1]} dimension(s)'
        )

    # Work on a copy so the caller's label table is left untouched.
    dataUMAP = dataUMAP.copy()
    for idx in range(umapOut.shape[1]):
        dataUMAP[f'umap{idx + 1}'] = umapOut[:,idx]

    # First 11 colours of the 12-colour Set3 palette.
    cmap = sns.color_palette("Set3", n_colors=12)[:11]
    hue = dataUMAP['Metadata_Cmpd']
    noInert = dataUMAP[~dataUMAP['Metadata_Cmpd'].isin(['dmso', 'sorb'])]

    fig = plt.figure(figsize=[14, 5])

    # Left panel: every row; its legend is removed.
    ax_all = fig.add_subplot(121)
    _style(ax_all)
    sns.scatterplot(x="umap1", y="umap2",
                    palette=cmap, hue=hue,
                    marker='.',
                    data=dataUMAP, ax=ax_all).set(title=f'UMAP {isSup} {name} all')
    ax_all.get_legend().remove()

    # Right panel: rows other than 'dmso'/'sorb'.
    # NOTE(review): hue is still the full-table column, not noInert's —
    # presumably to keep colours identical across panels; confirm.
    ax_sub = fig.add_subplot(122)
    _style(ax_sub)
    sns.scatterplot(x="umap1", y="umap2",
                    palette=cmap, hue=hue,
                    marker='.',
                    data=noInert, ax=ax_sub).set(title=f'UMAP {isSup} {name} no Inert')
    ax_sub.legend(loc='upper left', bbox_to_anchor=(1, 1))

    fig.savefig(f'{OutputDir}/{name}_umap{nn}nn_{isSup}.png')
    plt.show()
    # Close this figure explicitly so repeated calls in a loop do not pile up.
    plt.close(fig)
    return dataUMAP



In [12]:
# UMAP hyper-parameters.
# NOTE(review): the makeUMAP calls visible in this notebook do not pass these
# names, so those calls run with makeUMAP's own defaults (e.g. metric='cosine')
# and their explicit nn values — confirm whether these settings were meant to
# be forwarded.
min_dist=0.2
spread= 5
n_epochs=None
metric='euclidean'
nn = 5

In [None]:
# Run PCA plus the UMAP variants for every (statistic, base file) combination.
for met in statmet:
    for base_name in filenames:
        # Feature tables are named '<base file><statistic>'.
        filename = base_name + met
        dataNpy, dataL = readData(filename)
        dataPCA = makePCA(dataNpy, dataL, name=filename)
        # Supervised UMAP at three explicit neighbourhood sizes; the generator
        # is consumed left to right, so the calls run in the order 15, 25, 50.
        dataUMAP15, dataUMAP25, dataUMAP50 = (
            makeUMAP(dataNpy, dataL, name=filename, nn=neighbours)
            for neighbours in (15, 25, 50)
        )
        # Supervised, then unsupervised, with makeUMAP's default nn.
        dataUMAP100 = makeUMAP(dataNpy, dataL, name=filename)
        dataUMAP = makeUMAP(dataNpy, dataL, name=filename, is_supervised=False)