In [None]:
# Import all necessary packages to run functions
from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
from FlowCytometryTools import FCMeasurement
from FlowCytometryTools import QuadGate, ThresholdGate
import sklearn
from sklearn import preprocessing
from sklearn.decomposition import PCA

def cd4():
    """
    Build the CD4 gate on the VL4-H channel.
    Output is the intersection (&) of two threshold gates: one keeping events
    above 6.514e+03 and one keeping events below 7.646e+03.
    """
    channel = 'VL4-H'
    lower_bound = ThresholdGate(6.514e+03, channel, region="above", name='cd41')
    upper_bound = ThresholdGate(7.646e+03, channel, region="below", name='cd42')
    return lower_bound & upper_bound


def importF(pathname):
    """
    Load every .fcs file found below a folder (the search is recursive).
    Variable input: pathname, the folder to search (converted with str() before use).
    Output is a tuple (sample, file):
        sample --> list with one FCMeasurement per file, with IDs "Test Sample0", "Test Sample1", ...
        file   --> sorted list of the file paths as strings; later passed to other functions as the title/titles argument
    The sample list is also stored on the function itself as importF.sample.
    """
    # Gather the path of every .fcs file under the folder, in sorted order
    file = sorted(str(fcs_path) for fcs_path in Path(str(pathname)).glob("**/*.fcs"))
    # Read each file; the number in the ID follows the sorted file order
    sample = [
        FCMeasurement(ID="Test Sample" + str(index), datafile=fcs_file)
        for index, fcs_file in enumerate(file)
    ]
    importF.sample = sample
    return sample, file


def sampleT(smpl):
    """
    Extract the T cell protein channels of one sample and label every event by gated population.

    smpl: object with a transform("hlog", channels=...) method whose result exposes a
        pandas DataFrame as .data (an FCMeasurement in this file)
    Output is a tuple (data, pstat, features, colmat):
        data     --> DataFrame with the transformed BL1-H, VL1-H, VL4-H and BL3-H columns
        pstat    --> one-column DataFrame with the transformed RL1-H column (pSTAT5)
        features --> list of the column names in data
        colmat   --> list with one color code per event: 'r' (Treg gate), 'g' (non Treg gate)
                     or 'c' (neither gate). An event that falls inside both gates is labelled 'r'.
    """
    # Features are the protein channels of interest when analyzing T cells
    features = ["BL1-H", "VL1-H", "VL4-H", "BL3-H"]
    # Transform to put on log scale (protein channels plus the pSTAT5 channel)
    tform = smpl.transform("hlog", channels=features + ["RL1-H"])
    data = tform.data[features]
    pstat = tform.data[["RL1-H"]]
    # Gate all events at once instead of looping row by row; the bounds are
    # strict, so an event exactly on a gate edge is left outside that gate
    bl1 = data["BL1-H"].values
    vl1 = data["VL1-H"].values
    treg = (bl1 > 4.814e+03) & (bl1 < 6.258e+03) & (vl1 > 3.229e+03) & (vl1 < 5.814e+03)
    non_treg = (bl1 > 2.586e+03) & (bl1 < 5.115e+03) & (vl1 > 3.470e+02) & (vl1 < 5.245e+03)
    # The Treg gate is tested first, so it wins where the two gates overlap
    colmat = np.where(treg, "r", np.where(non_treg, "g", "c")).tolist()
    return data, pstat, features, colmat


def sampleNK(smpl):
    """
    Extract the NK cell protein channels of one sample and label every event by gated population.

    smpl: object with a transform("hlog", channels=...) method whose result exposes a
        pandas DataFrame as .data (an FCMeasurement in this file)
    Output is a tuple (data, pstat, features, colmat):
        data     --> DataFrame with the transformed VL4-H, RL1-H and BL1-H columns (CD3, CD8, CD56)
        pstat    --> one-column DataFrame with the transformed BL2-H column (pSTAT5)
        features --> list of the column names in data
        colmat   --> list with one color code per event: 'r' (nk gate), 'g' (bnk gate)
                     or 'c' (neither gate)
    """
    # For NK, the data consists of different channels than for T cells
    features = ["VL4-H", "RL1-H", "BL1-H"]
    # Transform all proteins (including pSTAT5)
    tform = smpl.transform("hlog", channels=features + ["BL2-H"])
    data = tform.data[features]
    pstat = tform.data[["BL2-H"]]
    # Gate all events at once instead of looping row by row; the bounds are
    # strict, so an event exactly on a gate edge is left outside that gate
    vl4 = data["VL4-H"].values
    bl1 = data["BL1-H"].values
    nk = (vl4 > 5.550e03) & (vl4 < 6.468e03) & (bl1 > 4.861e03) & (bl1 < 5.813e03)
    bnk = (vl4 > 6.533e03) & (vl4 < 7.34e03) & (bl1 > 4.899e03) & (bl1 < 5.751e03)
    # The nk gate is tested first, matching the order of the original if/elif
    colmat = np.where(nk, "r", np.where(bnk, "g", "c")).tolist()
    return data, pstat, features, colmat


def appPCA(data, features):
    """
    Run a two-component PCA on the selected feature columns.

    data: DataFrame that contains the feature columns
    features: list of column names to use
    Output is a tuple (xf, loading):
        xf      --> the scores, i.e. the standardized data projected on PC1 and PC2
        loading --> pca.components_ transposed (one row per feature, one column per component)
    """
    # Pull the chosen columns out as a plain array
    selected = data.loc[:, features].values
    # Standardize before PCA, then pass the result through np.nan_to_num
    standardized = preprocessing.StandardScaler().fit_transform(selected)
    standardized = np.nan_to_num(standardized)
    # Keep only the first two principal components
    model = PCA(n_components=2)
    model.fit(standardized)
    scores = model.transform(standardized)
    return scores, model.components_.T


def pcaPlt(xf, pstat, features, i, title, colormat):
    """
    Plot the PCA score graph of one file, colored by gated population, and save it as 'type<i>.png'.

    xf: 2-D array of scores; column 0 is plotted as PC1 and column 1 as PC2
    pstat: pandas object holding the pSTAT5 values; only its mean is printed
    features: list of channel names; three features select the NK legend labels,
        any other length selects the T cell labels
    i: index used in the name of the saved image
    title: not used at the moment (the figure has no title); kept so callers do not change
    colormat: sequence with one 'r'/'g'/'c' code per row of xf
    """
    # The population behind each color code depends on the cell type, so the
    # legend must too ('r'/'g' are the nk/bnk gates for NK data)
    if len(features) == 3:
        labels = {"c": "Other", "g": "BNK", "r": "NK"}
    else:
        labels = {"c": "Other", "g": "NonTreg", "r": "TReg"}
    # Setting x and y values from xf
    x = xf[:, 0]
    y = xf[:, 1]
    # Report the mean pSTAT5 activity of the file
    pstat_mean = np.mean(pstat.values)
    print("mean pStat activity: ", pstat_mean)
    # NOTE(review): this shows whatever figures are already open before the new
    # one is created; kept from the original -- confirm it is still wanted
    plt.show()
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel("Principal Component 1", fontsize=12)
    ax.set_ylabel("Principal Component 2", fontsize=12)
    ax.set(xlim=(-5, 5), ylim=(-5, 5))
    # dtype=str keeps the comparison below elementwise even for an empty colormat
    codes = np.asarray(colormat, dtype=str)
    # Draw 'c' first so the gated populations end up on top of the background
    for code in ("c", "g", "r"):
        mask = codes == code
        ax.scatter(x[mask], y[mask], s=.15, c=code, label=labels[code], alpha=0.5)
    ax.legend()
    fig.savefig('type'+str(i)+'.png')
    
def loadingPlot(loading, features, i, title):
    """
    Plot the loading of every feature on PC1 and PC2 and annotate each point with its protein name.

    loading: 2-D array with one row per feature; column 0 is PC1 and column 1 is PC2
    features: list of channel names, in the same order as the rows of loading
    i: not used; kept so callers do not change
    title: text appended to the figure title
    Four features are treated as T cell data and three as NK cell data. Any other
    number falls back to the raw channel names and a generic title instead of
    failing on an unassigned name.
    """
    # The same channel carries a different protein in each panel, so the
    # translation table is chosen by panel (identified by the feature count)
    panels = {
        4: ("T Cells", {"BL1-H": "Foxp3", "VL1-H": "CD25", "VL4-H": "CD4", "BL3-H": "CD45RA"}),
        3: ("NK Cells", {"VL4-H": "CD3", "RL1-H": "CD8", "BL1-H": "CD56"}),
    }
    name, proteins = panels.get(len(features), ("Cells", {}))

    x_load = loading[:, 0]
    y_load = loading[:, 1]

    # Create figure for the loading plot
    fig1 = plt.figure(figsize=(8, 8))
    ax = fig1.add_subplot(1, 1, 1)
    ax.set_xlabel("PC1", fontsize=15)
    ax.set_ylabel("PC2", fontsize=15)
    ax.scatter(x_load, y_load)

    for z, feature in enumerate(features):
        # Channels without a known protein keep their own name
        ax.annotate(str(proteins.get(feature, feature)), xy=(x_load[z], y_load[z]))
    ax.set_title(name + " - Loading - " + str(title), fontsize=20)
    
    
def pcaAll(sampleType, check, titles):
    """
    Run PCA on every file and plot the score and loading graphs, keeping only the
    gated main population of each file in the returned protein/pSTAT5 arrays.

    sampleType: list of samples, as returned first by importF
    check: "t" for T cells OR "n" for NK cells; any other value plots nothing and
        returns four empty lists
    titles: list of file paths, as returned second by importF (same order as sampleType)
    Output is a tuple (data_array, pstat_array, xf_array, loading_array) with one entry per file:
        data_array    --> rows of the protein data whose color code is 'r'
                          (Treg gate for T cells, nk gate for NK cells)
        pstat_array   --> pSTAT5 values of those same rows
        xf_array      --> PCA scores of all events in the file
        loading_array --> PCA loadings of the file
    Want to use for both T and NK cells? Use it twice!
    """
    data_array = []
    pstat_array = []
    xf_array = []
    loading_array = []

    # Both cell types share the same pipeline; only the extraction step differs
    extract = {"t": sampleT, "n": sampleNK}.get(check)
    if extract is not None:
        for i, sample in enumerate(sampleType):
            # Last path component, independent of the path separator in use
            title = Path(titles[i]).name
            data, pstat, features, colormat = extract(sample)
            # The sample functions return four values; the gated subset is
            # rebuilt here from the color codes
            gated = np.asarray(colormat, dtype=str) == "r"
            data_array.append(data[gated])
            pstat_array.append(pstat[gated])
            xf, loading = appPCA(data, features)
            xf_array.append(xf)
            loading_array.append(loading)
            pcaPlt(xf, pstat, features, i, title, colormat)
            loadingPlot(loading, features, i, title)
    plt.show()
    return data_array, pstat_array, xf_array, loading_array

def count_data(sampleType, gate):
    """
    Apply cellCount and rawData to every file of a folder for one gate.

    sampleType: the overall importF assignment for T or NK (all the T cell files, or all the NK cell files)
    gate: the desired cell population
    Output is a tuple (count_array, data_array): the cellCount result of every file,
    and the rawData result of every file, both in the order of sampleType.
    """
    # Handle one file at a time: its count first, then its data
    per_file = [(cellCount(sample, gate), rawData(sample, gate)) for sample in sampleType]
    count_array = [count for count, _ in per_file]
    data_array = [gated for _, gated in per_file]
    return count_array, data_array


def pcaAllCellType(sampleType, check, titles):
    """
    Run PCA on every file and plot the score graph (colored by cell type) and the loading graph.

    sampleType: list of samples, as returned first by importF
    check: "t" for T cells OR "n" for NK cells; any other value plots nothing and
        returns four empty lists
    titles: list of file paths, as returned second by importF (same order as sampleType)
    Output is a tuple (data_array, pstat_array, xf_array, loading_array) with one entry per file:
        data_array    --> protein data of all events
        pstat_array   --> pSTAT5 data of all events
        xf_array      --> PCA scores
        loading_array --> PCA loadings
    Want to use for both T and NK cells? Use it twice!
    """
    data_array = []
    pstat_array = []
    xf_array = []
    loading_array = []

    # Both cell types share the same pipeline; only the extraction step differs.
    # Using one loop also fills the score and loading arrays for NK cells.
    extract = {"t": sampleT, "n": sampleNK}.get(check)
    if extract is not None:
        for i, sample in enumerate(sampleType):
            # Last path component, independent of the path separator in use
            title = Path(titles[i]).name
            data, pstat, features, colormat = extract(sample)
            data_array.append(data)
            pstat_array.append(pstat)
            xf, loading = appPCA(data, features)
            xf_array.append(xf)
            loading_array.append(loading)
            pcaPlt(xf, pstat, features, i, title, colormat)
            loadingPlot(loading, features, i, title)
    plt.show()
    return data_array, pstat_array, xf_array, loading_array



In [None]:
# Folder with the T cell plate .fcs files. The path below is hard-coded;
# uncomment the input() line to be asked for it instead.
#tplate = input('What is the name of the T plate folder?:')
tplate = "/home/heatherm/type-I-ckine-model/ckine/brian/ti"
# importF returns the loaded samples and the sorted file paths that go with them
tsample, titlesT = importF(tplate)


# NK plate import (currently disabled)
#nkplate = input('What is the name of the NK plate folder?:')
#nksample, titlesNK = importF(nkplate)


# Run the PCA on every T cell file; each file gets a score plot saved as type<i>.png and a loading plot
dataT_array, pstatT_array, xfT_array, loadingT_array = pcaAllCellType(tsample, "t", titlesT)
# Same analysis for the NK plate (currently disabled, needs the NK import above)
#dataNK_array, pstatNK_array, xfNK_array, loadingNK_array = pcaAllCellType(nksample, "n", titlesNK)

#dataT_array, pstatT_array, xfT_array, loadingT_array = pcaAllCellType(tsample, "t", titlesT)

#dataNK_array, pstatNK_array, xfNK_array, loadingNK_array = pcaAllCellType(nksample, "n", titlesNK)
# Other plate folders that have been used with this notebook:
#/home/brianoj/Tplate15
#/home/brianoj/PosterPlatesT