In [None]:
import pandas as pd
import copy
import math
import numpy as np
import enum 

import networkx as nx
from networkx.convert_matrix import from_numpy_array

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

import scipy.spatial.distance
import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import kurtosis, skew

import matplotlib.pyplot as plt

import pylab

In [None]:
# Read the tab-separated cell table and give its first column the explicit name 'cellID'.
cell_data = pd.read_csv('cell_data.tsv', sep='\t')
first_column = cell_data.columns[0]
cell_data = cell_data.rename(columns={first_column: 'cellID'})
cell_data

In [None]:
def adjacency_matrix(data_dict, cells):
    """Build the dense, weighted adjacency matrix of the complete cell graph.

    Entry ``[i][j]`` is the Euclidean distance between the vectors stored for
    ``cells[i]`` and ``cells[j]``; the matrix is symmetric with a zero diagonal.

    Parameters
    ----------
    data_dict : dict
        Maps a cell ID to its feature vector, e.g. ``data_dict[cell] = (x, y)``
        or ``data_dict[cell] = reduced gene expressions``. All vectors must be
        1-D and of equal length.
    cells : sequence
        Cell IDs ``[cell_id_1, ..., cell_id_n]``; fixes the row/column order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, n)`` where ``n = len(cells)``.

    Raises
    ------
    KeyError
        If an ID in ``cells`` is missing from ``data_dict``.
    """
    n = len(cells)
    if n == 0:
        # cdist needs 2-D input; keep the empty-graph result an empty (0, 0) matrix.
        return np.zeros(shape=(0, 0))

    # Stack the vectors once, in `cells` order, and let SciPy compute all
    # pairwise distances in compiled code instead of one Python call per pair.
    points = np.asarray([data_dict[cell] for cell in cells], dtype=float)
    return scipy.spatial.distance.cdist(points, points, metric='euclidean')

In [None]:
# cells is the list of all cell IDs, in table order.
# cell_dict maps each cell (node) ID to its (x, y) coordinates.
# Built column-wise: iterrows() would materialize a full Series for every row
# of the table just to read three of its fields.
cells = cell_data['cellID'].tolist()
cell_dict = dict(zip(cells, zip(cell_data['x'].tolist(), cell_data['y'].tolist())))

In [None]:
# Graph 1: edge weights are the Euclidean distances between the cells' (x, y) coordinates.
adjacency_matrix_1 = adjacency_matrix(data_dict=cell_dict, cells=cells)

In [None]:
# Dimensionality reduction of the gene expressions using PCA.
# Every column except the cell ID and the x/y coordinates is kept as a feature.
gene_exp = cell_data.drop(columns=['cellID', 'x', 'y'])

# Standardize each feature before PCA.
scaling = StandardScaler()
scaled_data = scaling.fit_transform(gene_exp)

# random_state only matters when scikit-learn uses a randomized/arpack SVD solver;
# fixing it keeps the projection reproducible across re-runs in that case.
pca = PCA(n_components=4800, random_state=0)
reduced_data = pca.fit_transform(scaled_data)
pca.explained_variance_ratio_.cumsum()

In [None]:
# reduced_data_dict maps each cell (node) ID to its row of PCA-reduced gene expressions.
reduced_data_dict = {
    cells[row_idx]: reduced_row
    for row_idx, reduced_row in enumerate(reduced_data)
}
len(reduced_data_dict)

In [None]:
# Graph 2: edge weights are the Euclidean distances between the cells' reduced gene expressions.
adjacency_matrix_2 = adjacency_matrix(data_dict=reduced_data_dict, cells=cells)

In [None]:
# Rescale the first adjacency matrix into [0, 1].
# NOTE: MinMaxScaler works column by column, so the scaled matrix is generally no longer symmetric.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(adjacency_matrix_1)
norm_adj_matrix_1 = min_max_scaler.transform(adjacency_matrix_1)
norm_adj_matrix_1

In [None]:
# Refit the same scaler on the second adjacency matrix and rescale it into [0, 1], column by column.
min_max_scaler.fit(adjacency_matrix_2)
norm_adj_matrix_2 = min_max_scaler.transform(adjacency_matrix_2)
norm_adj_matrix_2

In [None]:
# Standardized variants of both adjacency matrices.
# The scaler is refit for each matrix in turn, so it ends up fitted on adjacency_matrix_2.
stand_scaler = StandardScaler()
stand_adj_matrix_1, stand_adj_matrix_2 = (
    stand_scaler.fit_transform(matrix)
    for matrix in (adjacency_matrix_1, adjacency_matrix_2)
)

In [None]:
class Dissim_metric(enum.Enum):
    """Dissimilarity metrics available for comparing two vectors.

    Members are numbered 1..4 in declaration order.
    """

    EUCLIDIAN = enum.auto()   # 1
    MANHATTAN = enum.auto()   # 2
    MINKOWSKI = enum.auto()   # 3
    COSINE = enum.auto()      # 4

In [None]:
def plot_dissim_hist(cdm_arr, bins=1000, exp_type="unknown", diss_between="unknown", is_normalized = False):
    """Draw a histogram of dissimilarity values on a new 16x10 inch, 100 dpi figure.

    Parameters
    ----------
    cdm_arr : array-like
        Values passed to ``Axes.hist``.
    bins : int
        Number of histogram bins.
    exp_type : str
        Metric name inserted into the title.
    diss_between : str
        Description of what is being compared, inserted into the title.
    is_normalized : bool
        Selects the x-axis label: "Normalized dissimilarity" when true,
        "Dissimilarity" otherwise.

    Returns
    -------
    None
    """
    figure, ax = plt.subplots(nrows=1, ncols=1)
    figure.set_dpi(100)
    figure.set_size_inches(16, 10)

    ax.hist(cdm_arr, bins=bins)

    title_parts = ["Histogram of ", diss_between, " dissimilarity using ", exp_type, " metric"]
    ax.set_title("".join(title_parts))
    ax.set_xlabel("Normalized dissimilarity" if is_normalized else "Dissimilarity")
    ax.set_ylabel("Number of cell pairs with specified dissimilarity")

In [None]:
def calc_dissimilarity(arr1, arr2, metric_type, p=3):
    """Return the dissimilarity between two 1-D vectors under the chosen metric.

    Parameters
    ----------
    arr1, arr2 : array-like
        The vectors to compare; passed straight to the SciPy distance function.
    metric_type : int or Dissim_metric
        A ``Dissim_metric`` member or its integer ``.value``.
    p : int or float, optional
        Order of the Minkowski distance; only used for ``MINKOWSKI``.
        Defaults to 3.

    Returns
    -------
    float
        EUCLIDIAN -> ``scipy.spatial.distance.euclidean``,
        MANHATTAN -> ``scipy.spatial.distance.cityblock``,
        MINKOWSKI -> ``scipy.spatial.distance.minkowski`` with order ``p``,
        COSINE    -> ``scipy.spatial.distance.cosine``.

    Raises
    ------
    ValueError
        If ``metric_type`` does not identify a ``Dissim_metric`` member.
    """
    # Normalizes both an enum member and its raw value to the member, and
    # raises ValueError for anything else instead of silently returning None.
    metric = Dissim_metric(metric_type)

    if metric is Dissim_metric.EUCLIDIAN:
        return scipy.spatial.distance.euclidean(arr1, arr2)

    if metric is Dissim_metric.MANHATTAN:
        return scipy.spatial.distance.cityblock(arr1, arr2)

    if metric is Dissim_metric.MINKOWSKI:
        return scipy.spatial.distance.minkowski(arr1, arr2, p=p)

    if metric is Dissim_metric.COSINE:
        return scipy.spatial.distance.cosine(arr1, arr2)

    # Only reachable if a member is added to Dissim_metric without a branch here.
    raise ValueError("Unsupported dissimilarity metric: " + repr(metric_type))

In [None]:
def create_cdm_matrix(matrix1, matrix2, metric):
    """Build the cross-dissimilarity matrix between the rows of two matrices.

    Entry ``[i, j]`` is ``calc_dissimilarity(matrix1[i, :], matrix2[j, :], metric)``.

    Parameters
    ----------
    matrix1, matrix2 : numpy.ndarray
        2-D arrays whose rows are compared.
    metric : int
        Metric identifier forwarded to ``calc_dissimilarity`` as ``metric_type``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(matrix1), len(matrix2))``.
    """
    n_rows = len(matrix1)
    n_cols = len(matrix2)
    distance_matrix = np.zeros(shape=(n_rows, n_cols))

    for i in range(n_rows):
        # The inner loop was missing, which left `j` undefined (NameError).
        # NOTE(review): row i of matrix1 is compared with row j of matrix2 so
        # that every (i, j) entry is a distinct pair -- confirm this pairing.
        for j in range(n_cols):
            distance_matrix[i, j] = calc_dissimilarity(matrix1[i, :], matrix2[j, :], metric_type=metric)
    return distance_matrix

In [None]:
# One cross-dissimilarity matrix per metric, keyed by the metric's name.
cdm_matrix = {
    met.name: create_cdm_matrix(norm_adj_matrix_1, norm_adj_matrix_2, metric=met.value)
    for met in Dissim_metric
}

In [None]:
# Plot one histogram per metric, with the dissimilarities rescaled so the largest value is 1.
for met in Dissim_metric:
    # flatten() returns a copy, so the matrices stored in cdm_matrix stay untouched.
    dissimilarity_array = cdm_matrix[met.name].flatten()
    peak = np.max(dissimilarity_array)
    if peak != 0:
        # An all-zero matrix is left as is; dividing by 0 would fill it with NaNs.
        dissimilarity_array = dissimilarity_array / peak
    # The values are max-normalized above, so request the matching x-axis label.
    plot_dissim_hist(
        dissimilarity_array,
        bins=1000,
        exp_type=met.name,
        diss_between="graph (normalized adjacency matrix)",
        is_normalized=True,
    )