In [1]:
import numpy as np
import pandas as pd
import random
import os

In [2]:
# base_path = "/Users/schmuck/Library/CloudStorage/OneDrive-IndianaUniversity/PhD/DATASETS/real_data/Data_without_labels"
# transf_path = "/Users/schmuck/Library/CloudStorage/OneDrive-IndianaUniversity/PhD/DATASETS/real_data/P_adic_transformed_data/transformed_data"
# out_path = "/Users/schmuck/Library/CloudStorage/OneDrive-IndianaUniversity/PhD/DATASETS/real_data/Data_without_labels/"

base_path = "/u/parishar/nobackup/DATASETS/exp_data/raw_data/"
out_path = "/u/parishar/nobackup/DATASETS/exp_data/raw_data/"

# file_list = ["pollen_raw.csv", "darmanis_raw.csv", "usoskin_raw.csv", "mouse_pan.csv", 
#             "Muraro_raw.csv", "QSDiaphragm_raw.csv"]

# data_list = ["Pollen", "Darmanis", "Usoskin", "Mouse_pan", 
#              "Muraro", "QSDiaphragm"]

# label_list = ["labels_pollen.csv", "labels_darmanis.csv", "labels_usoskin.csv", 
#               "labels_mouse_pan.csv", "labels_Muraro.csv", "labels_QSDiaphragm.csv"]


file_list = ["pollen_raw.csv", "darmanis_raw.csv", "usoskin_raw.csv", "mouse_pan.csv", 
            "Muraro_raw.csv", "QSLimb_raw.csv", "QSTrachea_raw.csv", "QSLung_raw.csv", "QSDiaphragm_raw.csv", 
             "Q10XSpleen_raw.csv"]

data_list = ["Pollen", "Darmanis", "Usoskin", "Mouse_pan", "Muraro", "QSLimb", "QSTrachea", "QSLung", 
             "QSDiaphragm", "Q10XSpleen"]

label_list = ["labels_pollen.csv", "labels_darmanis.csv", "labels_usoskin.csv", 
              "labels_mouse_pan.csv", "labels_Muraro.csv", "labels_QSLimb.csv", "labels_QSTrachea.csv", 
              "labels_QSLung.csv", "labels_QSDiaphragm.csv",
              "labels_Q10XSpleen.csv"]


In [3]:
def load_data(file_name, label_name):
    
    file_path = os.path.join(base_path, file_name)
    data = pd.read_csv(file_path, sep=",", header=None)
    labels = pd.read_csv(os.path.join(base_path, label_name), header=None)

    return data, labels


data_num_clusters = {"Usoskin": 4, "Pollen": 11, 
             "Mouse_pan": 13, "Darmanis": 8, "Muraro": 9,
             "QSLimb": 6, "QSLung": 11, "Q10XSpleen": 5, 
             "QSTrachea": 4, "QSDiaphragm": 5}


def pre_process_data(data, labels):

    data = np.array(data)

    print("Data Shape: ", data.shape)

    # Remove genes expressed in less than 1% of cell
    gene_expr_sum = np.sum(data, axis=0)
    limit = np.ceil(0.01 * data.shape[0])
    wch_genes = np.where(gene_expr_sum < limit)[0]
    
    if len(wch_genes) > 0:
        print("Genes to be removed:", len(wch_genes))
        data = np.delete(data, wch_genes, 1)

    rsum = np.sum(data, 1)
    wch_cells = np.where(rsum < 100)[0]
    

    if len(wch_cells) > 0:
        print("Cell to be removed: ", len(wch_cells))
        data = np.delete(data, wch_cells, 0)
        labels = np.delete(labels, wch_cells, 0)


    # Library size normalization
    data = (data/np.sum(data, 1)[:, None]) * np.median(np.sum(data, 1))
    labels = np.array(labels).reshape(data.shape[0],)
    data = pd.DataFrame(data)

    print("Data Shape: ", data.shape, len(labels))
    
    return data, labels

In [4]:
num_rep = 20
n_nei = 5

# Generate seeds 
np.random.seed(9)
seeds = np.random.choice(2000, 200, replace=False).reshape(10, 20)

# Output pandas data frame
all_results = pd.DataFrame(columns=["Data", "Run", "Indices"])

temp_indices = []
temp_data = []
temp_run = []

for i in range(len(file_list)):

    print("Processing: ", data_list[i])

    file_name = file_list[i]
    label_name = label_list[i]

    # Load
    data, labels = load_data(file_name, label_name)

    # Preprocess
    data, labels = pre_process_data(data, labels)

    # Num clusters
    num_clusters = data_num_clusters[data_list[i]]

    
    for rep in range(num_rep):
        
        temp_data.append(data_list[i])
        temp_run.append(rep)
        
        random.seed(seeds[i, rep])
        temp_indices.append(random.sample(range(data.shape[0]), num_clusters))

        # temp = pd.DataFrame(list(zip(temp_data, temp_run, ind)), columns=["Data", "Run", "Indices"])
        # all_results = pd.concat([all_results, temp], ignore_index=True, sort=False)

Processing:  Pollen
Data Shape:  (299, 21468)
Genes to be removed: 1516
Data Shape:  (299, 19952) 299
Processing:  Darmanis


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (420, 21516)
Genes to be removed: 2094
Data Shape:  (420, 19422) 420
Processing:  Usoskin


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (622, 19532)
Genes to be removed: 1509
Data Shape:  (622, 18023) 622
Processing:  Mouse_pan


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (1884, 14878)
Genes to be removed: 2978
Data Shape:  (1884, 11900) 1884
Processing:  Muraro


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (2122, 19046)
Genes to be removed: 3723
Data Shape:  (2122, 15323) 2122
Processing:  QSLimb


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (1090, 23341)
Genes to be removed: 6665
Data Shape:  (1090, 16676) 1090
Processing:  QSTrachea


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (1350, 23341)
Genes to be removed: 4944
Data Shape:  (1350, 18397) 1350
Processing:  QSLung


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (1676, 23341)
Genes to be removed: 6077
Data Shape:  (1676, 17264) 1676
Processing:  QSDiaphragm


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (870, 23341)
Genes to be removed: 7528
Data Shape:  (870, 15813) 870
Processing:  Q10XSpleen


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


Data Shape:  (9552, 23341)
Genes to be removed: 13626
Data Shape:  (9552, 9715) 9552


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seeds[i, rep])


In [5]:
with open("random_centroid_indices.csv", "w") as myFile:
    
    myFile.write("Data" + "," + "Run" + "," + "Indices" + "\n")
    
    for i in range(len(temp_data)):
        
        ind = ""
        
        for j in range(len(temp_indices[i])):
            
            if j < len(temp_indices[i])-1:
                ind += str(temp_indices[i][j]) + "+"  
            else:
                ind += str(temp_indices[i][j])
        
        # print(temp_data[i] + "," + str(temp_run[i]) + "," + ind)
        
        if i < len(temp_data)-1:
            myFile.write(temp_data[i] + "," + str(temp_run[i]) + "," + ind + "\n")
        else:
            myFile.write(temp_data[i] + "," + str(temp_run[i]) + "," + ind )

In [22]:
# f =  pd.read_csv("random_centroid_indices.csv", header=0)
# f.head(10)

Unnamed: 0,Data,Run,Indices
0,Pollen,0,259+113+81+12+177+77+248+43+30+161+187
1,Pollen,1,235+138+191+52+50+116+285+94+161+200+121
2,Pollen,2,139+95+42+226+2+12+192+199+14+213+243
3,Pollen,3,80+104+185+236+266+1+170+19+111+217+59
4,Pollen,4,60+101+9+73+179+7+192+49+181+115+32
5,Pollen,5,72+213+275+195+52+17+189+241+277+125+293
6,Pollen,6,263+48+94+200+14+72+297+197+65+141+205
7,Pollen,7,282+216+127+40+201+131+207+298+159+78+233
8,Pollen,8,270+288+37+69+145+30+181+179+128+251+267
9,Pollen,9,266+217+139+225+234+11+183+122+297+173+113
