In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Sample -> genome lookup. Example: given 'GSM3020393' the map returns 'GRCh38'.
genome_sample_df = pd.read_csv('genome_sample.csv')
genome_sample_map = dict(zip(genome_sample_df['SAMPLE'], genome_sample_df['GENOME']))

# Show which distinct genome labels occur in the sample sheet.
print(np.unique(list(genome_sample_map.values())))

# Genome label as written in the sample sheet -> column name in common_human_list.csv.
_ENSEMBL_GRCH38 = "Ensembl_GRCh38.p12_rel94"
common_human_map_genome = {
    "GRCh38": _ENSEMBL_GRCH38,
    "hg19": "hg19",
    "hg38": "hg38",
    "GRCh38 version 90": _ENSEMBL_GRCH38,
}

common_human_list_df = pd.read_csv('common_human_list.csv')
ENSG_ID = common_human_list_df['ENSG_ID'].tolist()


def _names_after(column, separator):
    """Return the values of `column` with everything up to the last `separator` removed."""
    return [name.rpartition(separator)[2] for name in common_human_list_df[column]]


# Gene names per genome build, stripped of their build prefix
# (text after the last '_', or after the last '#' for the Ensembl column).
hg19 = _names_after('hg19', '_')
hg37 = _names_after('hg37', '_')
hg38 = _names_after('hg38', '_')
ensembl_h38 = _names_after('Ensembl_GRCh38.p12_rel94', '#')

map_hg = {
    "hg19": hg19,
    "hg37": hg37,
    "hg38": hg38,
    "Ensembl_GRCh38.p12_rel94": ensembl_h38,
}

['GRCh38' 'GRCh38 version 90' 'hg19' 'hg38']


In [3]:
# Preview the first rows of the gene lookup table read from common_human_list.csv.
common_human_list_df.head()

Unnamed: 0,ENSG_ID,hg19,hg37,hg38,Ensembl_GRCh38.p12_rel94,GSM3717979,Unnamed: 6
0,ENSG00000181638,hg19_ZFP41,grch37_ZFP41,grch38_ZFP41,#ZFP41,#ZFP41,in all
1,ENSG00000111875,hg19_ASF1A,grch37_ASF1A,grch38_ASF1A,#ASF1A,#ASF1A,in all
2,ENSG00000176142,hg19_TMEM39A,grch37_TMEM39A,grch38_TMEM39A,#TMEM39A,#TMEM39A,in all
3,ENSG00000177186,hg19_OR2M7,grch37_OR2M7,grch38_OR2M7,#OR2M7,#OR2M7,in all
4,ENSG00000135624,hg19_CCT7,grch37_CCT7,grch38_CCT7,#CCT7,#CCT7,in all


In [4]:
# Example: sample = 'GSM3892576'
def get_genome_value(sample):
    """Return the genome label stored for `sample` in `genome_sample_map`.

    Raises:
        KeyError: if `sample` is not a key of `genome_sample_map`.
    """
    genome = genome_sample_map[sample]
    return genome

def data_transpose(data):
    """Transpose `data` and promote its first column to the column labels.

    After transposing, the first row (the original first column) becomes the
    header of the result and is then removed from the body.
    """
    flipped = data.T
    # The first transposed row holds the values of the original first column.
    header_values = flipped.iloc[0].to_numpy()
    flipped.columns = header_values

    first_label = flipped.index[0]
    return flipped.drop(first_label)

# 1. drops the ENSG rows that are not present in common_human_list
# 2. replaces the ENSG values with the corresponding gene names
def change_ENSG_rows(raw_data, genome_mapped, gene_table=None):
    """Keep only known ENSG rows and replace their IDs with bare gene names.

    Args:
        raw_data: DataFrame whose 'Index' column holds ENSG identifiers.
        genome_mapped: name of the `gene_table` column to take gene names from.
        gene_table: lookup DataFrame with an 'ENSG_ID' column and a
            `genome_mapped` column. Defaults to the module-level
            `common_human_list_df`.

    Returns:
        A new DataFrame (the input is not modified) with a fresh 0..n-1 index,
        containing only rows whose ENSG ID occurs in `gene_table`, and with
        'Index' replaced by the gene name.
    """
    if gene_table is None:
        gene_table = common_human_list_df

    def _bare_symbol(name):
        # TODO: ask whether names should keep the '#' / 'hg38_' prefix or not.
        # For now a leading '#' is removed; otherwise everything up to the last
        # '_' is removed.
        if not isinstance(name, str) or not name:
            return name
        if name[0] == "#":
            return name[1:]
        return name[name.rfind('_') + 1:]

    # When an ENSG ID occurs more than once, the first occurrence wins.
    first_hits = gene_table.drop_duplicates(subset="ENSG_ID", keep="first")
    symbol_by_ensg = {
        ensg: _bare_symbol(name)
        for ensg, name in zip(first_hits["ENSG_ID"], first_hits[genome_mapped])
    }

    # A boolean mask selects rows regardless of the frame's index labels, so this
    # also works when earlier steps (e.g. dropna) left gaps in the index.
    known = raw_data["Index"].isin(list(symbol_by_ensg))
    kept = raw_data.loc[known].reset_index(drop=True)

    return kept.assign(Index=kept["Index"].map(symbol_by_ensg))

def discard_not_allowed_rows(raw_data, genome_mapped, allowed_by_genome=None):
    """Drop rows whose gene name is not allowed for the given genome.

    Args:
        raw_data: DataFrame whose 'Unnamed: 0' column holds gene names.
        genome_mapped: key selecting the list of allowed gene names.
        allowed_by_genome: mapping from genome key to an iterable of allowed
            gene names. Defaults to the module-level `map_hg`.

    Returns:
        A new DataFrame (the input is not modified) containing only the allowed
        rows, with a fresh 0..n-1 index.

    Raises:
        KeyError: if `genome_mapped` is not a key of the mapping.
    """
    if allowed_by_genome is None:
        allowed_by_genome = map_hg
    allowed_names = allowed_by_genome[genome_mapped]

    # A boolean mask selects rows regardless of the frame's index labels, so this
    # also works when earlier steps (e.g. dropna) left gaps in the index.
    is_allowed = raw_data['Unnamed: 0'].isin(allowed_names)

    return raw_data.loc[is_allowed].reset_index(drop=True)

def change_columns(raw_data_filter, folder_name):
    """Rename the columns of `raw_data_filter` in place and return it.

    The first column becomes "Index"; column i (for i >= 1) becomes
    "<prefix>_<i>", where <prefix> is `folder_name` without its first three
    characters (e.g. 'GSM2741551' -> '2741551').

    Args:
        raw_data_filter: DataFrame to relabel; it is modified in place.
        folder_name: sample folder name used to build the column prefix.

    Returns:
        The same DataFrame object, now with flat string column labels.
    """
    n_columns = len(raw_data_filter.columns)
    if n_columns == 0:
        # Nothing to rename; avoids indexing into an empty name list.
        return raw_data_filter

    prefix_folder = folder_name[3:]
    new_col_names = ["Index"] + [f"{prefix_folder}_{i}" for i in range(1, n_columns)]

    # Assign the flat list itself: wrapping it in another list would make pandas
    # build a one-level MultiIndex of 1-tuples instead of plain labels.
    raw_data_filter.columns = new_col_names
    return raw_data_filter

def remove_NaN_values(raw_data):
    """Return `raw_data` without the rows that contain a missing value.

    Prints the shape before and after the drop. The input frame is not
    modified, and the surviving rows keep their original index labels.
    """
    shape_before = raw_data.shape
    print(f'Shape pre obrade -> {shape_before}')

    complete_rows = raw_data.dropna()
    shape_after = complete_rows.shape
    print(f'Shape posle dropna() -> {shape_after}')

    return complete_rows

def prepare_data(folder_path, folder_name):
    """Load the CSV stored in one sample folder and normalise its rows and columns.

    Steps: read the CSV, drop rows with missing values, look up the genome of
    the sample, filter/rename the gene rows, and rename the columns.

    Args:
        folder_path: directory that contains the sample's CSV file.
        folder_name: sample identifier (e.g. 'GSM2741551'); used both to look
            up the genome and to build the new column names.

    Returns:
        The prepared DataFrame.

    Raises:
        FileNotFoundError: if `folder_path` contains no '.csv' file.
        KeyError: if the sample or its genome label is unknown.
    """
    # Pick the CSV deterministically: os.listdir() order is arbitrary and the
    # folder may contain other entries (e.g. hidden OS files).
    csv_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith('.csv')
    )
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {folder_path!r}")
    csv_path = os.path.join(folder_path, csv_files[0])
    raw_data = pd.read_csv(csv_path)

    raw_data = remove_NaN_values(raw_data)

    # Use the function argument, not a module-level loop variable, so the
    # function gives the right answer no matter where it is called from.
    genome = get_genome_value(folder_name)
    genome_mapped = common_human_map_genome[genome]

    if "Index" in raw_data.columns:
        # Rows are still unprocessed ENSG identifiers such as 'ENSG00000243485'.
        print("Obrada ENSG vrsta: ", csv_path)
        raw_data_filter = change_ENSG_rows(raw_data, genome_mapped)
    else:
        # Rows already hold gene names, but not all of them are present in the
        # common list file; filter them against the column that matches this
        # genome (hg19, hg38, ...). There is no 'Index' column here, only
        # 'Unnamed: 0', which is renamed to 'Index' below.
        print("Uklanjanje redova: ", csv_path)
        raw_data_filter = discard_not_allowed_rows(raw_data, genome_mapped)

    # Replace cell barcodes such as 'AAACCTGAGCAGACTG-1' with x_1, x_2, ...
    # where x = 2741551 for folder GSM2741551.
    # Rows and columns are both tidy after this step.
    return change_columns(raw_data_filter, folder_name)

In [9]:
# Group I: patients before treatment.
# Group II: patients after treatment.
# The last two folders (commented out below) are Healthy samples; what to do
# with them is still to be decided.
data_folders = [
    # First block of samples.
    'GSM3511735',
    # 'GSM3511747', 'GSM3511752', 'GSM3511757', 'GSM3511762',
    # 'GSM3511766', 'GSM3721449', 'GSM3721453', 'GSM3721454', 'GSM3721455',
    # 'GSM3721457', 'GSM3721458', 'GSM3721460', 'GSM3721462', 'GSM3721464',

    # Second block of samples.
    'GSM3511738',
    # 'GSM3511741', 'GSM3511749', 'GSM3511754', 'GSM3511760',
    # 'GSM3721445', 'GSM3721447', 'GSM3721448', 'GSM3721450', 'GSM3721451',
    # 'GSM3721452', 'GSM3721456', 'GSM3721459', 'GSM3721461', 'GSM3721463',
    'GSM3892576',

    # Last two folders (currently disabled).
    # 'GSM3837173', 'GSM3892570',
]

In [10]:
# Prepare every selected sample folder, resolved relative to the current
# working directory. `data_filtered` is overwritten on each pass, so after the
# loop it holds the result for the last folder only.
this_dir = os.getcwd()

for folder in data_folders:
    print()
    print(f'Obrađuje se folder: {folder}')

    folder_path = os.path.join(this_dir, folder)
    data_filtered = prepare_data(folder_path, folder)
    print("Folder: {} ima shape nakon obrade {}".format(folder, data_filtered.shape))


Obrađuje se folder: GSM3511735
Shape pre obrade -> (23309, 884)
Shape posle dropna() -> (23309, 884)
Uklanjanje redova:  /Users/mandja96/Desktop/Skripta/GSM3511735/GSM3511735_su001_pre1_Tcell_RNA.csv
Folder: GSM3511735 ima shape nakon obrade (17523, 884)

Obrađuje se folder: GSM3511738
Shape pre obrade -> (23309, 3950)
Shape posle dropna() -> (23309, 3950)
Uklanjanje redova:  /Users/mandja96/Desktop/Skripta/GSM3511738/GSM3511738_su001_post_Tcell_RNA.csv
Folder: GSM3511738 ima shape nakon obrade (17523, 3950)

Obrađuje se folder: GSM3892576
Shape pre obrade -> (33538, 6576)
Shape posle dropna() -> (33538, 6576)
Obrada ENSG vrsta:  /Users/mandja96/Desktop/Skripta/GSM3892576/GSM3892576_PBMC_POST2W_filtered_gene_bc_matrices_h5.csv
Folder: GSM3892576 ima shape nakon obrade (30710, 6576)


In [11]:
# Preview the prepared frame. `data_filtered` is whatever the processing loop
# assigned last, i.e. the result for the final entry of `data_folders`.
data_filtered.head()

Unnamed: 0,Index,3892576_1,3892576_2,3892576_3,3892576_4,3892576_5,3892576_6,3892576_7,3892576_8,3892576_9,...,3892576_6566,3892576_6567,3892576_6568,3892576_6569,3892576_6570,3892576_6571,3892576_6572,3892576_6573,3892576_6574,3892576_6575
0,MIR1302-2HG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,FAM138A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,OR4F5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AL627309.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AL627309.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Transpose the prepared frame so that the values of its first column become the
# column labels and the former columns become the rows, then preview the result.
transponovano = data_transpose(data_filtered)
transponovano.head()

Unnamed: 0,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,AC114498.1,AL669831.2,...,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB,AC213203.1
3892576_1,0,0,0,0,0,0,0,0,0,0,...,22,27,14,26,20,21,6,3,13,0
3892576_2,0,0,0,0,0,0,0,0,0,0,...,16,25,21,18,12,9,10,4,6,0
3892576_3,0,0,0,0,0,0,0,0,0,0,...,18,26,15,37,20,17,11,1,9,0
3892576_4,0,0,0,0,0,0,0,0,0,0,...,17,24,24,26,13,10,7,6,15,0
3892576_5,0,0,0,0,0,0,0,0,0,0,...,19,25,21,34,22,27,3,1,16,0


In [15]:
# Largest value in the transposed frame: take the maximum of every row
# (ignoring missing values), then the maximum of those row maxima.
max(transponovano.max(axis = 1, skipna = True) )

1019.0