In [30]:
import pandas as pd
import numpy as np
import os

In [31]:
# Table relating each sample accession (SAMPLE) to its genome label (GENOME).
genome_sample_df = pd.read_csv('genome_sample.csv')

### Example: given 'GSM3020393' the map returns 'GRCh38'
genome_sample_map = genome_sample_df.set_index('SAMPLE')['GENOME'].to_dict()

# print(np.unique(list(genome_sample_map.values())))

# Genome label as it appears in genome_sample.csv -> column of
# common_human_list.csv that holds the gene names for that genome.
common_human_map_genome = {
    "GRCh38": "Ensembl_GRCh38.p12_rel94",
    "hg19": "hg19",
    "hg38": "hg38",
    "GRCh38 version 90": "Ensembl_GRCh38.p12_rel94",
}

common_human_list_df = pd.read_csv('common_human_list.csv')
ENSG_ID = common_human_list_df['ENSG_ID'].tolist()

# For the hg* columns keep only the text after the last '_';
# for the Ensembl column keep only the text after the last '#'.
hg19 = [label.rsplit('_', 1)[-1] for label in common_human_list_df['hg19']]
hg37 = [label.rsplit('_', 1)[-1] for label in common_human_list_df['hg37']]
hg38 = [label.rsplit('_', 1)[-1] for label in common_human_list_df['hg38']]
ensembl_h38 = [
    label.rsplit('#', 1)[-1]
    for label in common_human_list_df['Ensembl_GRCh38.p12_rel94']
]

# Reference column name -> list of gene names allowed for that genome.
map_hg = {
    "hg19": hg19,
    "hg37": hg37,
    "hg38": hg38,
    "Ensembl_GRCh38.p12_rel94": ensembl_h38,
}

In [32]:
### Example: sample = 'GSM3892576'
def get_genome_value(sample):
    """Return the genome label recorded for `sample` in `genome_sample_map`.

    Raises KeyError when the sample is not present in the map.
    """
    genome_label = genome_sample_map[sample]
    return genome_label


def data_transpose(data):
    """Transpose `data` and promote the first transposed row to the header.

    After transposing, the values of the first row become the column labels
    and that row is then removed from the result.
    """
    flipped = data.T
    # The first row of the transposed frame holds the future column labels.
    flipped.columns = flipped.iloc[0].to_numpy()
    # Drop that row (by its label) now that it lives in the header.
    header_label = flipped.index[0]
    return flipped.drop(index=header_label)


# 1. drops the ENSG rows that are not present in common_human_list
# 2. replaces the ENSG values with the corresponding gene names
def change_ENSG_rows(raw_data, genome_mapped):
    """Keep only known ENSG rows and replace each ENSG id by its gene name.

    Parameters
    ----------
    raw_data : DataFrame with an 'Index' column holding ENSG identifiers.
    genome_mapped : name of the `common_human_list_df` column to take the
        gene names from.

    Returns
    -------
    A new DataFrame (index reset to 0..n-1) containing only the rows whose
    'Index' value occurs in `ENSG_ID`, with 'Index' rewritten to the gene
    name. `raw_data` itself is not modified.

    Rows are selected with a boolean mask rather than by looking up
    `raw_data['Index'][i]` for a positional counter `i`, so the function also
    works when `raw_data` does not carry a 0..n-1 index (e.g. after dropna()).
    """
    known_rows = raw_data['Index'].isin(ENSG_ID)
    filtered = raw_data.loc[known_rows].reset_index(drop=True)

    # One ENSG -> label dictionary instead of scanning the reference frame
    # once per row. keep='first' mirrors taking the first matching row.
    reference = common_human_list_df.drop_duplicates(subset='ENSG_ID', keep='first')
    ensg_to_label = dict(zip(reference['ENSG_ID'], reference[genome_mapped]))

    # TODO: ask whether the '#' should be kept, and whether the label should
    # be 'hg38_something' or only 'something'. For now the genome prefix and
    # the '#' are removed: a leading '#' is stripped, otherwise everything up
    # to and including the last '_' is stripped.
    def _strip_prefix(label):
        if label.startswith('#'):
            return label[1:]
        return label[label.rfind('_') + 1:]

    filtered['Index'] = [
        _strip_prefix(ensg_to_label[ensg]) for ensg in filtered['Index']
    ]
    return filtered


def discard_not_allowed_rows(raw_data, genome_mapped):
    """Drop the rows whose gene name is not allowed for the given genome.

    Parameters
    ----------
    raw_data : DataFrame whose 'Unnamed: 0' column holds the gene names.
    genome_mapped : key into `map_hg` selecting the list of allowed names.

    Returns
    -------
    A new DataFrame with only the allowed rows and the index reset to
    0..n-1. `raw_data` itself is not modified.

    The selection uses a boolean mask instead of `raw_data['Unnamed: 0'][i]`
    with a positional counter, so it does not depend on `raw_data` having a
    0..n-1 index (which is not the case after rows were removed by dropna()).
    """
    allowed_names = map_hg[genome_mapped]
    is_allowed = raw_data['Unnamed: 0'].isin(allowed_names)
    return raw_data.loc[is_allowed].reset_index(drop=True)


def change_columns(raw_data_filter, folder_name):
    """Rename the columns of `raw_data_filter` in place and return it.

    The first column becomes "Index"; the column at position k (k >= 1)
    becomes "<id>_<k>", where <id> is `folder_name` without its first three
    characters (e.g. 'GSM2741551' -> '2741551_1', '2741551_2', ...).

    NOTE: the labels are assigned wrapped in an outer list, which makes
    pandas build a one-level MultiIndex for the columns. The driver loop
    relies on this: after transposing and reset_index() it renames the
    resulting 'level_0' column.
    """
    sample_id = folder_name[3:]
    labels = [f"{sample_id}_{position}" for position in range(len(raw_data_filter.columns))]
    labels[0] = "Index"

    raw_data_filter.columns = [labels]
    return raw_data_filter


def remove_NaN_values(raw_data):
    """Return `raw_data` without the rows that contain a missing value.

    The shape before and after the removal is printed. The surviving rows
    keep their original index labels (the index is not reset).
    """
    shape_before = raw_data.shape
    print(f'Dimenzije pre obrade -> {shape_before}')

    without_missing = raw_data.dropna(axis=0, how='any')
    print(f'Dimenzije posle dropna -> {without_missing.shape}')
    return without_missing


def prepare_data(folder_path, folder_name):
    """Load the raw table of one sample folder and normalise rows and columns.

    Parameters
    ----------
    folder_path : directory that contains the sample's raw CSV file.
    folder_name : sample accession (e.g. 'GSM2741551'); used to look up the
        genome and to build the new column names.

    Returns
    -------
    DataFrame with the rows filtered against the common gene list and the
    columns renamed by `change_columns`.

    Raises
    ------
    FileNotFoundError if the folder holds no usable input file.
    """
    # os.listdir() returns entries in arbitrary order, and the driver loop
    # writes 'preprocessed1_<sample>.csv' into this same folder, so taking
    # entry [0] blindly can pick up a previous output or a hidden file such
    # as '.DS_Store'. Skip those and choose deterministically.
    candidates = sorted(
        entry for entry in os.listdir(folder_path)
        if not entry.startswith('.') and not entry.startswith('preprocessed1_')
    )
    if not candidates:
        raise FileNotFoundError(f'No input file found in {folder_path}')
    csv_path = os.path.join(folder_path, candidates[0])
    raw_data = pd.read_csv(csv_path)

    # dropna() leaves gaps in the index; restore a 0..n-1 index so that
    # positional and label-based access agree in the row filters.
    raw_data = remove_NaN_values(raw_data).reset_index(drop=True)

    # Use the parameter, not a global loop variable, to look up the genome.
    genome = get_genome_value(folder_name)
    genome_mapped = common_human_map_genome[genome]

    #print("Genome_mapped: ", genome_mapped)
    if "Index" in raw_data.columns:
        # rows are still unprocessed ENSG identifiers such as 'ENSG00000243485'
        print("Obrada ENSG vrsta: ", csv_path)
        raw_data_filter = change_ENSG_rows(raw_data, genome_mapped)
    else:
        # rows already carry gene names, but not all of them are present in
        # the common list; filter them against the column that matches the
        # genome (hg19, hg38, ...)
        print("Uklanjanje redova: ", csv_path)
        raw_data_filter = discard_not_allowed_rows(raw_data, genome_mapped)

    # Cell barcodes such as 'AAACCTGAGCAGACTG-1' are replaced by x_1, x_2, ...
    # where x = 2741551 for GSM2741551; the first column (either 'Index' or
    # 'Unnamed: 0') is named 'Index'.
    # rows and columns are both normalised at this point
    return change_columns(raw_data_filter, folder_name)

In [39]:
### Group I: patients before treatment
### Group II: patients after treatment
### The last 2 folders are Healthy and we still have to decide what to do with them

data_folders = [
    # --- first group (remaining entries currently disabled) ---
    # 'GSM3511735', 'GSM3511747', 'GSM3511752', 'GSM3511757', 'GSM3511762',
    # 'GSM3511766', 'GSM3721449', 'GSM3721453', 'GSM3721454', 'GSM3721455',
    # 'GSM3721457', 'GSM3721458', 'GSM3721460', 'GSM3721462',
    'GSM3721464',

    # --- second group ---
    'GSM3511738',
    'GSM3511741',
    'GSM3511749',
    'GSM3511754',
    'GSM3511760',
    'GSM3721445',
    'GSM3721447',
    'GSM3721448',
    'GSM3721450',
    'GSM3721451',
    'GSM3721452',
    'GSM3721456',
    'GSM3721459',
    'GSM3721461',
    'GSM3721463',
    'GSM3892576',

    # --- last two folders ---
    'GSM3892571',
    'GSM3892570',
]

In [40]:
# Preprocess every sample folder and write the result next to the raw file
# as 'preprocessed1_<folder>.csv'.
for folder in data_folders:
    print("-----------------------------------")
    print(f'Obrađuje se folder: {folder}')

    this_dir = os.getcwd()
    folder_path = os.path.join(this_dir, folder)

    data_filtered = prepare_data(folder_path, folder)
    print("Folder: {} ima dimenzije nakon obrade {}".format(folder, data_filtered.shape))

    data_transposed = data_transpose(data_filtered)

    # remove the rows whose absolute values sum to zero
    is_zero_row = data_transposed.abs().sum(axis=1) == 0
    zero_row_labels = data_transposed[is_zero_row].index
    data_no_zero_rows = data_transposed.drop(zero_row_labels)

    # Turn the row labels into a regular column; reset_index() names it
    # 'level_0' here, and it is renamed to 'Index'.
    data_no_zero_rows_index_col = data_no_zero_rows.reset_index()
    data_no_zero_rows_index_col.rename(columns={'level_0': 'Index'}, inplace=True)

    csv_path = os.path.join(folder_path, "preprocessed1_" + folder + ".csv")
    #print("Path: ", csv_path)
    ### Save the preprocessed file
    data_no_zero_rows_index_col.to_csv(csv_path, index=False)
    print(f'Folder {folder} je obrađen.')
    print("-----------------------------------")

-----------------------------------
Obrađuje se folder: GSM3721464
Dimenzije pre obrade -> (18347, 2809)
Dimenzije posle dropna -> (18347, 2809)
Uklanjanje redova:  /Users/mandja96/Desktop/Skripta/GSM3721464/GSM3721464_su014_pre_Tcell_RNA.csv
Folder: GSM3721464 ima dimenzije nakon obrade (14573, 2809)
Folder GSM3721464 je obrađen.
-----------------------------------
-----------------------------------
Obrađuje se folder: GSM3511738
Dimenzije pre obrade -> (23309, 3950)
Dimenzije posle dropna -> (23309, 3950)
Uklanjanje redova:  /Users/mandja96/Desktop/Skripta/GSM3511738/GSM3511738_su001_post_Tcell_RNA.csv
Folder: GSM3511738 ima dimenzije nakon obrade (17523, 3950)
Folder GSM3511738 je obrađen.
-----------------------------------
-----------------------------------
Obrađuje se folder: GSM3511741
Dimenzije pre obrade -> (23309, 179)
Dimenzije posle dropna -> (23309, 179)
Uklanjanje redova:  /Users/mandja96/Desktop/Skripta/GSM3511741/GSM3511741_su002_post_Tcell_RNA.csv
Folder: GSM3511741 

In [42]:
# Reload one of the files written by the preprocessing loop as a spot check.
proba = pd.read_csv("GSM3721463/preprocessed1_GSM3721463.csv")

In [43]:
# Preview the first rows of the reloaded file.
proba.head()

Unnamed: 0,Index,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,C1orf159,...,CDH26,ZNF233,NKPD1,SLC17A7,CRYBB3,LINC01589,MIOX,TCP10L,TFF3,LINC00334
0,3721463_1,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3721463_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3721463_3,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3721463_4,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3721463_5,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Use the 'Index' column as the row index and preview the result.
proba_index = proba.set_index('Index')
proba_index.head()

Unnamed: 0_level_0,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,C1orf159,TNFRSF18,...,CDH26,ZNF233,NKPD1,SLC17A7,CRYBB3,LINC01589,MIOX,TCP10L,TFF3,LINC00334
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3721463_1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3721463_2,0,0,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
3721463_3,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3721463_4,0,0,0,0,0,0,2,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3721463_5,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
