In [33]:
import pandas as pd
import numpy as np
import os
from more_itertools import locate
import time

In [34]:
genome_sample_df = pd.read_csv('genome_sample.csv')

### Primer: dobijem 'GSM3020393' vratim 'GRCh38'
genome_sample_map = pd.Series(genome_sample_df.GENOME.values, index=genome_sample_df.SAMPLE).to_dict()

common_human_map_genome = {"GRCh38" : "Ensembl_GRCh38.p12_rel94", 
                           "hg19" : "Ensembl_GRCh38.p12_rel94",
                           "hg38" : "Ensembl_GRCh38.p12_rel94",
                           "GRCh38 version 90" : "Ensembl_GRCh38.p12_rel94"}

common_human_list_df = pd.read_csv('common_human_list.csv')
ENSG_ID = list(common_human_list_df['ENSG_ID'])

ensembl_h38 = list(common_human_list_df['Ensembl_GRCh38.p12_rel94'])
ensembl_h38 = [x[x.find('#')+1:] for x in ensembl_h38]

map_hg = {"Ensembl_GRCh38.p12_rel94" : ensembl_h38}

In [90]:
### Primer: sample = 'GSM3892576', vratim 'GRCh38'
def get_genome_value(sample):
    return genome_sample_map[sample]

def data_transpose(data):
    data_transposed = data.transpose()
    data_transposed.columns = data_transposed[0:1].to_numpy()[0]
    data_transposed = data_transposed.drop(data_transposed.index[0])      
    
    return data_transposed


# 1. briše one ENSG redove kojih nema u common_human_list
# 2. zamenjuje ENSG vrednosti sa odgovarajućim vrednostima gena
# Format primer: ENSG00000163468_grch38_CCT3 
def change_ENSG_rows(raw_data, genome_mapped):
    start_length = len(raw_data)
    for_dropping = []

    for i in range(start_length):
        if raw_data['Index'][i] not in ENSG_ID:
            for_dropping.append(i) 
        
    raw_data_ENSG_filter = raw_data.drop(raw_data.index[for_dropping])
    raw_data_ENSG_filter = raw_data_ENSG_filter.reset_index(drop=True)
    
    data_len = len(raw_data_ENSG_filter)
    for i in range(data_len):
        
        index_name = raw_data_ENSG_filter["Index"][i]
        # TODO pitaj da li treba kao tamo bez # ili treba sa #
        tmp = list(common_human_list_df.loc[common_human_list_df["ENSG_ID"] \
                                            == index_name]
               [genome_mapped])[0]
        
        dot = tmp.find('.')
        if(dot != -1):
            tmp = tmp[1:dot]
        else:
            tmp = tmp[1:]
            
        # ENSG0000 -> uzmem samo E i ovo posle NSG00000    
        new_name = "E" + index_name[9:]
        raw_data_ENSG_filter.loc[i, 'Index'] = new_name + "_" + tmp
    return raw_data_ENSG_filter

def discard_not_allowed_rows(raw_data, genome_mapped):
    start_length = len(raw_data)
    for_dropping = []
    hg = map_hg[genome_mapped]
    
    for i in range(start_length):
        unnamed_index_name = raw_data['Unnamed: 0'][i]
        
        if unnamed_index_name not in hg:
            for_dropping.append(i)
        else:
            gene_search = "#" + unnamed_index_name

            tmp = list(common_human_list_df.loc[common_human_list_df[genome_mapped] \
                                                == gene_search]["ENSG_ID"])[0]
            # ENSG0000 -> uzmem samo E i ovo posle NSG00000
            new_name = "E" + tmp[9:]
            
            tmp_name = new_name + "_"
            dot_pos = unnamed_index_name.find('.')
            if(dot_pos != -1):
                raw_data.loc[i, 'Unnamed: 0'] = tmp_name + unnamed_index_name[:dot_pos]
            else:
                raw_data.loc[i, 'Unnamed: 0'] = tmp_name + unnamed_index_name
                
    raw_data_ENSG_filter = raw_data.drop(raw_data.index[for_dropping])
    raw_data_ENSG_filter = raw_data_ENSG_filter.reset_index(drop=True)
    
    return raw_data_ENSG_filter


def change_columns(raw_data_filter, folder_name):
    columns_array = range(0, len(raw_data_filter.columns))
    
    new_col_names = [folder_name + "_" + str(x) for x in columns_array]
    new_col_names[0] = "Index"
    
    raw_data_filter.columns = [new_col_names]     
    return raw_data_filter


def remove_NaN_values(raw_data):
    print(f'Dimenzije pre dropna -> {raw_data.shape}')
    raw_data_no_nan = raw_data.dropna()
    print(f'Dimenzije posle dropna -> {raw_data_no_nan.shape}')
    return raw_data_no_nan


def condition(data, threshold1 = 1000.0, threshold2 = 500.0):
    print("Condition data shape: ", data.shape)
    return list((np.sum(data, axis=0) >= threshold1) & \
                (data[data > 0.0].count() >= threshold2))

def prepare_data(folder_path, folder_name):
    data_filtered = 0
    
    csv_file_name = os.listdir(folder_path)[0]
    csv_path = os.path.join(folder_path, csv_file_name)
    raw_data = pd.read_csv(csv_path)
    
    raw_data = remove_NaN_values(raw_data)    
    
    genome = get_genome_value(folder)
    # imam ovde ono Ensembl_GRCh38.p12_rel94...
    genome_mapped = common_human_map_genome[genome]
    
    if "Index" in raw_data.columns:
        # onda imamo ovakve ne obrađene redove: 'ENSG00000243485'
        print("Obrada ENSG vrsta: ", csv_path)
        raw_data_filter = change_ENSG_rows(raw_data, genome_mapped)
        
        # AAACCTGAGCAGACTG-1 zameniti sa: x_1 ako je x = GSM2741551
        data_filtered = change_columns(raw_data_filter, folder_name)
        
    else: 
        # ovde imamo zamenjene vrednosti, ali nisu sve prisutne u common_list datoteci
        # treba da se čiste u zavisnosti od vrednosti u odgovarajućoj koloni za gen
        print("Uklanjanje redova: ", csv_path)
        raw_data_filter = discard_not_allowed_rows(raw_data, genome_mapped)  
        
        # ovde su kolone malo drugačije, ali isto ih preimenujemo
        # nemamo Index kolonu, već neku Unnamed: 0, preimenujem je na Index za sad?
        data_filtered = change_columns(raw_data_filter, folder_name)
        
    # ovde su redovi i kolone sređeni
    return data_filtered



In [91]:
### I grupa su oboleli pre tretmana
### II grupa su oboleli posle tretmana
### 2 foldera što su poslednja su Healthy i treba da vidimo šta sa njima

data_folders = ['GSM3511735'#, 'GSM3511747', 'GSM3511752', 'GSM3511757', 'GSM3511762',
                #'GSM3511766', 'GSM3721449', 'GSM3721453', 'GSM3721454', 'GSM3721455', 
                #'GSM3721457', 'GSM3721458', 'GSM3721460', 'GSM3721462', 
                #'GSM3721464',
               
                # 'GSM3511738', 
                #'GSM3511741'
                #'GSM3511749', 
                #'GSM3511754', 'GSM3511760', 
                 #'GSM3721445', 'GSM3721447', 'GSM3721448', 'GSM3721450', 'GSM3721451', 
                 #'GSM3721452', 'GSM3721456', 'GSM3721459', 'GSM3721461', 'GSM3721463', 
                 #'GSM3892576',
                
                 #'GSM3892571', 'GSM3892570'               
               ]

In [194]:
%%time
for folder in data_folders:
    print("-----------------------------------")
    print(f'Obrađuje se folder: {folder}')

    this_dir = os.getcwd()
    data_filtered = prepare_data(os.path.join(this_dir, folder), folder)
    print("Podaci u folderu: {} imaju dimenzije nakon obrade1 {}".format(folder, data_filtered.shape))
        
    # [True] + je zbog dodavanja 'Index' kolone :)
    data_filtered_drop_cell_cols = data_filtered.loc[:, [True] + condition(data_filtered.drop('Index', axis=1))]    
    print("Podaci u folderu: {} imaju dimenzije nakon obrade1 {}".format(folder, data_filtered_drop_cell_cols.shape))

    data_transposed = data_transpose(data_filtered_drop_cell_cols)    
    # obrisati nula redove
    temp = data_transposed.abs().sum(axis=1) == 0      

    data_no_zero_rows = data_transposed.drop(data_transposed[temp].index)    
    data_no_zero_rows_index_col = data_no_zero_rows.reset_index()
    data_no_zero_rows_index_col.rename(columns={'level_0':'Index'}, inplace=True)

    csv_path = os.path.join(os.path.join(this_dir, folder), "preprocessed1_" + folder + ".csv")
    print("Path: ", csv_path)
#     print("Spisak gena: ", list(data_no_zero_rows_index_col.columns)[1:])
    
    ### Čuvam preprocesiran fajl
#     data_no_zero_rows_index_col.to_csv(csv_path, index=False)
    print(f'Folder {folder} je obrađen.')
    print("-----------------------------------")

-----------------------------------
Obrađuje se folder: GSM3511735
Dimenzije pre dropna -> (23309, 884)
Dimenzije posle dropna -> (23309, 884)
Uklanjanje redova:  /Users/mandja96/Desktop/Skripta/GSM3511735/GSM3511735_su001_pre1_Tcell_RNA.csv
Podaci u folderu: GSM3511735 imaju dimenzije nakon obrade1 (17523, 884)
Condition data shape:  (17523, 883)


  new_axis = axis.drop(labels, errors=errors)


Podaci u folderu: GSM3511735 imaju dimenzije nakon obrade1 (17523, 871)
Path:  /Users/mandja96/Desktop/Skripta/GSM3511735/preprocessed1_GSM3511735.csv
Folder GSM3511735 je obrađen.
-----------------------------------
CPU times: user 1min 14s, sys: 1.13 s, total: 1min 15s
Wall time: 1min 16s


In [None]:
[x[0] for x in list(data_filtered_drop_cell_cols.columns)]

In [176]:
values = {1,2,3,4}

with open("file.txt", 'w') as output:
    for row in values:
        output.write(str(row) + '\n')

In [192]:
lineList = set([line.rstrip('\n') for line in open('file.txt')])

In [193]:
lineList

{'1', '2', '3', '4'}

In [195]:
data_no_zero_rows_index_col

Unnamed: 0,Index,E225880_LINC00115,E230368_FAM41C,E188976_NOC2L,E187961_KLHL17,E187583_PLEKHN1,E188290_HES4,E187608_ISG15,E131591_C1orf159,E186891_TNFRSF18,...,E165588_OTX2,E187105_HEATR4,E226800_CACTIN-AS1,E161180_CCDC116,E242908_AADACL2-AS1,E238171_AC068196,E204711_C9orf135,E245148_ARAP1-AS2,E073598_FNDC8,E100121_GGTLC2
0,GSM3511735_1,1,0,0,0,0,0,2,0,3,...,0,0,0,0,0,0,0,0,0,0
1,GSM3511735_2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,GSM3511735_3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,GSM3511735_4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,GSM3511735_5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,GSM3511735_6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,GSM3511735_7,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,GSM3511735_8,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,GSM3511735_9,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,GSM3511735_10,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# TODO
# 1. izdvoj spisak gena iz svakog fajla u poseban .txt fajl 
# 2. sacuvaj sredjen dataframe u novi fajl

# 3. uvezi sve .txt fajlove izdvojenih gena i uradi presek
# 4. sacuvaj to u finalni .txt fajl i razlike gena sacuvaj za svaku datoteku posebno

# 5. u odnosu na taj presek gena, ucitaj svaki fajl 
#    opet i ukloni one gene koji nisu u preseku

# 6. Vidi sta ces da radis za ovih p% vrednosti kada se spoje SVE datoteke...