# Splicing Datasets (ENST)

Because this dataset is very large, we save it as several smaller CSV files, one per selection of samples.
Before saving, we select the samples we want and transform the data into a layout we can work with.

In [1]:
import pandas as pd
from tqdm import tqdm

## Dividing into batches

In [2]:
class FilterData:
    """Select samples from a tab-separated expression matrix and save them as a labelled CSV."""

    # Label derived from the first character of a sample id. Ids starting
    # with any other character keep that character as their label.
    _LABEL_CODES = {"T": 1, "G": 0}

    # The list defaults below are never mutated; they are kept as lists because
    # their string form is embedded in the output file name.
    def get_csv(self, path_txt, primary_sites, sample_types = ["Primary Tumor", "Normal Tissue"], genders = ["Female", "Male"], detailed_category = "All", version = "1.0", phenotype_path = "Data/Expression_Data/Raw_Data/TcgaTargetGTEX_phenotype.txt", output_dir = "Data/Expression_Data"):
        """
        Filter samples by phenotype, transpose the matrix and write it to CSV.

        path_txt = tab-separated expression file with a "sample" column and
                   one column per sample id
        primary_sites = list of `_primary_site` values to keep
        sample_types = list of `_sample_type` values to keep
        genders = list of `_gender` values to keep
        detailed_category = "All" to skip this filter, otherwise a list of
                            `detailed_category` values to keep (a single
                            string is treated as a one-element list)
        version = version tag appended to the csv file name
        phenotype_path = tab-separated phenotype table with the columns
                         `sample`, `_primary_site`, `_gender`, `_sample_type`
                         and `detailed_category`
        output_dir = existing directory the csv is written to

        The csv has one row per selected sample, a leading "label" column
        (1 for ids starting with "T", 0 for ids starting with "G") and one
        column per row of the input matrix. Returns None.
        """
        phenotype = pd.read_csv(phenotype_path, delimiter="\t")

        skip_category_filter = isinstance(detailed_category, str) and detailed_category == "All"
        # A bare string would make Series.isin raise; treat it as one category.
        if isinstance(detailed_category, str):
            categories = [detailed_category]
        else:
            categories = detailed_category

        keep = (
            phenotype["_primary_site"].isin(primary_sites)
            & phenotype["_gender"].isin(genders)
            & phenotype["_sample_type"].isin(sample_types)
        )
        if not skip_category_filter:
            keep &= phenotype["detailed_category"].isin(categories)

        sample_ids = phenotype.loc[keep, "sample"].tolist()

        # Only load the selected sample columns; the full matrix is very large.
        expression = pd.read_csv(path_txt, delimiter="\t", usecols=["sample"] + sample_ids)

        # Set index and transpose: samples become rows.
        df = expression.set_index("sample").T

        # Build the label column directly and insert it as the first column.
        # This avoids an in-place replace on a Series taken from the frame,
        # which does not update the frame under pandas Copy-on-Write.
        first_chars = df.index.astype(str).str[0]
        labels = [self._LABEL_CODES.get(ch, ch) for ch in first_chars]
        df.insert(0, "label", labels)

        # Save dataframe (file name layout unchanged from the original).
        df.to_csv("{}/{}_{}_{}_{}.csv".format(output_dir, primary_sites, detailed_category, sample_types, version))
        print("Finish")

filter_data = FilterData()

In [25]:
# Export one CSV per selection; every call reads the same raw matrix file.
RAW_TPM_PATH = "Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm"

# (primary_sites, extra keyword filters) for each export, in execution order.
EXPORT_JOBS = [
    (["Lung"], {"detailed_category": ["Lung Squamous Cell Carcinoma", "Lung"]}),
    (["Thyroid", "Thyroid Gland"], {}),
    (["Colon"], {}),
    (["Skin"], {}),
    (["Breast"], {"genders": ["Female"]}),
    (["Lung"], {"detailed_category": ["Lung Adenocarcinoma", "Lung"]}),
]

for primary_sites, extra_filters in EXPORT_JOBS:
    filter_data.get_csv(RAW_TPM_PATH, primary_sites, **extra_filters)

Finish

Finish

Finish

Finish

Finish



In [6]:
# Export the two lung subtypes, each combined with the "Lung" category.
for lung_subtype in ("Lung Adenocarcinoma", "Lung Squamous Cell Carcinoma"):
    filter_data.get_csv(
        "Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm",
        ["Lung"],
        detailed_category=[lung_subtype, "Lung"],
    )

Finish
Finish


In [29]:
# Lung export without a detailed-category filter, saved under version "2".
# NOTE(review): this reads the matrix from Data/ rather than
# Data/Expression_Data/Raw_Data/ as the other cells do; confirm the path.
filter_data.get_csv(
    path_txt="Data/TcgaTargetGtex_rsem_gene_tpm",
    primary_sites=["Lung"],
    version="2",
)

Finish

