# Splicing Datasets (ENST)

Because this dataset is very large, we will save the csv in batches.
Before doing so we need to select the data we want and transform it in a way that we can use the data.

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
import os
def main():
    print("Current Working Directory " , os.getcwd())
    if os.path.exists("C:/Users/micha.DESKTOP-8HA2IGV/OneDrive/Programming/Propulsion Project/intelligencia_backup/intelligencia") :
        # Change the current working Directory    
        os.chdir("C:/Users/micha.DESKTOP-8HA2IGV/OneDrive/Programming/Propulsion Project/intelligencia_backup/intelligencia")
        print("New Working Directory " , os.getcwd())
    else:
        print("Can't change the Current Working Directory")    
        print("Current Working Directory " , os.getcwd())
if __name__ == '__main__':
    main()

Current Working Directory  C:\Users\micha.DESKTOP-8HA2IGV\OneDrive\Programming\Propulsion Project\intelligencia_backup\intelligencia\Mike
New Working Directory  C:\Users\micha.DESKTOP-8HA2IGV\OneDrive\Programming\Propulsion Project\intelligencia_backup\intelligencia


## Dividing into batches

In [2]:
class FilterData:
    def get_csv(self, path_txt, primary_sites, sample_types = ["Primary Tumor", "Normal Tissue"], genders = ["Female", "Male"], detailed_category = "All", version = "1.0"):
        """
        path_txt = original file
        primary_sites = list with all types of tissue
        sample_types = list with all types of samples
        version = version of csv
        """
        df_phenotype = pd.read_csv("Data/Expression_Data/Raw_Data/TcgaTargetGTEX_phenotype.txt", delimiter="\t")
       
        df2 = df_phenotype[df_phenotype['_primary_site'].isin(primary_sites)]
        df2 = df2[df2['_gender'].isin(genders)]
        
        if detailed_category != "All":
            df2 = df2[df2['detailed_category'].isin(detailed_category)]
        
        cols = df2[df2["_sample_type"].isin(sample_types)]["sample"].tolist()
        
        df = pd.read_csv(path_txt, delimiter="\t", usecols=['sample'] + cols) #end
        
        # Set index and transpose
        df = df.set_index("sample").T

        # Make label column
        df["label"] = df.index.astype(str).str[0]

        # Set labels to 0 and 1
        df.label.replace({"T": 1, "G": 0}, inplace=True)

        # Put the label column on first col index
        cols = df.columns.tolist()
        cols = [cols[-1]]+cols[:-1] # or whatever change you need
        df = df.reindex(columns=cols)

        # Save dataframe
        
        df.to_csv("Data/Expression_Data/{}_{}_{}_{}.csv".format(primary_sites, detailed_category, sample_types, version))
        print("Finish")

filter_data = FilterData()

In [25]:
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Lung"], detailed_category = ["Lung Squamous Cell Carcinoma", "Lung"])
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Thyroid", "Thyroid Gland"])
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Colon"])
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Skin"])
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Breast"], genders = ["Female"])
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Lung"], detailed_category = ["Lung Adenocarcinoma", "Lung"])

Finish

Finish

Finish

Finish

Finish



In [6]:
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Lung"], detailed_category = ["Lung Adenocarcinoma", "Lung"])
filter_data.get_csv("Data/Expression_Data/Raw_Data/TcgaTargetGtex_rsem_gene_tpm", ["Lung"], detailed_category = ["Lung Squamous Cell Carcinoma", "Lung"])

Finish
Finish


In [29]:
filter_data.get_csv("Data/TcgaTargetGtex_rsem_gene_tpm", ["Lung"], version = "2")

Finish



In [1]:
import pandas as pd

df = pd.DataFrame({"Genes":[1,2,3,3,4,4,4,5], "Label":["a","b","a","b","a","b","c","c"]})

In [2]:
df

Unnamed: 0,Genes,Label
0,1,a
1,2,b
2,3,a
3,3,b
4,4,a
5,4,b
6,4,c
7,5,c


In [7]:
df["Genes"].count()

8

In [29]:
genes = [1,2,3, 4]
labels = []
for gene in genes:
    labels.append(df[df["Genes"] == gene]["Label"].sum())
pd.DataFrame({"Labels":labels, "Genes":genes})

In [98]:
results_breast = pd.read_csv("Output/Results/Result_2.0_Breast.csv", index_col=0)
results_lung_a = pd.read_csv("Output/Results/Result_2.0_LungAdenocarcinoma_Lung.csv", index_col=0)
results_skin = pd.read_csv("Output/Results/Result_2.0_Skin.csv", index_col=0)
results_lung_s = pd.read_csv("Output/Results/Result_2.0_LungSquamousCellCarcinoma_Lung.csv", index_col=0)
results_thyroid = pd.read_csv("Output/Results/Result_2.0_Thyroid_ThyroidGland.csv", index_col=0)
results_all = pd.read_csv("Output/Results/Result_2.0_AllCancers_0vs1.csv", index_col=0)
results_colon = pd.read_csv("Output/Results/Result_2.0_Colon.csv", index_col=0)

chunk_names = ["skin", "thyroid", "colon", "breast", "lung_a", "lung_s", "all"]
results_list = [results_skin, results_thyroid, results_colon, results_breast, results_lung_a, results_lung_s, results_all]

In [60]:
dictionary = {}
for results, chunk_name in zip(results_list, chunk_names):
    # Top genes
    top = []
    # Chosen based on Count (C), Importance Score (I) or Cosmic Overlap (O).
    top_labels = []
    
    top_30_count = [results.sort_values(by="Total Count", ascending = False).head(30).index.tolist(), ["C"]*30]
    top_30_score = [results.sort_values(by="Importance Score", ascending = False).head(30).index.tolist(), ["S"]*30]

    results = results[results["Total Count"] > 1]
    cosmic_genes = results[results["Cosmic"] == 1].index.tolist()[:30]
    top_30_cosmic = [cosmic_genes, ["O"]*len(cosmic_genes)]
    
    for lst in [top_30_count, top_30_score, top_30_cosmic]:
        for element in lst[0]:
            top.append(element)
        for element in lst[1]:
            top_labels.append(element)
    
    top_df = pd.DataFrame({"Genes":top, "Criteria":top_labels})
    

In [100]:
dictionary = {}
for results, chunk_name in zip(results_list, chunk_names):
    # Top genes
    top = []
    # Chosen based on Count (C), Importance Score (I) or Cosmic Overlap (O).
    top_criteria = []
    
    top_30_count = [results.sort_values(by="Total Count", ascending = False).head(30).index.tolist(), ["T"]*30]
    top_30_score = [results.sort_values(by="Importance Score", ascending = False).head(30).index.tolist(), ["I"]*30]

    results = results[results["Total Count"] > 1]
    cosmic_genes = results[results["Cosmic"] == 1].index.tolist()[:30]
    top_30_cosmic = [cosmic_genes, ["C"]*len(cosmic_genes)]
    
    for lst in [top_30_count, top_30_score, top_30_cosmic]:
        for element in lst[0]:
            top.append(element)
        for element in lst[1]:
            top_criteria.append(element)
    top_df = pd.DataFrame({"Gene":top, "Criteria":top_criteria})
    
    # Create Dataframe with duplicate genes removed and remaining labelled according to matching criteria.
    seen = set()
    top_unique = [x for x in top if x not in seen and not seen.add(x)]
    top_criteria_code = []
    
    for gene in top_unique:
        top_criteria_code.append(top_df[top_df["Gene"] == gene]["Criteria"].sum())
    
    dictionary[chunk_name] = [top_unique, top_criteria_code]
    print(chunk_name, ": ", str(len(top_unique)), "\tgenes selected |", str(len(top)-len(top_unique)), "duplicates removed",
         "\tT:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "T"].count()),
         ",I:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "I"].count()),
         ",C:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "C"].count()),
         ",TI:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "TI"].count()),
         ",TC:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "TC"].count()),
         ",IC:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "IC"].count()),
         ",TIC:", str(top_unique_df["Gene"][top_unique_df["Criteria"] == "TIC"].count()),)

    return dictionary

skin :  68 	genes selected | 13 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
thyroid :  80 	genes selected | 4 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
colon :  81 	genes selected | 9 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
breast :  74 	genes selected | 7 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
lung_a :  81 	genes selected | 7 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
lung_s :  84 	genes selected | 6 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
all :  82 	genes selected | 8 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0


In [101]:
dictionary

{'skin': [['ENSG00000048462',
   'ENSG00000006128',
   'ENSG00000250254',
   'ENSG00000171094',
   'ENSG00000100721',
   'ENSG00000175206',
   'ENSG00000109906',
   'ENSG00000090932',
   'ENSG00000153002',
   'ENSG00000126752',
   'ENSG00000081479',
   'ENSG00000180318',
   'ENSG00000170486',
   'ENSG00000124900',
   'ENSG00000185686',
   'ENSG00000204677',
   'ENSG00000197172',
   'ENSG00000172867',
   'ENSG00000140506',
   'ENSG00000125816',
   'ENSG00000186810',
   'ENSG00000205649',
   'ENSG00000228144',
   'ENSG00000147889',
   'ENSG00000070031',
   'ENSG00000145244',
   'ENSG00000104901',
   'ENSG00000206172',
   'ENSG00000120215',
   'ENSG00000237988',
   'ENSG00000168010',
   'ENSG00000110080',
   'ENSG00000118495',
   'ENSG00000168333',
   'ENSG00000120054',
   'ENSG00000124935',
   'ENSG00000204882',
   'ENSG00000153822',
   'ENSG00000140287',
   'ENSG00000213927',
   'ENSG00000111245',
   'ENSG00000186160',
   'ENSG00000267001',
   'ENSG00000168530',
   'ENSG00000172824',
  

In [97]:
top_unique_df[top_unique_df["Criteria"] == "TIC"]

Unnamed: 0,Gene,Criteria


In [None]:
t

In [91]:
top_unique_df

Unnamed: 0,Gene,Criteria
0,ENSG00000149451,TI
1,ENSG00000122304,TI
2,ENSG00000092054,TI
3,ENSG00000137392,T
4,ENSG00000156234,T
5,ENSG00000009709,TC
6,ENSG00000111245,T
7,ENSG00000122641,T
8,ENSG00000128610,T
9,ENSG00000175646,T


In [43]:
top_30_count = [results.sort_values(by="Total Count", ascending = False).head(30).index.tolist(), ["Count"]*30]

In [45]:
top_30_count[1]

['Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count',
 'Count']