# Imports and Working Directory

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import time

from scipy import stats
from scipy.stats import randint

from imblearn.over_sampling import SMOTE
from boruta import BorutaPy

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet

from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.tree import export_graphviz
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve 

%matplotlib inline

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
# Default project root; override by passing another directory to main().
PROJECT_DIR = "C:/Users/micha.DESKTOP-8HA2IGV/OneDrive/Programming/Propulsion Project/intelligencia_backup/intelligencia"


def main(target_dir=PROJECT_DIR):
    """Switch the process working directory to ``target_dir`` if it exists.

    Parameters
    ----------
    target_dir : str, optional
        Directory to change into. Defaults to ``PROJECT_DIR``.

    The current working directory is printed before the attempt, and again
    afterwards (either the new directory, or the unchanged one together with
    a notice that the change was not possible).
    """
    print("Current Working Directory " , os.getcwd())
    # isdir (rather than exists) so that a path pointing at a regular file
    # takes the "can't change" branch instead of making os.chdir raise.
    if os.path.isdir(target_dir):
        # Change the current working Directory
        os.chdir(target_dir)
        print("New Working Directory " , os.getcwd())
    else:
        print("Can't change the Current Working Directory")
        print("Current Working Directory " , os.getcwd())
if __name__ == '__main__':
    main()

Current Working Directory  C:\Users\micha.DESKTOP-8HA2IGV\OneDrive\Programming\Propulsion Project\intelligencia_backup\intelligencia
New Working Directory  C:\Users\micha.DESKTOP-8HA2IGV\OneDrive\Programming\Propulsion Project\intelligencia_backup\intelligencia


Preprocess a provided CSV file to smaller chunks with relevant data for our analysis.

# Data Exploration

## Data Acquisition and Exploration

- Xena platform collects gene expression data from different studies and combines them in clean, normalized datasets. Data that was used for this project includes phenotype and genotype information about 19131 samples (they update the data regularly - so this number might increase) with various tissue types.

https://xenabrowser.net/datapages/?cohort=TCGA%20TARGET%20GTEx&removeHub=https%3A%2F%2Fxena.tree
- Genotype:

https://xenabrowser.net/datapages/?dataset=TcgaTargetGtex_rsem_gene_tpm&host=https%3A%2F%2Ftoil.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443
- Phenotype:

https://xenabrowser.net/datapages/?dataset=TcgaTargetGTEX_phenotype.txt&host=https%3A%2F%2Ftoil.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443


In [3]:
# Take a look at phenotype information which will be used to separate the genotype information into individual chunks.
phenotype_path = 'Data/TcgaTargetGTEX_phenotype.txt'
# The separator is passed by keyword: pandas >= 2.0 accepts only the path positionally.
df_phenotype = pd.read_csv(phenotype_path, sep='\t')
df_phenotype.head()

Unnamed: 0,sample,detailed_category,primary disease or tissue,_primary_site,_sample_type,_gender,_study
0,TCGA-V4-A9EE-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
1,TCGA-VD-AA8N-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
2,TCGA-V4-A9EI-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
3,TCGA-VD-AA8O-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
4,TCGA-WC-A888-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA


In [4]:
# There are 19131 samples and six features with information about the samples.
df_phenotype.shape

(19131, 7)

In [4]:
# Genes are represented in Ensembl notation, including the version of the notation.
genotype_path = 'Output/Expression_Data_All/Raw_Data/TcgaTargetGtex_rsem_gene_tpm'
df = pd.read_csv(genotype_path, delimiter='\t', usecols=['sample'])
df.head()

Unnamed: 0,sample
0,ENSG00000242268.2
1,ENSG00000259041.1
2,ENSG00000270112.3
3,ENSG00000167578.16
4,ENSG00000278814.1


In [6]:
# The data includes gene expression values for 60498 genes.
df.shape

(2647, 1)

# Data Preprocessing

## Classes used

### Class: DataFilter 

Filter dataset to retrieve cancer-specific chunks of data.

Main function -> `split(sites, types=["Primary Tumor", "Normal Tissue"], genders=["Female", "Male"], categories=None)`
- Input: Site, tissue type, gender, detailed category.
- Return: Filtered dataframe. Also saves to csv.

Included functions:
- `init`: Define class variables.
- `get_columns`: Select for desired criteria.
- `get_output_path`: Create filepath for dataframe created by split.

In [5]:
from DataPreprocessing import DataFilter

### Class: DataPrep

Prepare data for feature selection algorithm by filtering, splitting and upsampling.

Main function -> `bulbasaur(path, threshold, nrows = None, usecols = None)`
- input: 
    - path: Directory path of gene expression data.
    - threshold: Standard deviation filter threshold.
- output: `X_train, y_train, x_test, y_test`.

Included functions:
- `read_data`.
- `X_and_y`.
- `split`: Train and test split.
- `smote_up`: Upsampling to get balanced dataset.

In [14]:
from DataPreprocessing import DataPrep

## Create and Store Filtered Chunks 

Because drug development mainly focuses on protein-coding genes, and in order to reduce dimensions, the data is filtered for protein-coding genes only.

In [6]:
# Define unique protein-coding genes.
def create_upcg(mart_path='Data/mart_export.txt'):
    """Return the set of unique protein-coding gene IDs from a mart export CSV.

    Parameters
    ----------
    mart_path : str, optional
        CSV file containing at least the columns 'Gene type' and
        'Gene stable ID'. Defaults to 'Data/mart_export.txt'.

    Returns
    -------
    set
        Unique 'Gene stable ID' values of the rows whose 'Gene type' equals
        'protein_coding'.
    """
    df_mart = pd.read_csv(mart_path)
    is_protein_coding = df_mart['Gene type'] == 'protein_coding'
    return set(df_mart.loc[is_protein_coding, 'Gene stable ID'].unique())

In [7]:
# Instantiate DataFilter class with paths of data and upcg function.
data_filter = DataFilter(genotype_path, phenotype_path, create_upcg())

In [8]:
# Create chunks by applying the split function. Store results as a csv.

#data_filter.split(["Lung"], categories=["Lung Squamous Cell Carcinoma", "Lung"])
# Output path: "Output/Chunk_LungSquamousCellCarcinoma_Lung.csv"

#data_filter.split(["Lung"], categories=["Lung Adenocarcinoma", "Lung"])
# Output path: "Output/Chunk_LungAdenocarcinoma_Lung.csv"

#data_filter.split(["Thyroid", "Thyroid Gland"])
# Output path: "Output/Chunk_Thyroid_ThyroidGland.csv"

#data_filter.split(["Colon"])
# Output path: "Output/Chunk_Colon.csv"

#data_filter.split(["Skin"])
# Output path: "Output/Chunk_Skin.csv"

#data_filter.split(["Breast"], genders=["Female"])
# Output path: "Output/Chunk_Breast.csv"

## Create Other Useful Chunks

### Create: All Sites, Cancerous and Healthy (12 labels)

In [None]:
# Paths of the per-site chunk files; paired position-by-position with `cancers`.
chunks = [
    "Output/Chunk_LungSquamousCellCarcinoma_Lung.csv",
    "Output/Chunk_LungAdenocarcinoma_Lung.csv",
    "Output/Chunk_Thyroid_ThyroidGland.csv",
    "Output/Chunk_Colon.csv",
    "Output/Chunk_Skin.csv",
    "Output/Chunk_Breast.csv"
]

# Short label suffix for each chunk, in the same order as `chunks`.
# Named `cancers` (plural) because the labeling loop iterates
# `zip(chunks, cancers)`; the former name `cancer` made that loop raise NameError.
cancers = [
    "lung_s",
    "lung_a",
    "thyroid",
    "colon",
    "skin",
    "breast"
]

In [None]:
# Create a dataframe with all chunks. Healthy tissue will be labeled with 0, cancerous with 1.
labeled_chunks = []
for chunk_path, cancer_name in zip(chunks, cancers):
    # Load the chunk; the first CSV column ("Unnamed: 0") becomes the index.
    chunk = pd.read_csv(chunk_path).set_index("Unnamed: 0")
    # Remove every ".<digits>" occurrence from the column names
    # (e.g. "ENSG00000242268.2" -> "ENSG00000242268"). Raw string avoids the
    # invalid-escape warning of '\.\d+'.
    chunk.columns = [re.sub(r'\.\d+', '', gene) for gene in chunk.columns]

    # Label chunks according to sample type: 1 -> "1_<cancer>", 0 -> "0_<cancer>".
    # Assign the result instead of using a chained inplace replace, which may
    # operate on a temporary and leave `chunk` unchanged.
    chunk["label"] = chunk["label"].replace(
        {1: "1_" + cancer_name, 0: "0_" + cancer_name}
    )

    labeled_chunks.append(chunk)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the growing frame on every iteration.
chunk_df = pd.concat(labeled_chunks)

In [None]:
chunk_df.to_csv("Output/Chunk_AllCancers.csv")

### Create: All Sites, Only Cancerous

In [6]:
chunk_df = pd.read_csv("Output/Chunk_AllCancers.csv", index_col = "Unnamed: 0")
# Cancerous samples carry labels of the form "1_<site>". Match that prefix
# explicitly: a plain contains("1") would also select healthy samples if a
# site name ever contained the digit 1.
chunk_cancer = chunk_df[chunk_df["label"].str.startswith("1_")]
chunk_cancer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2982 entries, TCGA-C8-A1HL-01 to TCGA-FY-A4B4-01
Columns: 19664 entries, label to ENSG00000181518
dtypes: float64(19663), object(1)
memory usage: 447.4+ MB


In [10]:
chunk_cancer.to_csv("Output/Chunk_AllCancers_1only.csv")

### Create: All Sites, Cancerous and Healthy (2 labels)

In [207]:
# Build the two-label frame in this cell so the notebook runs top to bottom on
# a fresh kernel (chunk_df_all was previously only defined in a later cell).
# The first character of each label is "0" (healthy) or "1" (cancerous).
chunk_df_all = chunk_df.copy()
chunk_df_all['label'] = chunk_df_all['label'].astype(str).str[0]
chunk_df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4781 entries, TCGA-C8-A1HL-01 to GTEX-OOBK-2626-SM-2HMKY
Columns: 19664 entries, label to ENSG00000181518
dtypes: float64(19663), object(1)
memory usage: 717.3+ MB


In [33]:
chunk_df_all = chunk_df.copy()
chunk_df_all['label'] = chunk_df_all['label'].astype(str).str[0]

In [35]:
chunk_df_all.to_csv("Output/Chunk_AllCancers_0vs1.csv")

### Create: Lung_A Cancerous and Lung_S Cancerous

In [40]:
# Keep only the cancerous lung samples ("1_lung_a" and "1_lung_s").
# .copy() gives an independent frame, so the label assignment below no longer
# writes into a slice of chunk_df (the cause of SettingWithCopyWarning).
lung_cancer_mask = chunk_df["label"].str.contains("1_lung")
Chunk_LungA1_vs_LungS1 = chunk_df.loc[lung_cancer_mask].copy()
Chunk_LungA1_vs_LungS1.info()
# Recode: "1_lung_s" -> "0"; every other label keeps its first character,
# so "1_lung_a" -> "1".
Chunk_LungA1_vs_LungS1["label"] = (
    Chunk_LungA1_vs_LungS1["label"]
    .replace("1_lung_s", "0")
    .astype(str)
    .str[0]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
Chunk_LungA1_vs_LungS1.to_csv("Output/Chunk_LungA1_vs_LungS1.csv")
#1_lung_a = 1
#1_lung_s = 0

# Gene Selection

1. Now the data has been separated into chunks and relevant genes can be selected.
- Five different models will be used to give each gene a feature importance.
- About 50 genes per sample type will be selected according to three criteria: Feature importance score, number of picks and overlaps with cosmic genes (platform which gives a list of tested cancer-associated genes):
https://cancer.sanger.ac.uk/cosmic
- These will all be combined in one dataframe.

## Classes used

### Class: FeatureSelection

Applies various feature selection algorithms and stores results (features + feature importance) in a dictionary.

Main function -> `call_methods(X_train, y_train, X_test, y_test, n_features = 300)`
- Input:
    - Dataset split into train and test 
    - `n_features` (number of features that should be selected by each method).
- Output: Dictionaries with selected features and feature importances.

Included functions:
- rfe: Recursive Feature Elimination.
- gradient_boost_classifier.
- elastic_net.
- boruta_tree: Boruta with Random Forest Classifier at the end.
- lasso_cv: Lasso with crossvalidation.

In [10]:
from GeneSelection import FeatureSelection

### Class: Evaluation

Stores results (Counts of genes selected, importances, overlaps with cosmic genes) in a dataframe. 
Cosmic: https://cancer.sanger.ac.uk/cosmic - you need a student e-mail to register and download data.

Main function -> `iterate_through_cancers(path_list, path_intogen_list, nrows, usecols, threshold = 2.5)`
- Input: 
    - `path_list` (filepath to expression data chunks)
    - `path_intogen_list` (filepath to cosmic data)
    - `threshold` for standard deviation filter
- Output: Stores results as csv files.

Included functions:
- Add Cosmic to Dict: Add a dictionary with Cosmic cancer-related genes.
- Results: Store results as csv file
- Normalize Importances: Normalize the importances and add a column with the Total Importance
- Final Results: Everything together in one df :)
- Iterate Through Cancers: Iterate through all cancer data

In [11]:
from GeneSelection import Evaluation

### Class: Filter Results

Filter for best genes based on different criteria (Total Count, Importance, Overlaps with public data).

Main function -> `charmander(results_list, chunk_names)`:
- `input`: chunk_names (names of cancer tissues), results_list (list of results dataframes created by Evaluation class).
- `output`: Dataframe with selected genes and additional information from Human Protein Atlas.

Included functions:
- `top`: Retrieve top genes (based on 1. Importance Score, 2. Total Count, 3. Overlaps with Cosmic Genes).
- `top_all`: Combines top genes from all cancer types.

In [12]:
from GeneSelection import FilterResults

## Select Top 400 Genes per Chunk

- Select top features in each sample type.
- Show model accuracies.
- Store results as csvs.

In [15]:
# Instantiate Evaluation class and FeatureSelection class
evaluation = Evaluation()
FS = FeatureSelection(1888)

In [None]:
# Reference file with Cosmic data, passed to Evaluation as `path_intogen_list`.
# NOTE(review): this list holds a single path while path_list below has eight
# entries; the commented-out `*7` suggests it was once repeated per chunk.
# Confirm how Evaluation pairs the two lists.
path_intogen_list = ["Data/Reference_Data/Census_allWed May 15 09_46_55 2019.csv"]#*7
# Expression-data chunks to run feature selection on.
path_list = ["Output/Chunk_Breast.csv",
            "Output/Chunk_LungAdenocarcinoma_Lung.csv",
            "Output/Chunk_LungSquamousCellCarcinoma_Lung.csv",
            "Output/Chunk_Skin.csv",
            "Output/Chunk_Thyroid_ThyroidGland.csv",
            "Output/Chunk_LungA1_vs_LungS1.csv",
            "Output/Chunk_AllCancers_0vs1.csv",
            "Output/Chunk_Colon.csv"]

In [None]:
# NOTE(review): the Evaluation description in this notebook spells the method
# `iterate_through_cancers`, but it is called here as `iterate_trough_cancers`.
# Confirm which spelling GeneSelection.Evaluation actually defines.
# threshold = 1 overrides the documented default (2.5) of the standard deviation filter.
evaluation.iterate_trough_cancers(path_list, path_intogen_list, nrows = None, usecols = None, threshold = 1)

## Select Top 70-90 Genes per Chunk and Combine in one Dataframe

- Select about 70-90 genes with three criteria
    - Top 30 genes sorted by feature importance score (across models).
    - Top 30 genes sorted by count of picks (across models).
    - Top 30 genes sorted by model pick overlaps with cosmic genes.

In [14]:
# Instantiate FilterResults
fr = FilterResults(1888) # 1888 = Seed

In [11]:
# Load the per-chunk result tables (first CSV column is the index) in one pass;
# the targets on the left line up one-to-one with the paths on the right.
(
    results_breast,
    results_lung_a,
    results_skin,
    results_lung_s,
    results_thyroid,
    results_all,
    results_colon,
) = [
    pd.read_csv(result_path, index_col=0)
    for result_path in (
        "Output/Results/Result_2.0_Breast.csv",
        "Output/Results/Result_2.0_LungAdenocarcinoma_Lung.csv",
        "Output/Results/Result_2.0_Skin.csv",
        "Output/Results/Result_2.0_LungSquamousCellCarcinoma_Lung.csv",
        "Output/Results/Result_2.0_Thyroid_ThyroidGland.csv",
        "Output/Results/Result_2.0_AllCancers_0vs1.csv",
        "Output/Results/Result_2.0_Colon.csv",
    )
]

In [15]:
# Chunk files in the same order as `chunks` / `results_list` below.
# NOTE(review): path_list is not referenced again in this cell — confirm it is still needed.
path_list = [
    "Output/Chunk_Skin.csv",
    "Output/Chunk_Thyroid_ThyroidGland.csv",
    "Output/Chunk_Colon.csv",
    "Output/Chunk_Breast.csv",
    "Output/Chunk_LungAdenocarcinoma_Lung.csv",
    "Output/Chunk_LungSquamousCellCarcinoma_Lung.csv",
    "Output/Chunk_AllCancers_0vs1.csv"]

# Tissue names, one per results dataframe.
# Note: this rebinds `chunks`, which earlier in the notebook held chunk file paths.
chunks = ["skin", "thyroid", "colon", "breast", "lung_a", "lung_s", "all"]

# List of results dataframes, in the same order as `chunks`
results_list = [results_skin, results_thyroid, results_colon, results_breast, results_lung_a, results_lung_s, results_all]

# Select the top genes per tissue and overall. The result is written with
# `.to_csv` in the next cell, and the FilterResults description calls it a
# dataframe with the selected genes (not a dictionary).
final_top_genes = fr.charmander(results_list, chunks)

skin :  68 	genes selected | 13 duplicates removed 	T: 18 ,I: 23 ,C: 15 ,TI: 6 ,TC: 5 ,IC: 0 ,TIC: 1
thyroid :  80 	genes selected | 4 duplicates removed 	T: 27 ,I: 29 ,C: 20 ,TI: 0 ,TC: 3 ,IC: 1 ,TIC: 0
colon :  81 	genes selected | 9 duplicates removed 	T: 22 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 0 ,TIC: 1
breast :  74 	genes selected | 7 duplicates removed 	T: 23 ,I: 23 ,C: 21 ,TI: 7 ,TC: 0 ,IC: 0 ,TIC: 0
lung_a :  81 	genes selected | 7 duplicates removed 	T: 25 ,I: 24 ,C: 25 ,TI: 4 ,TC: 1 ,IC: 2 ,TIC: 0
lung_s :  84 	genes selected | 6 duplicates removed 	T: 26 ,I: 26 ,C: 27 ,TI: 2 ,TC: 1 ,IC: 1 ,TIC: 1
all :  82 	genes selected | 8 duplicates removed 	T: 23 ,I: 23 ,C: 28 ,TI: 6 ,TC: 1 ,IC: 1 ,TIC: 0
Top genes overall:  422 genes selected | 128 duplicates removed


In [14]:
final_top_genes_path = "Output/Results/Final_genes/final_top_genes.csv"
final_top_genes.to_csv(final_top_genes_path)

# Gene Importance

## Classes used

### Class: FinalDataFrame

Apply XGBoost to get feature importance across classes and combine with results from GeneSelection.

Main function -> `squirtle(self, chunk_path, pickle, top_genes_path, save_to_path, top_genes_n = 20)`:
- Input: 
    - `chunk_path` (all cancers data chunk)
    - `pickle` (path for pickle file of XGBoost model) 
    - `top_genes_path` (results from GeneSelection library)
    - `save_to_path` (where to save)
    - `top_genes_n` (how many of the top genes of each tissue type from SHAP should be used)
- Output:
    Dataframe with all information about the genes.

Functions included:
- load_data
- run_store_model
- load_model
- get_genes_df
- add_combination
- combine_with_top_genes

In [1]:
from GeneImportance import FinalDataFrame

### Class: ExtractExpression

Extract subset of expression data with selected genes.

Functions included:
- final_genes_expression_data.

In [4]:
from GeneImportance import ExtractExpression

### Class: ShowImportance

Visualize SHAP importance
    
Functions included:
- importance_in_class
- importance_across_classes

In [22]:
from GeneImportance import ShowImportance 

## Apply models to get combined importance

In [2]:
fdf = FinalDataFrame(1888, top_genes_path = "Output/Results/Top_genes/top_genes_1.0.csv")

In [6]:
chunk_path = "Output/Chunk_AllCancers.csv"
X_train, y_train, X_test, y_test = fdf.load_data(chunk_path)
#fdf.run_store_model(X_train, y_train, X_test, y_test)

In [7]:
chunk_path = "Output/Chunk_AllCancers.csv"
filename = "Output/Models/all_cancers_model.sav"
final_top_genes_path = "Output/Results/Final_genes/final_top_genes.csv"
# NOTE(review): save_to_path is identical to final_top_genes_path, so the input
# file is presumably overwritten by the result — confirm this is intended.
save_to_path = "Output/Results/Final_genes/final_top_genes.csv"
# Pass the path defined above. The bare name `top_genes_path` is not assigned in
# any visible cell (it only appears as a keyword argument name), so using it
# here raised NameError on a fresh kernel.
df_final = fdf.squirtle(chunk_path, filename, final_top_genes_path, save_to_path, top_genes_n = 50)

In [8]:
import pickle  # not part of the notebook's import cell, so bring it into scope here

# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files that were created by this project.
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [5]:
extract = ExtractExpression()

In [None]:
# Destination for the expression subset restricted to the selected genes.
save_to_path = "Output/Results/Final_genes/expression_data_all_final_symbol_50.csv"
# NOTE(review): `all_final_path` is not assigned in any visible cell of this
# notebook; it must be defined before this cell can run — TODO confirm the intended file.
extract.final_genes_expression_data(chunk_path, all_final_path, save_to_path)

In [15]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200 entries, 400 to 237
Data columns (total 5 columns):
Gene                1200 non-null object
Shap_Importance     1200 non-null object
Cancer              1200 non-null object
Combination         1200 non-null object
SHAP_Combination    1200 non-null object
dtypes: object(5)
memory usage: 56.2+ KB


In [19]:
df_final.to_csv("Output/Results/Final_genes/final_genes_50.csv")

In [17]:
top_genes = pd.read_csv("Output/Results/Final_genes/final_top_genes.csv")