<a href="https://colab.research.google.com/github/morshedik/Cancer-Classification-Project/blob/main/ML_Project1_BulkRNA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive to access the data files in Colab

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the TCGA files can be read
# by path. force_remount=True requests a fresh mount even if one already exists.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

In [None]:
# List the contents of the TCGA data directory on the mounted Drive.
!ls "/content/drive/My Drive/Colab Notebooks/TCGA_Data"

In [None]:
import pandas as pd

# Locations of the two TCGA Pan-Cancer inputs on the mounted Drive:
# the expression matrix and the per-sample phenotype table.
rnaseq_path = '/content/drive/My Drive/Colab Notebooks/TCGA_Data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'
metadata_path = '/content/drive/My Drive/Colab Notebooks/TCGA_Data/TCGA_phenotype_denseDataOnlyDownload.tsv'

# Both files are tab-separated and are re-read every time this cell runs.
metadata = pd.read_csv(metadata_path, sep='\t')
data = pd.read_csv(rnaseq_path, sep='\t')

# Reshape the expression matrix so that each row is one sample:
#   1. use the file's 'sample' column as the row index
#      (presumably it holds gene identifiers despite its header - confirm),
#   2. transpose, so the former column headers become the rows,
#   3. turn those row labels back into a regular column named 'sample'.
# Kept as a single chain so no large intermediate frame stays referenced.
data = (
    data
    .set_index('sample')
    .T
    .reset_index()
    .rename(columns={'index': 'sample'})
)

# Inner-join expression with phenotype on the sample ID, then expose the
# disease label under a name without the leading underscore.
merged_data = (
    data
    .merge(metadata, on='sample', how='inner')
    .rename(columns={'_primary_disease': 'primary_disease'})
)

# Quick look at the merged result.
print("Merged Data:")
print(merged_data.head())
print("shape:", merged_data.shape)


**Confirm the merge against the metadata file**


In [None]:
# Sanity check for the merge: look up sample TCGA-EE-A29N-06 in merged_data
# so its row (expression values plus phenotype columns) can be compared by eye
# against the metadata file.
is_target_sample = merged_data['sample'].eq('TCGA-EE-A29N-06')
sample_info = merged_data.loc[is_target_sample]

# Last expression in the cell, so the notebook renders it as a table.
sample_info


# Explore the Data

In [None]:
# Overall dimensions. After the transpose in the loading cell, each row is a
# sample and the columns are 'sample', the gene columns and the merged
# phenotype columns.
frame_shape = merged_data.shape
print("Shape:", frame_shape)

# Column labels (sample ID column, genes, phenotype fields).
column_labels = merged_data.columns
print("Columns:", column_labels)

# First few rows.
print(merged_data.head())

# Distinct values of the disease label that will serve as the class target.
print("Unique Cancer Types:")
cancer_types = merged_data['primary_disease'].unique()
print(cancer_types)

# preprocess this for your classification task

**Explore and Filter the Data**

In [None]:
# Restrict the cohort to the three cancer types being classified
# (BRCA, LUAD, PRAD), matched on their full primary_disease names.
target_cancers = ['breast invasive carcinoma', 'lung adenocarcinoma', 'prostate adenocarcinoma']
is_target_cancer = merged_data['primary_disease'].isin(target_cancers)
filtered_data = merged_data.loc[is_target_cancer]

# Verify the filter: preview, dimensions and per-class sample counts.
print("Filtered Data Head:")
print(filtered_data.head())
print("Filtered Shape:", filtered_data.shape)
print("Filtered Cancer Types:")
class_counts = filtered_data['primary_disease'].value_counts()
print(class_counts)


**Check for Missing Values and Data Cleaning:**

In [None]:

# ---- Missing-value report over ALL columns (genes and metadata) ----
# The null mask is computed once and reused below.
null_mask = filtered_data.isnull()

print("Missing Values Total:")
print(null_mask.sum().sum())

# Missing values per column, showing only columns that have any.
missing_by_col = null_mask.sum()
print("Columns with Missing Values:")
print(missing_by_col[missing_by_col > 0])

# Rows that contain at least one missing value in any column.
missing_rows = filtered_data[null_mask.any(axis=1)]
print("Rows with Missing Values (Head):")
print(missing_rows[['sample', 'primary_disease']].head())
print("Number of Rows with Missing Values:", len(missing_rows))

# ---- Gene columns only ----
# Everything that came from the phenotype table must be excluded, not just
# 'sample' and 'primary_disease': the feature-extraction cell shows that
# 'sample_type' is also a merged metadata column. 'sample_type_id' is assumed
# to be the remaining phenotype column (TODO confirm against metadata.columns);
# the membership test is harmless if it is absent.
non_gene_cols = {'sample', 'primary_disease', 'sample_type', 'sample_type_id'}
gene_cols = [col for col in filtered_data.columns if col not in non_gene_cols]

gene_null_mask = null_mask[gene_cols]
missing_in_genes = gene_null_mask.sum().sum()
print("Total Missing Values in Gene Columns:", missing_in_genes)

# Note from an earlier run (figures not recomputed here - verify against the
# output above): about 1,443 of ~20,531 genes (~7%) had missing values, spread
# over 33 samples (~1.3% of the ~2,500 filtered samples). Samples missing that
# many genes are unreliable for training, so they are dropped rather than
# imputed.

# Drop every sample that has a missing value in any gene column.
cleaned_data = filtered_data.dropna(subset=gene_cols)

# Invariant: no gene value may be missing after cleaning.
assert not cleaned_data[gene_cols].isnull().any().any(), "gene NaNs remain after cleaning"

# Verify: class balance after vs. before removing incomplete samples.
print("Cleaned Shape:", cleaned_data.shape)
print("Remaining Cancer Type Counts:")
print(cleaned_data['primary_disease'].value_counts())
print("tumor types befor removing missing rows", filtered_data['primary_disease'].value_counts())

# Optional: variance of the genes that were missing in the dropped samples.
# Rows are selected by label with .loc, using the gene-only mask so metadata
# gaps do not count as "dropped" samples.
dropped_samples = filtered_data.loc[gene_null_mask.any(axis=1), gene_cols]
dropped_genes = dropped_samples.columns[dropped_samples.isnull().any()]
print("Variance of First 5 Dropped Genes:")
print(filtered_data[dropped_genes[:5]].var())

In [None]:
# The expression values are taken to be log2-transformed already (an earlier
# run showed values such as 0.00 and 2.09), so no further normalization is
# applied at this stage. Print a small slice to eyeball that assumption.
first_five_genes = gene_cols[:5]
print("Sample Gene Values:")
print(cleaned_data[first_five_genes].head())

# Feature Selection

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel; -q keeps the install log short.
%pip install -q scikit-learn

**Extract Features:**

In [None]:
# Separate features (genes) from labels and sample IDs.
# Build from cleaned_data, NOT filtered_data: the samples with missing gene
# values were removed during cleaning and must not re-enter the feature matrix.
# 'sample_type_id' is assumed to be a numeric phenotype column from the
# metadata file (TODO confirm); errors='ignore' makes the drop a no-op for any
# listed column that is absent.
metadata_cols = ['sample', 'primary_disease', 'sample_type', 'sample_type_id']
X = cleaned_data.drop(columns=metadata_cols, errors='ignore')
y = cleaned_data['primary_disease']

# Confirmation step: despite its name (kept so any later reference still
# resolves), numeric_X holds the NON-numeric columns still present in X.
# It should display zero columns; anything shown here is not an expression
# value and must be removed before computing variances.
numeric_X = X.select_dtypes(exclude=['number'])
numeric_X.head()

In [None]:
# Confirmation: look up sample TCGA-EE-A29N-06 in the cleaned, filtered data.
# The original note labels this sample as melanoma; if that is right, it is
# not one of the three target cancers and the result should be empty.
is_target_sample = cleaned_data['sample'].eq('TCGA-EE-A29N-06')
sample_info = cleaned_data.loc[is_target_sample]

# Last expression in the cell, so the notebook renders it as a table.
sample_info

**Gene Variance**

In [None]:
# Per-column variance across samples (pandas default: sample variance, ddof=1).
gene_variances = X.var(axis=0)

# Show the five columns with the largest variance.
top_five_variances = gene_variances.nlargest(n=5)
print("Top 5 Gene Variances:")
print(top_five_variances)

**Select the top 100 most variable genes**

In [None]:
# Keep the 100 most variable genes. High-variance genes are generally more
# informative for classification because their expression differs more
# across samples, and therefore potentially across classes.
n_top_genes = 100

# Guard against a non-gene column being ranked as a feature: 'sample_type_id'
# is assumed to be a numeric phenotype column (TODO confirm); the drop is a
# no-op when it is not present in gene_variances.
candidate_variances = gene_variances.drop(labels=['sample_type_id'], errors='ignore')
top_genes = candidate_variances.nlargest(n_top_genes).index

# Restrict the features to rows that survived cleaning. pd.concat(axis=1)
# aligns on the index with an outer join, so any row of X that is missing
# from cleaned_data would otherwise appear with NaN 'sample' and
# 'primary_disease' and silently flow into training.
shared_rows = X.index[X.index.isin(cleaned_data.index)]
X_selected = X.loc[shared_rows, top_genes]

# Combine sample IDs and labels with the selected features, row for row.
id_and_label = cleaned_data.loc[shared_rows, ['sample', 'primary_disease']]
preprocessed_data = pd.concat([id_and_label, X_selected], axis=1)

# Invariant: every row carries a label and no rows were added by alignment.
assert preprocessed_data['primary_disease'].notna().all(), "unlabelled rows after concat"
assert len(preprocessed_data) == len(X_selected)

print("Preprocessed Data Head:")
print(preprocessed_data.head())
print("Shape:", preprocessed_data.shape)  # ~2500 rows x 102 columns

**Split the Data**

**Goal:** Create training (80%) and testing (20%) sets.

In [None]:
# @title
from sklearn.model_selection import train_test_split

# Only rows with a known class label can be used for supervised learning.
# This also keeps the stratified split below from failing on NaN labels if
# earlier alignment left any unlabelled rows behind.
labelled_data = preprocessed_data.dropna(subset=['primary_disease'])

# Features and labels
X = labelled_data.drop(columns=['sample', 'primary_disease'])
y = labelled_data['primary_disease']

# 80/20 split. stratify=y keeps the class proportions the same in the
# training and test sets, which matters because the three cancer types have
# different sample counts; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)
print("Training Labels Sample:", y_train.head())