# Upload the files to Colab

In [None]:
## 1. Setup and Data Loading 📂

# Mount Google Drive so the RNA-seq and metadata files stored there can be
# read by later cells. Nothing is loaded here; this cell only mounts the
# drive and lists the data folder. Requires the `google.colab` package,
# i.e. it is meant to be run inside Google Colab.


from google.colab import drive
# force_remount=True requests a fresh mount even if /content/drive is
# already mounted.
drive.mount('/content/drive', force_remount=True)

# List the TCGA_Data folder to verify that the expected input files are there
!ls "/content/drive/My Drive/Colab Notebooks/TCGA_Data"

In [None]:
# List the TCGA_Data folder on the mounted Drive (same command as at the end
# of the setup cell); it only succeeds once the Drive mount has completed.
!ls "/content/drive/My Drive/Colab Notebooks/TCGA_Data"

In [None]:
import pandas as pd

# Locations of the two tab-separated input tables on the mounted Drive
rnaseq_path = '/content/drive/My Drive/Colab Notebooks/TCGA_Data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'
metadata_path = '/content/drive/My Drive/Colab Notebooks/TCGA_Data/TCGA_phenotype_denseDataOnlyDownload.tsv'

# Read the expression matrix and the phenotype table
data = pd.read_csv(rnaseq_path, sep='\t')
metadata = pd.read_csv(metadata_path, sep='\t')

# Flip the expression matrix: the values of its 'sample' column become the
# column labels and the former column labels become one row each, exposed
# again as a regular 'sample' column so it can serve as the join key.
data = (
    data.set_index('sample')
    .T
    .reset_index()
    .rename(columns={'index': 'sample'})
)

# Inner join keeps only the samples present in both tables; the phenotype
# label '_primary_disease' is renamed to 'primary_disease' for consistency.
merged_data = (
    data.merge(metadata, on='sample', how='inner')
    .rename(columns={'_primary_disease': 'primary_disease'})
)

# Preview (author's note: roughly 11,060 rows x 20,533 columns were expected)
print("Merged Data Head:", merged_data.head(), sep="\n")
print("Shape:", merged_data.shape)

In [None]:
# Structural summary of the merged table: its dimensions and the first ten
# column labels.
print("Shape:", merged_data.shape)
print("Columns (First 10):", merged_data.columns[:10])

# Distinct values of the disease label, printed on the line after the heading
print("Unique Cancer Types:", merged_data['primary_disease'].unique(), sep="\n")

# Preprocess the data for the classification task

**Filtering to Target Cancers**

In [None]:
# The three diagnoses the classifier is meant to tell apart
target_cancers = ['breast invasive carcinoma', 'lung adenocarcinoma', 'prostate adenocarcinoma']

# Keep only the rows whose disease label is one of the targets
filtered_data = merged_data.loc[merged_data['primary_disease'].isin(target_cancers)]

# Sanity checks: a preview, the resulting size (author expected ~2,500 rows)
# and the number of samples per cancer type.
print("Filtered Data Head:", filtered_data.head(), sep="\n")
print("Filtered Shape:", filtered_data.shape)
print("Cancer Type Counts:", filtered_data['primary_disease'].value_counts(), sep="\n")

**Check for Missing Values and Data Cleaning:**

In [None]:
# Count missing values per column once and reuse the result; every
# isnull() pass scans the whole (very wide) table.
null_counts = filtered_data.isnull().sum()
print("Total Missing Values:", null_counts.sum())
print("Columns with Missing Values (Sample):")
print(null_counts[null_counts > 0].head())

# Rows with at least one missing value in any column
missing_rows = filtered_data[filtered_data.isnull().any(axis=1)]
print("Rows with Missing Values (Head):")
print(missing_rows[['sample', 'primary_disease']].head())
print("Number of Rows with Missing Values:", len(missing_rows))

# Gene columns are everything except the identifier and phenotype columns.
# 'sample_type' comes from the phenotype table (the feature-extraction cell
# drops it by name) and must not be treated as a gene.
# NOTE(review): 'sample_type_id' is assumed to be the remaining phenotype
# column of the Xena dense phenotype file; names that are absent are simply
# ignored by the filter below. Confirm against metadata.columns.
non_gene_cols = ['sample', 'primary_disease', 'sample_type', 'sample_type_id']
gene_cols = [col for col in filtered_data.columns if col not in non_gene_cols]
print("Missing in Gene Columns:", filtered_data[gene_cols].isnull().sum().sum())

# Drop rows that lack any gene value; missing phenotype fields alone no
# longer cause a sample to be discarded.
cleaned_data = filtered_data.dropna(subset=gene_cols)
n_dropped = len(filtered_data) - len(cleaned_data)
# max(..., 1) avoids a division by zero if the filter matched no rows
print("Dropped Rows:", n_dropped, f"({n_dropped / max(len(filtered_data), 1):.1%})")
print("Cleaned Shape:", cleaned_data.shape)
print("Remaining Cancer Type Counts:")
print(cleaned_data['primary_disease'].value_counts())

# Preview the value scale of the first five gene columns
print("Sample Gene Values:")
print(cleaned_data[gene_cols[:5]].head())

In [None]:
# The expression values are taken to be log2-transformed already (the author
# observed values such as 0.00 and 2.09), so no further normalization is
# applied at this point.
# NOTE(review): a handful of small values is consistent with a log scale but
# does not prove it; confirm against the dataset's documentation.
first_five_genes = gene_cols[:5]
print("Sample Gene Values:", cleaned_data[first_five_genes].head(), sep="\n")

# Feature Selection

**Extract Features:**

In [None]:
!pip install scikit-learn

# Separate features and labels
X = cleaned_data.drop(columns=['sample', 'primary_disease','sample_type'])
y = cleaned_data['primary_disease']

# Calculate gene variances
gene_variances = X.var()
print("Top 5 Gene Variances:")
print(gene_variances.nlargest(5))

# Select top 100 genes
top_genes = gene_variances.nlargest(100).index
X_selected = X[top_genes]

# Combine with essential columns
preprocessed_data = pd.concat([cleaned_data[['sample', 'primary_disease']], X_selected], axis=1)
print("Preprocessed Data Head:")
print(preprocessed_data.head())
print("Preprocessed Shape:", preprocessed_data.shape)  # e.g., ~2,467 x 102

**Gene Variance**

**Select 100 top genes**

In [None]:
from sklearn.model_selection import train_test_split

# Features and labels
X = preprocessed_data.drop(columns=['sample', 'primary_disease'])
y = preprocessed_data['primary_disease']

# Hold out 20% of the samples for testing. random_state fixes the shuffle for
# reproducibility; stratify=y keeps the cancer-type proportions the same in
# both subsets, which matters because the classes are not equally sized.
# (Stratification requires at least two samples per class.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)
print("Training Labels Sample:")
print(y_train.head())

# Verify that the split preserved the class balance
print("Training Class Proportions:")
print(y_train.value_counts(normalize=True))
print("Testing Class Proportions:")
print(y_test.value_counts(normalize=True))