# Breast Cancer Gene Expression Data Exploration

This notebook explores the processed breast cancer gene expression dataset used to predict response to neoadjuvant chemotherapy (NAC).

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Configure seaborn's plotting theme with the 'whitegrid' style; the setting
# is global, so it applies to the figures created in the cells that follow.
sns.set(style='whitegrid')

In [None]:
# Read the processed expression table and the label table from disk.
# index_col=0 makes the first CSV column the row index of each frame.
expression_csv = '../processed/cleaned_expression.csv'
labels_csv = '../processed/labels.csv'
df = pd.read_csv(expression_csv, index_col=0)
labels = pd.read_csv(labels_csv, index_col=0)

# NOTE(review): the later cells read the 'Response' column from `df`, and
# `labels` is not used again after this preview -- confirm that the
# expression table really carries 'Response' and that both tables agree.

# Report the dimensions of both tables first, then preview their first rows.
for caption, table in (('Data shape:', df), ('Labels shape:', labels)):
    print(caption, table.shape)
for table in (df, labels):
    print(table.head())

In [None]:
# Basic statistics
# Count how many rows fall into each value of the 'Response' column.
response_counts = df['Response'].value_counts()
print('Response distribution:')
print(response_counts)

# Summarise every remaining column (all columns except 'Response').
expression_summary = df.drop(columns='Response').describe()
print('\nGene expression summary:')
print(expression_summary)

In [None]:
# Visualize response distribution
# Bar chart with one bar per distinct value of the 'Response' column,
# drawn on an explicitly created 6x4 inch axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Response', data=df, ax=ax)
ax.set_title('Distribution of NAC Response')
ax.set_xlabel('Response (0 = No Response, 1 = Complete Response)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# PCA for dimensionality reduction and visualization
# Split the table into the feature matrix (every column except 'Response')
# and the response vector that is used to colour the points.
X = df.drop('Response', axis=1)
y = df['Response']

# Seed the estimator so the projection is reproducible across
# "Restart & Run All": with the default svd_solver='auto', scikit-learn may
# select the randomized SVD solver for large inputs, and that solver gives
# run-to-run differences unless random_state is fixed.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Scatter the samples on the first two principal components, coloured by
# their 'Response' value.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis')
plt.title('PCA of Gene Expression Data')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(title='Response')
plt.show()

# Fraction of the total variance captured by each of the two components.
print('Explained variance ratio:', pca.explained_variance_ratio_)