# Exploratory Data Analysis on GSE2034 Gene Expression Dataset

In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Plot settings: apply seaborn's global theme with the 'whitegrid' axes style,
# so the figures drawn in the cells below share the same look.
sns.set(style='whitegrid')

## Load Dataset

In [None]:
# Read the raw GSE2034 CSV (path is relative to the notebook's working
# directory) into `df`, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../../data/raw/GSE2034.csv')
df.head(n=5)

## Basic Info and Summary

In [None]:
# Print the per-column dtype / non-null overview of `df`.
df.info()
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

## Missing Values

In [None]:
# Count missing entries per column, then list only the columns that have
# at least one, ordered from most to fewest missing values.
missing = df.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)

## Distribution of Expression Values

In [None]:
# 30-bin histograms for the columns at positions 1-3, i.e. the first three
# columns after column 0 (presumably the first three genes, given that
# column 0 is treated as the sample ID elsewhere -- confirm against the CSV).
df.iloc[:, slice(1, 4)].hist(bins=30, figsize=(12, 6))
plt.suptitle('Distribution of Gene Expression')
plt.show()

## Correlation Heatmap

In [None]:
# Compute the pairwise correlation matrix on a subset only (for performance):
# the 20 columns at positions 1-20, skipping column 0.
subset = df.iloc[:, 1:21]
corr = subset.corr()

plt.figure(figsize=(12, 10))
# Correlation coefficients are bounded by [-1, 1]. Pin the colour scale to
# that range and centre it on 0 so the neutral midpoint of the diverging
# 'coolwarm' map means "no correlation" and colours stay comparable between
# runs/subsets instead of being auto-scaled to the data's own min/max.
sns.heatmap(corr, cmap='coolwarm', vmin=-1, vmax=1, center=0, annot=False)
plt.title('Correlation Heatmap (subset of genes)')
plt.show()

## PCA (Principal Component Analysis)

In [None]:
# Standardize every feature column, then project the samples onto the first
# two principal components and plot them.
features = df.columns[1:]  # assuming column 0 is sample ID
x = df.loc[:, features].values
# Zero-mean / unit-variance scaling so no single column dominates the PCA.
x = StandardScaler().fit_transform(x)

# Fix the seed: with the default svd_solver='auto', scikit-learn may select
# the randomized SVD solver for large matrices (see the PCA documentation),
# which would make the projection and the explained-variance labels change
# from run to run.
pca = PCA(n_components=2, random_state=42)
principalComponents = pca.fit_transform(x)
# Keep df's index so each PC row can be matched back to its sample.
pca_df = pd.DataFrame(
    data=principalComponents,
    columns=['PC1', 'PC2'],
    index=df.index,
)

# Plot PCA result; axis labels report the share of variance each PC explains.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', data=pca_df)
plt.title('PCA of Gene Expression')
plt.xlabel(f'PC1 - {pca.explained_variance_ratio_[0]*100:.2f}%')
plt.ylabel(f'PC2 - {pca.explained_variance_ratio_[1]*100:.2f}%')
plt.show()