# 📘 Principal Component Analysis (PCA)

Apply PCA to reduce the dimensionality of gene expression data and visualize the main sources of variance.

### 🔧 Step 1: Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

### 📂 Step 2: Load the Gene Expression Dataset

In [None]:
# Read the expression matrix from disk; the CSV's first column supplies the row labels
df = pd.read_csv(filepath_or_buffer='your_expression_data.csv', index_col=0)

# Preview the top five rows as a quick sanity check on the parsed table
df.head(5)

### ⚖️ Step 3: Standardize the Data

In [None]:
# Learn the per-column mean and standard deviation of the expression table.
# NOTE(review): StandardScaler works column by column, so this assumes rows are
# samples and columns are genes/features - confirm the orientation of the CSV.
scaler = StandardScaler().fit(df)

# Rescale every column to zero mean and unit variance using the fitted statistics
scaled_data = scaler.transform(df)

### 📉 Step 4: Apply PCA

In [None]:
# Keep only the two leading principal components
pca = PCA(n_components=2)

# Learn the components from the standardized matrix and project the rows onto them
pca_result = pca.fit_transform(scaled_data)

# Wrap the projection in a DataFrame that carries the original row labels
pca_df = pd.DataFrame(
    pca_result,
    index=df.index,
    columns=['PC1', 'PC2'],
)

# Preview the projected coordinates
pca_df.head()

### 📊 Step 5: Visualize the PCA Results

In [None]:
# Open an 8x6 inch figure to draw on
plt.figure(figsize=(8, 6))

# Scatter the rows of pca_df in the PC1/PC2 plane
sns.scatterplot(x='PC1', y='PC2', data=pca_df)

# Apply the title and both axis labels to the current axes in one call
plt.gca().set(
    title='PCA of Gene Expression Data',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)

# Overlay grid lines to make point positions easier to read
plt.grid(True)

# Render the finished figure
plt.show()

### 🏷️ Optional: Load Labels for Coloring

In [None]:
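# Uncomment and use if you have a label file (one label per row, indexed like the expression data)
# labels = pd.read_csv('your_labels.csv', index_col=0).squeeze()
# Attach the labels to the PCA results rather than to df, which must stay numeric for scaling and PCA
# pca_df['Label'] = labels
# To color the scatter plot by label, pass hue='Label' to sns.scatterplot in Step 5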

### 💾 Optional: Save Transformed Data

In [None]:
# Save the result of the Principal Component Analysis (PCA) transformation
# pca_df.to_csv('pca_result.csv')

### 🌐 Optional: 3D Visualization

In [None]:
# Import the 3D plotting toolkit from matplotlib
from mpl_toolkits.mplot3d import Axes3D

# Open a 10x7 inch figure that will host the 3D axes
fig = plt.figure(figsize=(10, 7))

# Place a single 3D-projected axes in a 1x1 grid on that figure
ax = fig.add_subplot(1, 1, 1, projection='3d')


# The axes above are empty until something is drawn on them.
# Uncomment and adapt the lines below to plot the first three principal components:
# pca_3d = PCA(n_components=3).fit_transform(scaled_data)
# ax.scatter(pca_3d[:, 0], pca_3d[:, 1], pca_3d[:, 2])  # optionally add c=<numeric labels>, cmap='viridis'
# ax.set_title('3D Visualization')
# plt.show()

### 🎛️ Optional: Interactive Parameters (Requires ipywidgets)

In [None]:
# Import the interact function from ipywidgets for creating interactive widgets
from ipywidgets import interact

def update(n_components=2):
    """Fit a PCA on the standardized data and plot its first two components.

    Used as the callback for the ``ipywidgets.interact`` slider. Only the
    first two components are drawn regardless of ``n_components``, so the
    title also reports the share of variance retained by all fitted
    components; that is the part of the output that responds to the slider.

    Args:
        n_components: Forwarded unchanged to ``PCA(n_components=...)``. The
            fit must yield at least two components, because columns 0 and 1
            of the result are plotted.
    """
    model = PCA(n_components=n_components)
    result = model.fit_transform(scaled_data)

    # Name the columns from the width of the transformed array rather than
    # from ``n_components`` so the labels always match the data, including
    # when PCA is given a non-integer setting (e.g. a variance fraction).
    n_fitted = result.shape[1]
    df_plot = pd.DataFrame(
        result,
        columns=['Component {}'.format(i + 1) for i in range(n_fitted)],
    )

    # Fraction of the total variance captured by all fitted components.
    retained = model.explained_variance_ratio_.sum()

    plt.figure(figsize=(8, 6))

    # Scatter the first two components; higher components are not drawn.
    sns.scatterplot(x=df_plot.iloc[:, 0], y=df_plot.iloc[:, 1])

    plt.title(
        'Principal Component Analysis (PCA) Interactive\n'
        '{} components retain {:.1%} of the variance'.format(n_fitted, retained)
    )
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.grid(True)
    plt.show()

# Create an interactive slider for the number of PCA components.
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the slider's upper bound at that value (never above the original maximum of
# 10) instead of letting larger selections raise a ValueError inside update().
# The lower bound stays at 2 because update() plots two components.
max_components = max(2, min(10, *scaled_data.shape))
interact(update, n_components=(2, max_components))