# MPLID Dataset Exploration

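This notebook provides an interactive exploration of the Membrane Protein-Lipid Interface Dataset (MPLID). It loads the train/validation/test residue tables from `../data/processed/` and summarises class balance, lipid types, amino acid enrichment at contact sites, contact distances, and protein sizes.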

**Author**: Folorunsho Bright Omage  
**License**: MIT

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plotting.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship the legacy 'seaborn-whitegrid' name. Pick whichever
# name this installation provides so the cell does not raise OSError on an
# older Matplotlib; if neither is available the default style is kept.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Notebook-wide defaults; individual figures may still pass their own figsize.
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

## 1. Load Dataset

In [None]:
# Read the three pre-split residue tables (train / validation / test).
split_paths = (
    '../data/processed/train_residues.csv',
    '../data/processed/val_residues.csv',
    '../data/processed/test_residues.csv',
)
train, val, test = [pd.read_csv(csv_path) for csv_path in split_paths]

# Report the row count of each split, then the combined total.
for label, frame in (('Training', train), ('Validation', val), ('Test', test)):
    print(f"{label} samples: {len(frame):,}")
print(f"Total: {sum(len(frame) for frame in (train, val, test)):,}")

## 2. Class Distribution

In [None]:
# Per-split class balance: how many rows are flagged `is_contact`, out of how
# many rows in total, and the resulting fraction.
splits = dict(Train=train, Validation=val, Test=test)

for split_name, split_df in splits.items():
    total = len(split_df)
    positives = split_df['is_contact'].sum()
    fraction = positives / total
    print(f"{split_name}: {positives:,} contacts / {total:,} residues ({fraction:.2%})")

## 3. Lipid Type Distribution

In [None]:
# Keep only the training rows flagged as contacts, then count how often each
# `lipid_type` value occurs and retain the 15 most frequent.
is_contact_row = train['is_contact'] == 1
contacts = train.loc[is_contact_row]
lipid_counts = contacts['lipid_type'].value_counts().iloc[:15]

# Bar chart of the counts; labels are set after the pandas plot call so they
# are not overwritten by any label pandas applies on its own.
fig, ax = plt.subplots(figsize=(12, 6))
lipid_counts.plot.bar(ax=ax, color='steelblue', edgecolor='black')
ax.set(
    xlabel='Lipid Type',
    ylabel='Number of Contacts',
    title='Top 15 Lipid Types by Contact Frequency',
)
# Slant the tick labels so longer lipid codes do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 4. Amino Acid Preferences

In [None]:
# Calculate enrichment of amino acids at lipid contact sites.
#
# enrichment = (share of a residue type among contact rows)
#            / (share of that residue type among all training rows)
#
# A value above 1 means the residue type is over-represented among contacts.
aa_total = train['residue_name'].value_counts()

# Align the contact counts to every residue type present in the training
# split. Without this, a residue type that never forms a contact is absent
# from `aa_contact`, and the index-aligned division below produces NaN for it
# instead of an enrichment of 0.
aa_contact = (
    train.loc[train['is_contact'] == 1, 'residue_name']
    .value_counts()
    .reindex(aa_total.index, fill_value=0)
)

enrichment = (aa_contact / aa_contact.sum()) / (aa_total / aa_total.sum())
enrichment = enrichment.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(14, 6))
# Green bars are enriched (> 1), red bars are depleted (<= 1).
colors = ['darkgreen' if v > 1 else 'darkred' for v in enrichment.values]
enrichment.plot(kind='bar', ax=ax, color=colors, edgecolor='black')
# Reference line at 1: contact share equals overall share.
ax.axhline(y=1, color='black', linestyle='--', linewidth=1)
ax.set_xlabel('Amino Acid')
ax.set_ylabel('Enrichment Ratio')
ax.set_title('Amino Acid Enrichment at Lipid Contact Sites')
plt.tight_layout()
plt.show()

## 5. Distance Distribution

In [None]:
# Histogram of `min_distance` over the training rows flagged as contacts;
# rows with a missing distance are dropped before plotting.
contact_rows = train['is_contact'] == 1
distances = train.loc[contact_rows, 'min_distance'].dropna()

hist_style = dict(bins=50, color='steelblue', edgecolor='black', alpha=0.7)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(distances, **hist_style)
# Dashed vertical marker at 4.0 on the distance axis, labelled as the cutoff.
ax.axvline(x=4.0, color='red', linestyle='--', linewidth=2, label='4.0 Å cutoff')
ax.set(
    xlabel='Minimum Distance to Lipid (Å)',
    ylabel='Number of Residues',
    title='Distance Distribution for Lipid Contact Residues',
)
ax.legend()
plt.tight_layout()
plt.show()

## 6. Protein Size Distribution

In [None]:
# Size of each entry = number of training rows sharing the same `pdb_id`.
protein_sizes = train.groupby('pdb_id').size()

hist_style = dict(bins=50, color='steelblue', edgecolor='black', alpha=0.7)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(protein_sizes, **hist_style)
ax.set(
    xlabel='Number of Residues',
    ylabel='Number of Proteins',
    title='Protein Size Distribution',
)
plt.tight_layout()
plt.show()

# Summary statistics of the per-entry row counts, rounded to whole residues.
size_summary = (
    ('Median', protein_sizes.median()),
    ('Mean', protein_sizes.mean()),
)
for stat_label, stat_value in size_summary:
    print(f"{stat_label} protein size: {stat_value:.0f} residues")