# Train PhaseProfiler Models

This notebook trains the ML models used by PhaseProfiler:
- **Bottleneck Classifier**: Random Forest classifier to identify bottlenecks
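- **Regression Model**: Random Forest regressor that predicts the expected optimization speedup (currently trained on placeholder labels derived from the phase)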

## Steps:
1. Load training data from CSV
2. Prepare features and labels
3. Train classifier model
4. Train regression model
5. Save models to `../models/` directory


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import pickle
import os


In [None]:
# Read the PhaseProfiler training samples from disk (path is relative to the notebook).
data_path = '../data/training_data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Quick sanity check: row count followed by a preview of the first rows.
print(f"Loaded {len(df)} samples")
print(df.head(n=5))


In [None]:
# Resource-usage metrics that serve as model inputs; the list order fixes the
# column order of X.
feature_columns = [
    'cpu_percent',
    'memory_percent', 'memory_used_gb',
    'disk_read_mb', 'disk_write_mb',
    'network_sent_mb', 'network_recv_mb',
]

# Feature matrix (one row per sample) and the 'phase' column used as class labels.
X = df.loc[:, feature_columns].values
y_classification = df.loc[:, 'phase'].values

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y_classification.shape}")
print(f"Unique phases: {np.unique(y_classification)}")


In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_classification,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest; n_jobs=-1 lets scikit-learn use all available cores.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the held-out samples: overall accuracy first, then the per-class report.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Classification Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
# Make sure the output directory exists (no error if it is already there).
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Serialize the fitted classifier with pickle.
classifier_path = os.path.join(models_dir, 'bottleneck_classifier.pkl')
with open(classifier_path, mode='wb') as model_file:
    pickle.dump(classifier, model_file)
print(f"Classifier saved to {classifier_path}")


In [None]:
# Regression target: expected optimization speedup.
# NOTE: these labels are placeholders. No measured speedup data is available
# here, so every sample is assigned a fixed speedup chosen by its phase label.
# Replace this mapping with real measurements when they exist.

phase_to_speedup = {
    'cpu_bound': 1.5,  # CPU optimizations can yield ~50% speedup
    'io_bound': 2.0,   # I/O optimizations can yield ~100% speedup
    'memory_bound': 1.3,  # Memory optimizations can yield ~30% speedup
    'mixed': 1.2,      # Mixed optimizations can yield ~20% speedup
    'idle': 1.0        # No optimization needed
}

# Series.map yields NaN for any phase that is not a key of the dict (and for
# missing phase values). Catch that here with a clear message instead of
# letting the NaN labels reach regressor.fit.
speedup_labels = df['phase'].map(phase_to_speedup)
unmapped_rows = speedup_labels.isna()
if unmapped_rows.any():
    unknown_phases = sorted(df.loc[unmapped_rows, 'phase'].astype(str).unique())
    raise ValueError(
        f"No speedup defined for phase value(s) {unknown_phases}; "
        "add them to phase_to_speedup or drop those rows before training."
    )

y_regression = speedup_labels.values

# Train-Test Split for Regression (same test_size and random_state as the
# classification split)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_regression, test_size=0.2, random_state=42
)

# Train Random Forest Regressor
regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
regressor.fit(X_train_reg, y_train_reg)

# Evaluate on the held-out samples: RMSE is in the same units as the speedup labels
y_pred_reg = regressor.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
print(f"Regression RMSE: {rmse:.3f}")
print(f"Mean predicted speedup: {np.mean(y_pred_reg):.3f}")


In [None]:
# Serialize the fitted regressor with pickle into models_dir.
regressor_path = os.path.join(models_dir, 'regression_model.pkl')
with open(regressor_path, mode='wb') as model_file:
    pickle.dump(regressor, model_file)
print(f"Regressor saved to {regressor_path}")


## Next Steps

1. Collect more training data for better model performance
2. Fine-tune hyperparameters
3. Use the trained models in `app.py` for predictions
4. The trained models are now available in the `../models/` directory
