In [1]:
# Title & Notes
# ----------------------
# Scaling the Iris dataset with SMOTE (configurable for huge expansion on Nautilus)
# Instructions:
#   - By default this notebook runs in TEST MODE (small_test_mode = True: multiplier 10, noise-based augmentation) to avoid overloading local machines.
#   - When running on Nautilus: set small_test_mode = False, update output_dir to your PVC mount,
#     ensure required packages are installed in the container, and adjust resources.
# Paths: use /mnt/data for PVC-mounted storage on Nautilus.

In [2]:
# Run configuration — review every value here before launching a full-scale run.
small_test_mode = True          # keep True locally; switch to False on Nautilus when ready to scale
original_multiplier = 100_000   # expansion factor for the full-scale run (100k)
test_multiplier = 10            # expansion factor used while small_test_mode is on
output_dir = "/home/jovyan/cloud_comp/module5/practices/project1"  # change to your PVC mount on Nautilus

# The rest of the notebook reads only `multiplier`; choose it from the active mode.
if small_test_mode:
    multiplier = test_multiplier
else:
    multiplier = original_multiplier

print("Running with multiplier:", multiplier)
print("Output directory:", output_dir)

Running with multiplier: 10
Output directory: /home/jovyan/cloud_comp/module5/practices/project1


In [3]:
# Cell 3 — Install dependencies (run once)
# Note: On Nautilus you may prefer to build a Docker image with these already installed.
import sys, subprocess, pkgutil
def pip_install(pkgs):
    """Install the given packages with pip, using the interpreter running this kernel.

    `pkgs` is an iterable of requirement strings. `subprocess.check_call`
    raises `subprocess.CalledProcessError` if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(pkgs)
    subprocess.check_call(command)

# Work out which core dependencies are missing and install only those.
# A pip distribution name is not always the importable module name
# (scikit-learn is imported as "sklearn"), so each entry maps one to the other.
# Probing for a module literally called "scikit-learn" can never succeed and
# triggered a reinstall on every run.
import importlib.util  # find_spec replaces the deprecated pkgutil.find_loader

required_modules = {
    "pandas": "pandas",
    "numpy": "numpy",
    "scikit-learn": "sklearn",
    "joblib": "joblib",
    "pyarrow": "pyarrow",
}
needed = [
    dist_name
    for dist_name, module_name in required_modules.items()
    if importlib.util.find_spec(module_name) is None
]
# imbalanced-learn is optional and is not installed by this cell.
if needed:
    print("Installing:", needed)
    pip_install(needed)
else:
    print("Core packages present. If you need imbalanced-learn later, install it then.")

Installing: ['scikit-learn']


In [4]:
# Cell 4 — Imports and setup
import os, math, time
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import joblib

# Create the output directory up front (exist_ok=True: no error if it is already
# there) so the later cells that write files under output_dir find it in place.
os.makedirs(output_dir, exist_ok=True)
print("Output directory ready:", output_dir)

Output directory ready: /home/jovyan/cloud_comp/module5/practices/project1


In [5]:
# Cell 5 — Load the built-in Iris dataset and take a first look
iris = load_iris()

# Strip the " (cm)" unit suffix so the column names are short and unit-free.
feature_columns = [name.replace(" (cm)", "").strip() for name in iris.feature_names]
X = pd.DataFrame(iris.data, columns=feature_columns)
y = pd.Series(iris.target, name="species")

print("Original shape:", X.shape)
print(y.value_counts())   # rows per class
display(X.head())

Original shape: (150, 4)
species
0    50
1    50
2    50
Name: count, dtype: int64


Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Cell 6 — Preprocessing: train-test split, then scaling, both BEFORE augmentation
# Split first and fit the scaler on the training rows only. Fitting it on the
# full dataset would let the held-out rows influence the mean/variance used to
# transform the training data (data leakage), which biases the later validation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)   # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)         # test rows reuse the training statistics
# Whole dataset on the training scale; kept so the X_scaled name remains available.
X_scaled = scaler.transform(X)

# Persist the fitted statistics (one row per feature, in column order of X).
scaler_params = pd.DataFrame({
    "mean": scaler.mean_,
    "var": scaler.var_,
    "scale": scaler.scale_
})

scaler_params.to_csv(os.path.join(output_dir, "scaler_params.csv"), index=False)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (120, 4) Test shape: (30, 4)


In [7]:
# Cell 7 — Augmentation helpers (SMOTE or noise fallback)
# Probe for imbalanced-learn; use_smote records whether SMOTE can be used later.
try:
    from imblearn.over_sampling import SMOTE
except Exception as import_problem:
    # Any failure while importing means SMOTE is unusable; report it and fall back.
    print("imbalanced-learn not available:", import_problem)
    use_smote = False
else:
    use_smote = True

def generate_with_noise(X_in, y_in, target_total_per_class, noise_scale=0.01, seed=42):
    """Create synthetic rows by resampling each class and adding Gaussian jitter.

    For every distinct label in ``y_in``, rows of that class are drawn with
    replacement and each feature is perturbed with zero-mean normal noise whose
    standard deviation is ``noise_scale`` times that feature's standard
    deviation over all rows of ``X_in``. Only the additional rows are returned;
    the input rows themselves are not included in the result.

    Parameters
    ----------
    X_in : array-like of shape (n_samples, n_features)
        Feature matrix to sample from.
    y_in : array-like of shape (n_samples,)
        Class label of each row of ``X_in``.
    target_total_per_class : int
        Desired number of rows per class after augmentation. A class that
        already has at least this many rows receives no synthetic rows.
    noise_scale : float, default 0.01
        Multiplier applied to the per-feature standard deviation to get the
        noise standard deviation.
    seed : int or None, default 42
        Seed passed to ``np.random.default_rng``. The default reproduces the
        previously hard-coded seed, so existing callers get identical output.

    Returns
    -------
    (X_new, y_new) : tuple of np.ndarray
        Synthetic features of shape (n_new, n_features) and their labels of
        shape (n_new,). When nothing needs generating, both are empty and the
        label array keeps the dtype of ``y_in``.

    Raises
    ------
    ValueError
        If ``X_in`` is not 2-D or its row count differs from ``len(y_in)``.
    """
    X_arr = np.asarray(X_in)
    y_arr = np.asarray(y_in)
    if X_arr.ndim != 2:
        raise ValueError("X_in must be 2-dimensional (n_samples, n_features)")
    if X_arr.shape[0] != y_arr.shape[0]:
        raise ValueError("X_in and y_in must contain the same number of rows")

    n_features = X_arr.shape[1]
    rng = np.random.default_rng(seed)
    feature_stds = X_arr.std(axis=0)   # spread over ALL rows, not per class

    gen_X = []
    gen_y = []
    # np.unique only yields labels that occur, so every class has at least one row.
    for c in np.unique(y_arr):
        base = X_arr[y_arr == c]
        n = target_total_per_class - len(base)
        if n <= 0:
            continue   # class already meets the target
        # Draw order (choice, then normal) is kept so results match earlier runs.
        sel_idx = rng.choice(len(base), size=n, replace=True)
        noise = rng.normal(0, noise_scale * feature_stds, size=(n, n_features))
        gen_X.append(base[sel_idx] + noise)
        gen_y.append(np.full(n, c))

    if not gen_X:
        # Keep the label dtype so callers can concatenate without an unintended float cast.
        return np.empty((0, n_features)), np.empty((0,), dtype=y_arr.dtype)
    return np.vstack(gen_X), np.hstack(gen_y)

imbalanced-learn not available: No module named 'imblearn'


In [8]:
# Cell 8 — Decide targets and generate synthetic data (safe defaults)
n_train = X_train.shape[0]
print("Train rows:", n_train)
# target_total_per_class = desired number of rows per class after augmentation.
# The class count is read from the labels rather than assumed to be Iris' three.
class_labels, class_counts = np.unique(y_train, return_counts=True)
approx_target_total = math.ceil((multiplier * n_train) / max(1, len(class_labels)))
print("Approx target per-class (rough):", approx_target_total)

if small_test_mode:
    print("SMALL TEST MODE: using noise augmentation for a small expansion.")
    X_synth, y_synth = generate_with_noise(X_train, y_train, approx_target_total, noise_scale=0.02)
elif use_smote:
    print("WARNING: Chunked SMOTE generation code is heavy and must be run on Nautilus with sufficient RAM/CPU.")
    # Naive single-shot SMOTE (may fail at extreme scale); chunking is recommended in production.
    # Request an explicit row count per class. The previous 'not majority' strategy
    # only tops minority classes up to the majority size, so on the balanced Iris
    # split it produced no synthetic rows at all, and repeating its output merely
    # duplicated rows. A target below the current class size is invalid for an
    # over-sampler, hence the max().
    smote_targets = {
        label: max(approx_target_total, int(count))
        for label, count in zip(class_labels, class_counts)
    }
    sm = SMOTE(sampling_strategy=smote_targets, random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)
    # As in the earlier version of this cell, this assumes fit_resample returns the
    # input rows first followed by the synthetic ones — TODO confirm for the
    # installed imbalanced-learn version.
    X_synth = np.asarray(X_res)[n_train:]
    y_synth = np.asarray(y_res)[n_train:]
else:
    print("imbalanced-learn not available — falling back to noise augmentation at requested scale.")
    X_synth, y_synth = generate_with_noise(X_train, y_train, approx_target_total, noise_scale=0.02)

print("Synthetic generated shape:", X_synth.shape, y_synth.shape)

Train rows: 120
Approx target per-class (rough): 400
SMALL TEST MODE: using noise augmentation for a small expansion.
Synthetic generated shape: (1080, 4) (1080,)


In [9]:
# Cell 9 — Combine, shuffle, and save (written as CSV; Parquet is worth considering at full scale)
if X_synth.size > 0:
    X_comb = np.vstack([X_train, X_synth])
    y_comb = np.hstack([y_train, y_synth])
else:
    X_comb = np.asarray(X_train)
    # y_train is a pandas Series that still carries the original row index.
    # Left as a Series, the assignment to df_comb["species"] below would align on
    # that index against the frame's fresh 0..n-1 index and scramble the labels
    # or fill them with NaN. A plain array is assigned positionally.
    y_comb = np.asarray(y_train)

X_comb, y_comb = shuffle(X_comb, y_comb, random_state=42)
df_comb = pd.DataFrame(X_comb, columns=[c.replace(" (cm)","").strip() for c in iris.feature_names])
df_comb["species"] = np.asarray(y_comb).astype(int)

out_csv = os.path.join(output_dir, f"iris_smote_x{multiplier}.csv")
df_comb.to_csv(out_csv, index=False)
print("Saved combined dataset to:", out_csv, "Shape:", df_comb.shape)

Saved combined dataset to: /home/jovyan/cloud_comp/module5/practices/project1/iris_smote_x10.csv Shape: (1200, 5)


In [10]:
# Cell 11 — Save metadata and notes
import json, time

# Number of synthetic rows; 0 when the synthetic array is empty.
generated_rows = int(X_synth.shape[0]) if X_synth.size else 0

# Key order below is the order written to the JSON file.
meta = dict(
    multiplier=multiplier,
    combined_rows=int(df_comb.shape[0]),
    generated_rows=generated_rows,
    timestamp=time.ctime(),
    small_test_mode=bool(small_test_mode),
)

metadata_path = os.path.join(output_dir, "generation_metadata.json")
with open(metadata_path, "w") as metadata_file:
    json.dump(meta, metadata_file, indent=2)
print("Metadata saved to PVC.")

Metadata saved to PVC.


In [11]:
# Cell 10 — Quick validation model (RandomForest) using real held-out test set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train on the combined (train + synthetic) rows; fit() returns the estimator itself.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42).fit(X_comb, y_comb)

# Score on the test split, which contains no synthetic rows.
preds = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, preds)
print("Test accuracy:", test_accuracy)
print(classification_report(y_test, preds))

# Persist the model; the bare call stays last so its return value is displayed.
model_path = os.path.join(output_dir, "rf_model.joblib")
joblib.dump(rf, model_path)

Test accuracy: 0.9
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.82      0.90      0.86        10
           2       0.89      0.80      0.84        10

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30



['/home/jovyan/cloud_comp/module5/practices/project1/rf_model.joblib']

In [12]:
import joblib

# Load the Random Forest model from the configured output directory, i.e. the
# same location the training cell saved it to. A hard-coded absolute path would
# break (or silently load a stale model) once output_dir is changed for Nautilus.
# NOTE: joblib.load unpickles the file — only load model files you trust.
rf = joblib.load(os.path.join(output_dir, "rf_model.joblib"))

# Use it for predictions
predictions = rf.predict(X_test)
print(predictions)

[0 2 1 1 0 1 0 0 2 1 2 2 2 1 0 0 0 1 1 1 0 2 1 1 2 2 1 0 2 0]


In [13]:
# Cell 12 — Generate a sample Kubernetes Job YAML (edit before apply)
# Plain (non-f) string: the manifest contains no placeholders to interpolate.
k8s_yaml = '''
apiVersion: batch/v1
kind: Job
metadata:
  name: iris-smote-job
spec:
  template:
    spec:
      containers:
      - name: iris-smote
        image: your-docker-repo/iris-smote:latest
        command: ["python", "/workspace/iris_smote_script.py"]
        volumeMounts:
        - name: pvc-storage
          mountPath: /mnt/data
        resources:
          limits:
            cpu: "8"
            memory: "32Gi"
      restartPolicy: Never
      volumes:
      - name: pvc-storage
        persistentVolumeClaim:
          claimName: your-pvc-name
  backoffLimit: 3
'''

# Write the manifest next to the other generated artifacts.
yaml_path = os.path.join(output_dir, "iris_smote_job.yaml")
with open(yaml_path, "w") as job_file:
    job_file.write(k8s_yaml)

print("Wrote sample YAML to:", yaml_path)
print("IMPORTANT: edit 'image' and 'claimName' before applying on Nautilus.")

Wrote sample YAML to: /home/jovyan/cloud_comp/module5/practices/project1/iris_smote_job.yaml
IMPORTANT: edit 'image' and 'claimName' before applying on Nautilus.
