In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
import csv
from pathlib import Path
from datetime import datetime
from pathlib import Path

base_dir = Path.cwd()

def run_experiment(
    infile=base_dir / "command_normalized_classifications.txt",
    min_class_size=15,
    ngram_range=(1,2),
    min_df=2,
    max_iter=1000,
    class_weight="balanced",
    log_file=base_dir / "experiment_results.csv",
    model_dir=base_dir / "models"
):
    """
    Train a LogisticRegression text classifier on command/classification data,
    filter out classes with fewer than `min_class_size` samples,
    and append key metrics (precision, recall, F1, rows, timestamp, experiment ID) to a CSV log.
    Each model is saved with its experiment ID in the filename.
    """

    # ==== Load and clean input file ====
    with open(infile) as f:
        base_text = f.read().splitlines()

    csv_rows = []
    invalid_rows = 0
    invalid_classifications = [
        '', '?', 'unknown', 'n/a', 'not specified', 'not sure', 'none',
        'other', 'misc', 'miscellaneous', 'various', 'varied', 'variety',
        'varied commands', 'various commands'
    ]

    for row in base_text:
        row_list = [item.strip('"') for item in row.split(',')]
        csv_check = len(row_list) == 2 and row_list[-1] not in invalid_classifications
        if csv_check:
            csv_rows.append(row_list)
        else:
            invalid_rows += 1

    df = pd.DataFrame(csv_rows, columns=['classification', 'command'])
    X = df["command"]
    y = df["classification"]

    print(f"Number of invalid rows: {invalid_rows}")
    print(f"Number of valid rows: {len(csv_rows)}")

    # ==== Filter out small classes ====
    counts = y.value_counts()
    mask = y.isin(counts[counts >= min_class_size].index)
    X, y = X[mask], y[mask]
    print(f"Remaining rows after filtering: {len(X)}")

    # ==== Train/Test Split ====
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # ==== Build Model ====
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)),
        ("clf", LogisticRegression(max_iter=max_iter, class_weight=class_weight))
    ])

    # ==== Train ====
    model.fit(X_train, y_train)

    # ==== Evaluate ====
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    weighted_f1 = report["weighted avg"]["f1-score"]
    weighted_precision = report["weighted avg"]["precision"]
    weighted_recall = report["weighted avg"]["recall"]

    print(classification_report(y_test, y_pred))
    print(f"✅ Weighted Precision: {weighted_precision:.3f}, "
          f"Recall: {weighted_recall:.3f}, F1: {weighted_f1:.3f}")

    # ==== Append to log CSV and get experiment ID ====
    log_path = Path(log_file)
    write_header = not log_path.exists()

    if log_path.exists():
        with open(log_path, newline='', encoding="utf-8") as f:
            existing_rows = sum(1 for _ in f) - 1  # exclude header
        exp_id = f"exp_{existing_rows+1:03d}"
    else:
        exp_id = "exp_001"

    with open(log_path, "a", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow([
                "experiment_id", "timestamp", "min_class_size", "ngram_range", "min_df",
                "max_iter", "class_weight", "remaining_rows",
                "weighted_precision", "weighted_recall", "weighted_f1"
            ])
        writer.writerow([
            exp_id,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            min_class_size, ngram_range, min_df, max_iter,
            class_weight, len(X),
            f"{weighted_precision:.3f}",
            f"{weighted_recall:.3f}",
            f"{weighted_f1:.3f}"
        ])

    print(f"📊 Results logged as {exp_id} in {log_file}")

    # ==== Save Model with experiment ID ====
    model_filename = Path(model_dir) / f"command_classifier_{exp_id}.pkl"
    joblib.dump(model, model_filename)
    print(f"💾 Model saved as {model_filename}")

    return exp_id, weighted_precision, weighted_recall, weighted_f1


In [None]:
# example run
#run_experiment(min_class_size=15)
#run_experiment(min_class_size=20, ngram_range=(1,3), min_df=3)

In [None]:
from tqdm import tqdm

# ==== Define iteration ranges here ====
min_class_size_range = range(2, 30)       # 2 → 29
min_df_range = range(1, 5)                # 1 → 4
ngram_range_options = [(1,1), (1,2), (1,3)]

# ==== Compute total runs ====
total_runs = len(min_class_size_range) * len(min_df_range) * len(ngram_range_options)

# ==== Run experiments with progress bar ====
with tqdm(total=total_runs, desc="Experiments") as pbar:
    for min_class_size in min_class_size_range:
        for min_df in min_df_range:
            for ngram_range in ngram_range_options:
                run_experiment(
                    min_class_size=min_class_size,
                    ngram_range=ngram_range,
                    min_df=min_df
                )
                pbar.update(1)


In [8]:
#min_class_size ~20–22, ngram_range=(1,2), min_df=2
run_experiment(    
    min_class_size=21,
    ngram_range=(1,2),
    min_df=2
)

Number of invalid rows: 195
Number of valid rows: 730
Remaining rows after filtering: 596
                         precision    recall  f1-score   support

      Executable/Binary       0.38      0.56      0.45         9
           Launch Agent       0.25      0.09      0.13        11
  System Daemon Startup       0.23      0.29      0.26        17
   System Service Agent       0.68      0.23      0.34        65
System Service Launcher       0.19      0.46      0.27        13
   XPC Service Launcher       0.19      1.00      0.31         5

               accuracy                           0.31       120
              macro avg       0.32      0.44      0.29       120
           weighted avg       0.48      0.31      0.31       120

✅ Weighted Precision: 0.481, Recall: 0.308, F1: 0.311
📊 Results logged as exp_337 in /Users/peter/Documents/PID_Crawler/experiment_results.csv
💾 Model saved as /Users/peter/Documents/PID_Crawler/models/command_classifier_exp_337.pkl


('exp_337', 0.481306521410688, 0.30833333333333335, 0.3113292490555422)