In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier  # <-- NEW MODEL
from sklearn.metrics import accuracy_score, f1_score
from mlflow.tracking import MlflowClient
from sklearn.compose import ColumnTransformer

In [2]:
# Load the dataset
try:
    # This path works INSIDE the Docker container
    df = pd.read_csv('/data/IMDB Dataset.csv')
except FileNotFoundError:
    # This path works for LOCAL execution (from the project root)
    print("ERROR: 'IMDB Dataset.csv' not found.")
    df = pd.read_csv('../../data/IMDB Dataset.csv')

# Map sentiment labels to 0 and 1 (matching our old model)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Rename 'review' column to 'text' to match our original code
df = df.rename(columns={'review': 'text'})

# --- NEW LINE ---
# Subsample the data to prevent OOM errors in CI
# We'll use 10,000 rows, which is plenty for a CI run
if len(df) > 10000:
    print(f"Original size: {len(df)}. Sampling down to 10,000 rows.")
    df = df.sample(n=10000, random_state=42)
# --- END NEW LINE ---

print(f"Using {len(df)} reviews for training.")
print(df.head())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df[['text']], df['sentiment'], test_size=0.2, random_state=42)

ERROR: 'IMDB Dataset.csv' not found.
Original size: 50000. Sampling down to 10,000 rows.
Using 10000 reviews for training.
                                                    text  sentiment
33553  I really liked this Summerslam due to the look...          1
9427   Not many television shows appeal to quite as m...          1
199    The film quickly gets to a major chase scene w...          0
12447  Jane Austen would definitely approve of this o...          1
39489  Expectations were somewhat high for me when I ...          0


In [3]:
print("Starting Phase 1: Model Training and MLflow Integration...")
mlflow.set_experiment("GigaFlow-Sentiment")

with mlflow.start_run() as run:
    # --- 1. Model Training ---
    
    params = {
        "loss": "hinge",
        "penalty": "l2",
        "alpha": 1e-4, # Slightly stronger regularization
        "random_state": 42,
        "max_iter": 100
    }

    # Define the preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,2)), 'text')
        ],
        remainder='passthrough'
    )
    
    # Create the full pipeline with the new model
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', SGDClassifier(**params))
    ])
    
    print("Training SGDClassifier...")
    model_pipeline.fit(X_train, y_train)
    print("Training complete.")

    # --- 2. MLflow Integration (Logging) ---
    
    mlflow.log_params(params)
    mlflow.log_param("model_type", "SGDClassifier_with_TFIDF")

    print("Evaluating model...")
    preds = model_pipeline.predict(X_test)
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='weighted')
    
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)
    
    print("Saving training data as artifact...")
    # We save X_train as a parquet file. It's efficient.
    X_train.to_parquet("training_data.parquet", index=False)

    # Log the parquet file to MLflow in a folder named 'reference_data'
    mlflow.log_artifact("training_data.parquet", "reference_data")
    print("Training data artifact saved.")
    
    # Log the final model artifact
    print("Logging model to MLflow...")
    mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        artifact_path="model",
        input_example=X_train.iloc[:1],
        registered_model_name="giga-flow-sentiment"
    )
    
    run_id = run.info.run_id
    print(f"\n--- MLflow Run Complete ---")
    print(f"Run ID: {run_id}")
    print(f"Logged Metrics: Accuracy={acc:.4f}, F1-Score={f1:.4f}")

Traceback (most recent call last):
  File "c:\Dev\GitHub\giga-flow-mlops\.venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 302, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Dev\GitHub\giga-flow-mlops\.venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 395, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Dev\GitHub\giga-flow-mlops\.venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 1303, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Dev\GitHub\giga-flow-mlops\.venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 1296, in _read_helper
    result = read_yaml(root, file_name)
             ^^^

Starting Phase 1: Model Training and MLflow Integration...
Training SGDClassifier...
Training complete.
Evaluating model...
Saving training data as artifact...
Training data artifact saved.
Logging model to MLflow...


  output_schema = _infer_schema(prediction)



--- MLflow Run Complete ---
Run ID: 441d7aedac7f440fbd6915e58e57988f
Logged Metrics: Accuracy=0.8860, F1-Score=0.8859


Registered model 'giga-flow-sentiment' already exists. Creating a new version of this model...
Created version '5' of model 'giga-flow-sentiment'.


In [4]:
print(f"\n--- Starting Phase 1.3: Local Testing ---")
print(f"Loading model from Run ID: {run_id}")

logged_model_uri = f"runs:/{run_id}/model"
loaded_model = mlflow.pyfunc.load_model(logged_model_uri)

test_data = pd.DataFrame({
    'text': [
        "This is a truly wonderful and amazing product",
        "I am so angry and frustrated with this."
    ]
})

predictions = loaded_model.predict(test_data)

print("\n--- Inference Results ---")
print(f"Input: {test_data['text'].iloc[0]} -> Prediction: {'Positive' if predictions[0] == 1 else 'Negative'}")
print(f"Input: {test_data['text'].iloc[1]} -> Prediction: {'Positive' if predictions[1] == 1 else 'Negative'}")


--- Starting Phase 1.3: Local Testing ---
Loading model from Run ID: 441d7aedac7f440fbd6915e58e57988f

--- Inference Results ---
Input: This is a truly wonderful and amazing product -> Prediction: Positive
Input: I am so angry and frustrated with this. -> Prediction: Negative


In [5]:
print(f"\n--- Starting Phase 1.4: Register Model ---")

client = MlflowClient()
model_name = "giga-flow-sentiment"

# Get the latest version that was just registered
latest_version = client.get_latest_versions(model_name, stages=None)[0]
version_number = latest_version.version

print(f"Registered Model: {model_name}, Version: {version_number}")
print(f"Run ID: {run_id}")
print("Model successfully registered. Promotion will be handled by the CI/CD pipeline.")

# We also need to print the run_id so the GitHub Action can read it
print(f"MLFLOW_RUN_ID={run_id}")


--- Starting Phase 1.4: Register Model ---
Registered Model: giga-flow-sentiment, Version: 5
Run ID: 441d7aedac7f440fbd6915e58e57988f
Model successfully registered. Promotion will be handled by the CI/CD pipeline.
MLFLOW_RUN_ID=441d7aedac7f440fbd6915e58e57988f


  latest_version = client.get_latest_versions(model_name, stages=None)[0]
