In [None]:
import pandas as pd
import mlflow
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from mlflow.tracking import MlflowClient
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import mlflow.pyfunc

In [None]:
# Load the dataset.
# Two locations are tried in order: the absolute path used inside the Docker
# container, then a relative path for local execution (resolved against the
# current working directory).
try:
    # This path works INSIDE the Docker container
    df = pd.read_csv('/data/IMDB Dataset.csv')
except FileNotFoundError:
    # Falling back is the expected path for local runs, so report it as
    # information rather than as an error. If the fallback is missing too,
    # the FileNotFoundError from the second read propagates.
    print("'/data/IMDB Dataset.csv' not found; falling back to '../../data/IMDB Dataset.csv'.")
    df = pd.read_csv('../../data/IMDB Dataset.csv')

# Encode the string sentiment labels as integers (positive -> 1, negative -> 0)
# and expose the review body under the column name 'text'.
# Any label that is not a key of the mapping becomes NaN (pandas `Series.map`).
label_to_id = {'positive': 1, 'negative': 0}
df = (
    df
    .assign(sentiment=df['sentiment'].map(label_to_id))
    .rename(columns={'review': 'text'})
)

# --- SAMPLING LOGIC ---
# TRAINING_MODE is read from the environment and defaults to 'CI' when unset.
# Only the exact value 'CI' triggers down-sampling; any other value uses all rows.
TRAINING_MODE = os.getenv("TRAINING_MODE", "CI")

# Upper bound on the number of rows used in 'CI' mode.
CI_SAMPLE_SIZE = 10000

if TRAINING_MODE != "CI":
    print(f"FULL Mode: Using all {len(df)} rows for training.")
elif len(df) > CI_SAMPLE_SIZE:
    print(f"CI Mode: Sampling down to {CI_SAMPLE_SIZE:,} rows.")
    # Fixed seed so repeated CI runs draw the same subset.
    df = df.sample(n=CI_SAMPLE_SIZE, random_state=42)
else:
    # Still CI mode: the dataset is already small enough, so nothing is dropped.
    print(f"CI Mode: Dataset has only {len(df)} rows; no sampling needed.")
# --- END SAMPLING LOGIC ---

print(f"Using {len(df)} reviews for training.")
print(df.head())

# Hold out 20% of the rows for evaluation. Features stay a one-column DataFrame
# ('text'); the target is the 'sentiment' Series. The seed is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[['text']],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:
import math
import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# 1. Hugging Face checkpoint used for sentiment classification.
HF_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

# Runs started after this call are recorded under this MLflow experiment.
mlflow.set_experiment(experiment_name="GigaFlow-Sentiment")

# 2. Define a custom MLflow PythonModel wrapper
# This is the key to making the new model "work seamlessly" 
# It ensures the model.predict() output is 0 or 1, just like the old model.
class HfSentimentWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a Hugging Face sentiment-analysis pipeline.

    ``predict`` converts the pipeline's label output to integers: 1 when the
    label is ``'POSITIVE'`` and 0 for any other label.
    """

    def load_context(self, context):
        """Build the inference pipeline and store it on ``self.pipeline``.

        Args:
            context: Supplied by the caller; this method does not read it. The
                checkpoint is taken from the module-level ``HF_MODEL_NAME``.
        """
        self.pipeline = pipeline(
            "sentiment-analysis",
            model=HF_MODEL_NAME,
            tokenizer=HF_MODEL_NAME,
            device=0 if torch.cuda.is_available() else -1,  # Use GPU if present
            truncation=True,  # cut inputs that exceed max_length instead of failing
            max_length=512,
        )

    def predict(self, context, model_input, batch_size=32):
        """Classify the ``'text'`` column of ``model_input``.

        Args:
            context: Not read by this method.
            model_input: pandas DataFrame with a ``'text'`` column.
            batch_size: Number of texts handed to the pipeline per call.
                Must be at least 1.

        Returns:
            pandas Series of int64 values (1 for ``'POSITIVE'``, otherwise 0),
            one per input row in input order, with a fresh default index.
            Empty input yields an empty int64 Series.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero or negative batch size would otherwise either divide by zero
        # or silently produce no predictions at all.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        text_list = model_input['text'].tolist()

        results = []
        for start in range(0, len(text_list), batch_size):
            batch = text_list[start:start + batch_size]

            # Truncation is repeated on the call so it applies to every invocation.
            preds_raw = self.pipeline(batch, truncation=True, max_length=512)

            # Convert output to 0 or 1
            results.extend(1 if p['label'] == 'POSITIVE' else 0 for p in preds_raw)

        # Explicit dtype keeps the result integer-typed even when there are no rows.
        return pd.Series(results, dtype="int64")

print("Starting Phase 1: Model Training and MLflow Integration...")

# 3. Start the MLflow Run
with mlflow.start_run() as run:

    print(f"Logging parameters for {HF_MODEL_NAME}")
    mlflow.log_param("model_type", "HuggingFace_Transformer")
    mlflow.log_param("model_name", HF_MODEL_NAME)
    # These two values mirror the settings hard-coded in HfSentimentWrapper.
    mlflow.log_param("max_sequence_length", 512)
    mlflow.log_param("truncation", True)

    # --- 4. Evaluate the new model ---
    # Metrics are computed before the model is logged.
    print("Evaluating model on test set...")

    # Score through the same wrapper class that is logged below, so the
    # reported metrics come from the exact inference code path that is saved
    # (pipeline settings, truncation, label mapping) rather than from a
    # hand-maintained copy of it.
    eval_model = HfSentimentWrapper()
    eval_model.load_context(None)  # load_context does not read its argument

    batch_size = 32
    test_texts = X_test['text'].tolist()

    num_batches = math.ceil(len(test_texts) / batch_size)
    print(f"Evaluating {len(test_texts)} test samples in {num_batches} batches...")

    preds = eval_model.predict(None, X_test, batch_size=batch_size).tolist()

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='weighted')

    print(f"Evaluation complete: Accuracy={acc:.4f}, F1-Score={f1:.4f}")
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # --- 5. Log the custom wrapper as the model ---
    print("Saving training data as artifact...")
    # The parquet file is written to the current working directory and then
    # uploaded under the 'reference_data' artifact folder of this run.
    X_train.to_parquet("training_data.parquet", index=False)
    mlflow.log_artifact("training_data.parquet", "reference_data")
    print("Training data artifact saved.")

    print("Logging model to MLflow...")

    # Define the environment for the model
    conda_env = {
        'channels': ['defaults', 'conda-forge'],
        'dependencies': [
            'python=3.11',  # Match your Dockerfile
            'pip',
            {
                'pip': [
                    'mlflow',
                    'pandas',
                    'torch',
                    'transformers',
                    'accelerate'
                ]
            }
        ]
    }

    # Log the custom PyFunc model. A fresh, unloaded wrapper instance is passed
    # so the serialized object does not carry the in-memory pipeline built for
    # evaluation above; the pipeline is rebuilt in load_context at load time.
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=HfSentimentWrapper(),
        input_example=X_train.iloc[:5],
        registered_model_name="giga-flow-sentiment",
        conda_env=conda_env
    )

    run_id = run.info.run_id
    print("\n--- MLflow Run Complete ---")
    print(f"Run ID: {run_id}")
    print(f"Logged Metrics: Accuracy={acc:.4f}, F1-Score={f1:.4f}")

    # We also need to print the run_id so the GitHub Action can read it
    print(f"MLFLOW_RUN_ID={run_id}")

In [None]:
print("\n--- Starting Phase 1.3: Local Testing ---")
print(f"Loading model from Run ID: {run_id}")

# Reload the model that the run above logged under the 'model' artifact path.
logged_model_uri = f"runs:/{run_id}/model"
loaded_model = mlflow.pyfunc.load_model(logged_model_uri)

# Two hand-written examples, one per expected class.
sample_reviews = [
    "This is a truly wonderful and amazing product",
    "I am so angry and frustrated with this."
]
test_data = pd.DataFrame({'text': sample_reviews})

predictions = loaded_model.predict(test_data)

print("\n--- Inference Results ---")
for row in range(len(sample_reviews)):
    review_text = test_data['text'].iloc[row]
    if predictions[row] == 1:
        verdict = 'Positive'
    else:
        verdict = 'Negative'
    print(f"Input: {review_text} -> Prediction: {verdict}")

In [None]:
print("\n--- Starting Phase 1.4: Register Model ---")

client = MlflowClient()
model_name = "giga-flow-sentiment"

# Look up the version created by THIS run. Taking the first entry of
# get_latest_versions() is not reliable: it returns the newest version per
# stage, so the first element can belong to an earlier run.
# A single-condition filter is used and the model name is checked in Python.
versions_for_run = [
    mv for mv in client.search_model_versions(f"run_id='{run_id}'")
    if mv.name == model_name
]
if not versions_for_run:
    raise RuntimeError(
        f"No version of registered model '{model_name}' was found for run {run_id}."
    )

# Version identifiers are compared numerically, not as strings.
latest_version = max(versions_for_run, key=lambda mv: int(mv.version))
version_number = latest_version.version

print(f"Registered Model: {model_name}, Version: {version_number}")
print(f"Run ID: {run_id}")
print("Model successfully registered. Promotion will be handled by the CI/CD pipeline.")

# We also need to print the run_id so the GitHub Action can read it
print(f"MLFLOW_RUN_ID={run_id}")