In [None]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier  # <-- NEW MODEL
from sklearn.metrics import accuracy_score, f1_score
from mlflow.tracking import MlflowClient
from sklearn.compose import ColumnTransformer

In [None]:
# Load the dataset.
# Candidate locations are tried in order: the absolute path used when the
# notebook runs inside Docker, then the path relative to the project root
# for local runs.
DATA_PATHS = ('/data/IMDB Dataset.csv', './data/IMDB Dataset.csv')

for data_path in DATA_PATHS:
    try:
        df = pd.read_csv(data_path)
        break
    except FileNotFoundError:
        # Not an error yet -- another candidate location may still exist.
        continue
else:
    # Only report a failure once every candidate has actually been tried.
    raise FileNotFoundError(
        "ERROR: 'IMDB Dataset.csv' not found. "
        "Please download it from Kaggle and place it in the 'data/' directory. "
        f"Tried: {', '.join(DATA_PATHS)}"
    )

# Map sentiment labels to 0 and 1 (matching our old model).
# Series.map() yields NaN for any value missing from the mapping, so check
# for unmapped labels here instead of letting NaN targets reach training.
LABEL_MAP = {'positive': 1, 'negative': 0}
mapped_sentiment = df['sentiment'].map(LABEL_MAP)
unmapped_mask = mapped_sentiment.isna()
if unmapped_mask.any():
    examples = df.loc[unmapped_mask, 'sentiment'].unique()[:5].tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} rows have a sentiment label outside "
        f"{sorted(LABEL_MAP)}; examples: {examples}"
    )
df['sentiment'] = mapped_sentiment

# Rename 'review' column to 'text' to match our original code
df = df.rename(columns={'review': 'text'})

print(f"Loaded {len(df)} reviews.")
print(df.head())

# Split the data (using a 20% test size for a large dataset).
# X is kept as a one-column DataFrame (double brackets) rather than a Series.
X_train, X_test, y_train, y_test = train_test_split(df[['text']], df['sentiment'], test_size=0.2, random_state=42)

In [None]:
print("Starting Phase 1: Model Training and MLflow Integration...")
mlflow.set_experiment("GigaFlow-Sentiment")

with mlflow.start_run() as run:
    # --- 1. Model Training ---

    # Hyperparameters passed to SGDClassifier and logged to MLflow below.
    params = dict(
        loss="hinge",
        penalty="l2",
        alpha=1e-4,  # Slightly stronger regularization
        random_state=42,
        max_iter=100,
    )

    # TF-IDF over the 'text' column; any other columns are passed through.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    preprocessor = ColumnTransformer(
        transformers=[('tfidf', tfidf, 'text')],
        remainder='passthrough',
    )

    # Full pipeline: preprocessing followed by the classifier.
    classifier = SGDClassifier(**params)
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('clf', classifier),
    ])

    print("Training SGDClassifier...")
    model_pipeline.fit(X_train, y_train)
    print("Training complete.")

    # --- 2. MLflow Integration (Logging) ---

    # Hyperparameters plus a descriptive model-type tag, logged in one batch.
    mlflow.log_params({**params, "model_type": "SGDClassifier_with_TFIDF"})

    print("Evaluating model...")
    preds = model_pipeline.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='weighted')
    mlflow.log_metrics({"accuracy": acc, "f1_score": f1})

    # Log the fitted pipeline as an artifact and register it by name.
    print("Logging model to MLflow...")
    mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        artifact_path="model",
        input_example=X_train.iloc[:1],
        registered_model_name="giga-flow-sentiment",
    )

    # run_id is read again by the local-testing cell to load this model.
    run_id = run.info.run_id
    print(f"\n--- MLflow Run Complete ---")
    print(f"Run ID: {run_id}")
    print(f"Logged Metrics: Accuracy={acc:.4f}, F1-Score={f1:.4f}")

In [None]:
print(f"\n--- Starting Phase 1.3: Local Testing ---")
print(f"Loading model from Run ID: {run_id}")

# Load the model logged by the training run through the generic pyfunc API.
logged_model_uri = f"runs:/{run_id}/model"
loaded_model = mlflow.pyfunc.load_model(logged_model_uri)

# Two hand-written sentences used as a quick smoke test; the frame has the
# same single 'text' column the pipeline was trained on.
sample_texts = [
    "This is a truly wonderful and amazing product",
    "I am so angry and frustrated with this.",
]
test_data = pd.DataFrame({'text': sample_texts})

predictions = loaded_model.predict(test_data)

print("\n--- Inference Results ---")
for row_idx, review_text in enumerate(test_data['text']):
    # Label 1 is 'positive' in the training data's mapping; anything else
    # is reported as Negative.
    verdict = 'Positive' if predictions[row_idx] == 1 else 'Negative'
    print(f"Input: {review_text} -> Prediction: {verdict}")

In [None]:
print(f"\n--- Starting Phase 1.4: Register Model and Set Alias ---")

client = MlflowClient()
model_name = "giga-flow-sentiment"
model_alias = "champion"

# get_latest_versions() is deprecated in recent MLflow releases and is
# documented as returning the latest version *per stage*, so taking element
# [0] is not guaranteed to be the newest version overall. Search every
# version of the registered model and pick the highest version number.
registered_versions = client.search_model_versions(f"name='{model_name}'")
if not registered_versions:
    raise RuntimeError(
        f"No versions found for registered model '{model_name}'. "
        "Run the training cell first so the model gets registered."
    )

# Compare numerically: cast with int() so that a version reported as a
# string still orders correctly (e.g. '10' sorts after '9').
latest_version = max(registered_versions, key=lambda mv: int(mv.version))
version_number = latest_version.version

print(f"Registered Model: {model_name}, Version: {version_number}")

# Point the deployment alias at the newest version.
client.set_registered_model_alias(
    name=model_name,
    alias=model_alias,
    version=version_number
)

print(f"Alias '{model_alias}' set for {model_name} version {version_number}.")
print("Model successfully registered and aliased for deployment.")