In [1]:
# Cell 1: Imports
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Cell 2: Create Dummy Data (Expanded)

# Each sentence is kept next to its label so the two can never drift out of
# alignment when rows are added or removed. 1 = Positive, 0 = Negative.
labeled_examples = [
    ("This is an amazing product!", 1),
    ("I hate this, it's terrible.", 0),
    ("The service was mediocre.", 0),
    ("I'm feeling happy and great.", 1),
    ("This is the worst experience ever.", 0),
    ("It's alright, not bad.", 0),
    ("Absolutely fantastic!", 1),
    ("Complete waste of money.", 0),
    ("I love this new feature!", 1),
    ("This is so frustrating and useless.", 0),
    ("The customer support was very helpful.", 1),
    ("It's an okay product, nothing special.", 0),
    ("I will never buy this again.", 0),
    ("This is the best! Highly recommend.", 1),
    ("Such a disappointment.", 0),
    ("I'm very satisfied with the purchase.", 1),
]

# Column-oriented view of the same rows, in the same order.
data = {
    'text': [sentence for sentence, _ in labeled_examples],
    'sentiment': [label for _, label in labeled_examples],
}
df = pd.DataFrame(data)

# Hold out a quarter of the 16 rows: 12 for training, 4 for testing.
# The features are selected with double brackets so X_train / X_test stay
# DataFrames with a 'text' column rather than becoming a bare Series.
X_train, X_test, y_train, y_test = train_test_split(
    df[['text']],
    df['sentiment'],
    test_size=0.25,
    random_state=42,
)

In [3]:
# Cell 3: Phase 1.1 & 1.2 - Train Model and Log with MLflow
#
# Fits a TF-IDF + LogisticRegression pipeline on X_train / y_train (from
# Cell 2), then records the hyperparameters, the hold-out metrics and the
# fitted pipeline in a single MLflow run. `run_id` is left in the notebook
# namespace for the local-testing cell that follows.

print("Starting Phase 1: Model Training and MLflow Integration...")
mlflow.set_experiment("GigaFlow-Sentiment")

with mlflow.start_run() as run:
    # --- 1. Model Training ---

    # Single source of truth for the classifier's hyperparameters: the same
    # dict is unpacked into LogisticRegression and logged to MLflow, so the
    # logged values cannot diverge from the ones actually used.
    params = {
        "C": 1.0,
        "solver": "liblinear",
        "random_state": 42
    }

    # Apply TfidfVectorizer ONLY to the 'text' column and pass any other
    # columns through unchanged. The column is named with a scalar string
    # (not a list) because TfidfVectorizer expects a 1-D sequence of documents.
    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(), 'text')  # (name, transformer, column_name)
        ],
        remainder='passthrough'
    )

    # Full pipeline: raw DataFrame in, class prediction out.
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', LogisticRegression(**params))
    ])

    # (Optional) Debug prints - you can keep or remove these
    print(f"DEBUG: Shape of X_train: {X_train.shape}")
    print(f"DEBUG: Shape of y_train: {y_train.shape}")

    # Train the model (X_train is the df[['text']] split from Cell 2)
    model_pipeline.fit(X_train, y_train)

    # --- 2. MLflow Integration (Logging) ---

    mlflow.log_params(params)
    mlflow.log_param("model_type", "LogisticRegression_with_TFIDF")

    # Evaluate on the hold-out rows (X_test is the df[['text']] split from Cell 2)
    preds = model_pipeline.predict(X_test)

    acc = accuracy_score(y_test, preds)
    # With only 4 test rows the model can easily predict a single class, which
    # leaves precision undefined for the other one. zero_division=0 scores that
    # case as 0 explicitly (the same value the default would use) instead of
    # relying on a warning.
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log the final model artifact
    mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        artifact_path="model",
        # One-row, one-column DataFrame: a concrete example of the input the
        # pipeline expects, stored alongside the model.
        input_example=X_train.iloc[:1]
    )

    run_id = run.info.run_id
    print("\n--- MLflow Run Complete ---")
    print(f"Run ID: {run_id}")
    print(f"Logged Metrics: Accuracy={acc:.4f}, F1-Score={f1:.4f}")
    print("Model artifact logged to 'model' directory within the run.")

Starting Phase 1: Model Training and MLflow Integration...


  return FileStore(store_uri, store_uri)


DEBUG: Shape of X_train: (12, 1)
DEBUG: Shape of y_train: (12,)





--- MLflow Run Complete ---
Run ID: 8a728fdad39840148eb0412b4b351b21
Logged Metrics: Accuracy=0.7500, F1-Score=0.6429
Model artifact logged to 'model' directory within the run.


In [4]:
# Cell 4: Phase 1.3 - Local Testing
#
# Reloads the model logged in Cell 3 through MLflow's generic pyfunc interface
# and runs it on a couple of unseen sentences, to confirm the logged artifact
# can be loaded and called end to end. Requires `run_id` from Cell 3.
print("\n--- Starting Phase 1.3: Local Testing ---")
print(f"Loading model from Run ID: {run_id}")

# 1. Define the URI to load the model ("model" is the artifact_path used
#    when the pipeline was logged in Cell 3)
logged_model_uri = f"runs:/{run_id}/model"

# 2. Load the model as a 'pyfunc'
loaded_model = mlflow.pyfunc.load_model(logged_model_uri)

# 3. Test with new data (same single 'text' column the pipeline was trained on)
test_data = pd.DataFrame({
    'text': [
        "This is a truly wonderful and amazing product",
        "I am so angry and frustrated with this."
    ]
})

predictions = loaded_model.predict(test_data)

print(f"DEBUG: Predictions array: {predictions}")

# NOTE(review): in the recorded run both sentences come back as 0 (Negative),
# including the clearly positive one. With only 12 training rows the model
# appears to be predicting a single class, so treat this cell as a check of the
# load/predict plumbing rather than of model quality.
print("\n--- Inference Results ---")
# One line per input row, however many test sentences are supplied.
for sample_text, predicted_label in zip(test_data['text'], predictions):
    predicted_sentiment = 'Positive' if predicted_label == 1 else 'Negative'
    print(f"Input: {sample_text} -> Prediction: {predicted_sentiment}")


--- Starting Phase 1.3: Local Testing ---
Loading model from Run ID: 8a728fdad39840148eb0412b4b351b21
DEBUG: Predictions array: [0 0]

--- Inference Results ---
Input: This is a truly wonderful and amazing product -> Prediction: Negative
Input: I am so angry and frustrated with this. -> Prediction: Negative
