# GenAI-Camp: Day 01
## Lesson: ML Service - Validation of User Input

This lesson shows you the basics of input validation using Pydantic.  
When deploying a machine learning model as a service, ensuring the quality and integrity of incoming data is just as important as the model itself. This is where data validation comes into play. One of the most powerful tools for data validation in Python is Pydantic.
Pydantic is a data validation and settings management library that leverages Python's type annotations. It allows you to define data structures with clear, type-safe expectations and automatically validate incoming data. If the data does not match the expected format, Pydantic raises clear and descriptive errors, making it easy to pinpoint issues.

During this lesson you will learn how to ...

- create pydantic models
- use pydantic models for validation


### Set up the environment
Import the necessary libraries, set constants, and define helper functions.

In [None]:
# Import necessary libraries
import pandas as pd
import tokenizers
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import SnowballStemmer
import torch
import torch.nn as nn
import numpy as np
from pydantic import BaseModel, Field
import os

In [None]:
# Detect the runtime environment: the COLAB_RELEASE_TAG environment variable
# is used as the indicator for a Google Colab session.
COLAB = bool(os.getenv("COLAB_RELEASE_TAG"))
print("Running on COLAB environment." if COLAB else "WARNING: Running on LOCAL environment.")

In [None]:
# Define paths of resources
if COLAB:
    # Clone the data repository into colab
    # (the leading "!" is IPython/Jupyter shell syntax, not plain Python)
    !git clone https://github.com/openknowledge/workshop-genai-camp-data.git
    ROOT_PATH = "/content/workshop-genai-camp-data/day-01/"
else:
    # Local run: resources are looked up relative to the parent of the
    # current working directory
    ROOT_PATH = "../"
# NOTE: ROOT_PATH already ends with "/", so the concatenated paths below
# contain a double slash (e.g. "..//data").
DATA_PATH = ROOT_PATH + "/data"
MODEL_PATH = ROOT_PATH + "/models"

# Individual resource files used later in the notebook
IMDB_FILE = DATA_PATH + "/imdb_dataset.csv"
MODEL_FILE = MODEL_PATH + "/sentiment_model.pth"
TOKENIZER_FILE = MODEL_PATH + "/tokenizer.json"

In [None]:
# Data preparation functions

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the dataframe with readable column names.

    "text" becomes "review" and "label" becomes "sentiment"; the input
    dataframe is left untouched.
    """
    column_mapping = {"text": "review", "label": "sentiment"}
    return df.rename(columns=column_mapping)


def remove_punctuation(text: str) -> str:
    """
    Strip every character listed in string.punctuation from the text.
    """
    # A translation table that maps each punctuation character to None
    # deletes those characters in a single pass.
    delete_punctuation = str.maketrans("", "", string.punctuation)
    return text.translate(delete_punctuation)


def remove_html_tags(text: str) -> str:
    """
    Return the plain text of a string with its HTML markup removed.
    """
    # Parse with the built-in "html.parser" backend and keep only the text nodes.
    return BeautifulSoup(text, "html.parser").get_text()


def transform_to_lowercase(text: str) -> str:
    """
    Return the lowercase version of a string.
    """
    lowered = text.lower()
    return lowered


# Fetch the NLTK 'stopwords' corpus that remove_stopwords reads below
# (executed once when this cell runs).
nltk.download('stopwords')

def remove_stopwords(text: str) -> str:
    """
    Drop English stopwords from a whitespace-separated string.

    The remaining words are re-joined with single spaces.
    """
    # A set gives constant-time membership tests for the filter below
    english_stopwords = set(stopwords.words('english'))

    kept_words = (token for token in text.split() if token not in english_stopwords)
    return ' '.join(kept_words)


def stem_text(text: str) -> str:
    """
    Reduce every whitespace-separated word of the text to its stem.

    Uses nltk's English SnowballStemmer and re-joins the stems with
    single spaces.
    """
    snowball = SnowballStemmer("english")
    return ' '.join(snowball.stem(token) for token in text.split())


def preparation_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run all text preprocessing steps on the 'review' column.

    Works on a copy of the dataframe; the input is not modified.
    """
    # Order matters: the steps are applied one after another.
    steps = (
        remove_html_tags,        # strip HTML markup
        remove_punctuation,      # drop punctuation characters
        transform_to_lowercase,  # normalize case
        remove_stopwords,        # drop English stopwords
        stem_text,               # reduce words to their stems
    )

    prepared = df.copy()
    for step in steps:
        prepared['review'] = prepared['review'].apply(step)

    return prepared

In [None]:
def vectorize_text(text: str, tokenizer: tokenizers.Tokenizer):
    """
    Vectorizes the input text using a tokenizer (normalized bag of words).

    Args:
        text (str): The input text to be vectorized.
        tokenizer (tokenizers.Tokenizer): The tokenizer to use for vectorization.
    Returns:
        np.ndarray: A vector with one entry per vocabulary token holding the
            relative frequency of that token in the text. If the text yields
            no tokens, the vector is all zeros.
    """
    # Tokenize the text
    encoding = tokenizer.encode(text)

    # Create a vector of zeros with the size of the vocabulary
    vector = np.zeros(tokenizer.get_vocab_size())

    # Bag Of Words: Count the number of times each token appears in the text
    for token_id in encoding.ids:
        vector[token_id] += 1

    # Normalize the counts to relative frequencies. Skip the division when
    # there are no tokens: dividing by a zero sum would fill the vector with NaN.
    total = vector.sum()
    if total > 0:
        vector = vector / total
    return vector

In [None]:
# Define hyperparameters
BATCH_SIZE = 64
INPUT_DIM = 2000
OUTPUT_DIM = 2
# Misspelled name kept as an alias so existing references keep working
OUTOUT_DIM = OUTPUT_DIM
HIDDEN_DIM = 64
LEARNING_RATE = 0.0003
DROPOUT_RATE = 0.7

# Define the model
class SentimentClassifier(nn.Module):
    """
    Small feed-forward classifier: linear -> ReLU -> dropout -> linear -> softmax.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super().__init__()
        # The attribute names end up in the state-dict keys, so they stay as they are.
        self.linear_1 = nn.Linear(input_dim, hidden_dim)
        self.drop_out_1 = nn.Dropout(p=dropout_rate)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Hidden representation with dropout applied after the activation
        hidden = self.drop_out_1(self.relu(self.linear_1(x)))
        # Softmax over dim=1 turns the outputs into values that sum to 1 per row
        return self.softmax(self.linear_2(hidden))

In [None]:
def predict_sentiment(text: str, model: nn.Module, tokenizer: tokenizers.Tokenizer) -> tuple[int, float]: # tuple[label, probability]
    """
    Predict the sentiment of a given text using the trained model.

    The model is switched to evaluation mode for the forward pass so that
    dropout is disabled and repeated calls give the same result. The mode the
    model was in before the call is restored afterwards.

    Args:
        text (str): The text to be predicted.
        model (nn.Module): The trained model.
        tokenizer (tokenizers.Tokenizer): The tokenizer used for vectorization.
    Returns:
        tuple[int, float]: The predicted label (index of the largest model
            output) and the model output value for that label.
    """

    # Preprocess the text
    processed_text = preparation_pipeline(pd.DataFrame({'review': [text]}))['review'][0]

    # Vectorize the text and add a batch dimension
    vector = vectorize_text(processed_text, tokenizer)
    vector = torch.tensor(vector, dtype=torch.float32).unsqueeze(0)

    # Predict the sentiment. Without eval() the dropout layer stays active and
    # randomly zeroes activations, which makes predictions non-deterministic.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(vector)
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)

    label = torch.argmax(output, dim=1).item()
    probability = output[0][label].item()

    return (label, probability)


def print_prediction(review: str, label: int, probability: float):
    """
    Print a review together with its predicted sentiment.
    Args:
        review (str): The review text.
        label (int): The predicted sentiment; 1 is shown as "positive",
            anything else as "negative".
        probability (float): The probability of the predicted sentiment,
            printed with two decimals.
    """
    if label == 1:
        sentiment = "positive"
    else:
        sentiment = "negative"
    print(f"Review: {review}\nPredicted Sentiment: {sentiment} ({probability:.2f})\n")


In [None]:
def load_checkpoint(model: nn.Module, filename: str):
    """
    Load the model state from a file.

    The checkpoint file must contain a dictionary with the key
    'model_state_dict'; its values are copied into the given model in place.

    Args:
        model (nn.Module): The model to be loaded.
        filename (str): The name of the file to load the checkpoint from.
    """
    # map_location="cpu" makes a checkpoint that was saved from a GPU readable
    # on machines without CUDA. load_state_dict copies the values into the
    # model's existing parameters, so the model's own device is not changed.
    checkpoint = torch.load(filename, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Model state has been restored from {filename}")

### [Pydantic](https://docs.pydantic.dev/latest/)
Pydantic is a data validation and parsing library for Python that leverages Python's type annotations. It provides powerful data validation using Python's standard type hints, making it easier to enforce data types and handle errors gracefully. Pydantic models are simple classes that define attributes and their types, and Pydantic ensures that the data conforms to those types. It's commonly used with FastAPI for request validation but is versatile enough for any data modeling.

In [None]:
# This is a standard python class, which does NOT use the pydantic library
class User:
    """Plain Python class; its type annotations are not checked at runtime."""

    def __init__(self, id: int, name: str, tags: list[str]):
        # The arguments are stored exactly as passed, whatever their type.
        self.id, self.name, self.tags = id, name, tags

In [None]:
# Instantiating with wrong types does not raise any errors. This is bad!
# User.__init__ only assigns its arguments, so the str id, the int name and
# the str tags are stored unchanged and printed as they were passed.
user = User(id="123", name=789, tags="not-a-list")
print(user.id, user.name, user.tags)

In [None]:
# Pydantic solves this problem

# All we need to do is to inherit from BaseModel
class UserModel(BaseModel):
    # Same three fields as the plain User class above, but declared as
    # annotated class attributes of a pydantic BaseModel so that pydantic
    # validates the values on instantiation.
    id: int
    name: str
    tags: list[str]

In [None]:
# Instantiating with wrong types (raises a clear error)
# pydantic's ValidationError is a subclass of ValueError, so the handler below
# catches it; printing the exception shows the validation report.
try:
    user = UserModel(id="123", name=789, tags="not-a-list")
except ValueError as e:
    print(e)

In [None]:
# You can create more detailed validation using pydantic's Field
class UserModelWithValidation(BaseModel):
    # Each Field adds a constraint on top of the type check; the description
    # documents the field.
    id: int = Field(gt=0, description="The ID of the user, must be greater than 0")
    name: str = Field(min_length=3, max_length=50, description="The name of the user")
    # min_length is the pydantic v2 keyword for the minimum number of list
    # items; the former min_items keyword is deprecated in v2.
    tags: list[str] = Field(min_length=1, description="A list of tags associated with the user")

In [None]:
# Every argument violates its Field constraint: id is not greater than 0,
# name is shorter than 3 characters and tags is empty. The resulting
# validation error is caught and printed.
try:
    user = UserModelWithValidation(id=0, name="ab", tags=[])
except ValueError as e:
    print(e)

### Initialization

In [None]:
# Load model
# Build the network from the hyperparameter constants, then restore the
# trained weights from MODEL_FILE.
# NOTE(review): load_checkpoint does not call model.eval(); a freshly
# constructed nn.Module is in training mode (dropout active) unless it is
# switched explicitly.
model = SentimentClassifier(INPUT_DIM, HIDDEN_DIM, OUTOUT_DIM, DROPOUT_RATE)
load_checkpoint(model, filename=MODEL_FILE)

# Load tokenizer
# The tokenizer is read from the JSON file at TOKENIZER_FILE.
tokenizer = tokenizers.Tokenizer.from_file(TOKENIZER_FILE)

In [None]:
# Test the model on a new review
# predict_sentiment returns a (label, probability) tuple, which is unpacked
# and passed on to print_prediction.
review = "This is a great movie! I love it."
label, probability = predict_sentiment(review, model, tokenizer)
print_prediction(review, label, probability)

In [None]:
# Since the colab environment does not support running a web server, we will use the function below to simulate the API call.
def get_sentiment_api(request: dict[str, str]) -> dict:
    """
    Get the sentiment prediction for a given review.

    Uses the module-level `model` and `tokenizer`. The request is not
    validated here: a missing "review" key yields None, and any value is
    passed on to predict_sentiment as-is.

    Args:
        request (dict): A dictionary containing the review text under the
            key "review".
    Returns:
        dict: A dictionary with the keys "sentiment" ("positive" or
            "negative") and "probability".
    """
    # dict.get returns None instead of raising when the key is absent
    review = request.get("review")
    label, probability = predict_sentiment(review, model, tokenizer)
    # Label 1 is reported as positive, every other label as negative
    sentiment = "positive" if label == 1 else "negative"
    return {"sentiment": sentiment, "probability": probability}

In [None]:
# Call the API function
# A well-formed request: the review text is stored under the "review" key.
request = {"review": "This is a great movie! I love it."}
result = get_sentiment_api(request=request)
print(f"API Result: {result}")


In [None]:
# Let's see how the "API" handles an invalid input
# The review is an int instead of a str. get_sentiment_api performs no
# validation, so any error comes from deeper inside the prediction code and
# is likely hard to interpret.
try:
    get_sentiment_api({"review": 123})
except Exception as e:
    print(f"Error: {e}")


### Exercise 01: Input validation
Your task is to validate the request dictionary within the api-call-function using a pydantic model.
1. Create a Pydantic model named *SentimentRequest*.
2. Enforce that the "review" field exists and is a string.
3. Add basic validation: the review text should not be empty.
4. Use the pydantic model within the api-call to validate the incoming request dictionary.

**Hints**:
* Use Pydantic's [BaseModel](https://docs.pydantic.dev/latest/api/base_model/) and [Field](https://docs.pydantic.dev/latest/concepts/fields/)


In [None]:
# TODO: Implement the pydantic model "SentimentRequest"

In [None]:
# TODO: Update the function below to use the pydantic model to validate the request
def get_sentiment_api(request: dict) -> dict:
    """
    Get the sentiment prediction for a given review.

    Exercise template: the request is meant to be validated with the
    SentimentRequest pydantic model before the prediction is made. The
    assignment marked with TODO is intentionally left incomplete.

    Args:
        request (dict): A dictionary containing the review text.
    Returns:
        dict: A dictionary containing the predicted sentiment and probability.
    """

    # TODO: Create an instance of SentimentRequest using the request data
    sentiment_request = 
    # The validated review text is read from the model instance
    review = sentiment_request.review
    
    label, probability = predict_sentiment(review, model, tokenizer)
    # Label 1 is reported as positive, every other label as negative
    sentiment = "positive" if label == 1 else "negative"
    return {"sentiment": sentiment, "probability": probability}

In [None]:
# TODO: Test the API with invalid inputs. The error message should now be clear and informative.
# The review is an int instead of a str; whatever exception the call raises
# is caught and printed.
request = {"review": 123}
try:
    get_sentiment_api(request)
except Exception as e:
    print(f"Error: {e}")