# MLOPS project - Model design using MLFlow with an experiment function

Author : **Nicolas Deronsart**

In [1]:
import mlflow
import subprocess

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline

from spacy.lang.fr.stop_words import STOP_WORDS as french_stopwords

from hyperopt import hp, fmin, tpe

## MLFlow tracking

We set up MLFlow tracking to monitor the model training and evaluation.

In [2]:
# Point the MLflow client at the tracking server expected on localhost:5000.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Turn on scikit-learn autologging, with dataset logging explicitly disabled.
mlflow.sklearn.autolog(log_datasets=False)
# Select the "model_design" experiment; as the last expression of the cell,
# the returned Experiment object is displayed as the cell output.
mlflow.set_experiment("model_design")

<Experiment: artifact_location='mlflow-artifacts:/638688685417919419', creation_time=1699969570525, experiment_id='638688685417919419', last_update_time=1699969570525, lifecycle_stage='active', name='model_design', tags={}>

## Import the data

Before training a first model, we need to get the training and validation dataset.

In [3]:
# Read the training reviews and discard the 'Unnamed: 0' column in a single chain.
df = pd.read_csv('../data/train.csv').drop(columns='Unnamed: 0')
df

Unnamed: 0,film-url,review,polarity
0,http://www.allocine.fr/film/fichefilm-135259/c...,Si vous cherchez du cinéma abrutissant à tous ...,0
1,http://www.allocine.fr/film/fichefilm-172430/c...,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,http://www.allocine.fr/film/fichefilm-15105/cr...,"Et si, dans les 5 premières minutes du film, l...",0
3,http://www.allocine.fr/film/fichefilm-188629/c...,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,http://www.allocine.fr/film/fichefilm-23514/cr...,"Premier film de la saga Kozure Okami, ""Le Sabr...",1
...,...,...,...
159995,http://www.allocine.fr/film/fichefilm-132387/c...,Un rythme bien trop lent et un Ashton Kutcher ...,0
159996,http://www.allocine.fr/film/fichefilm-53313/cr...,Monsieur Duchovny vous êtes aussi piètre acteu...,0
159997,http://www.allocine.fr/film/fichefilm-248258/c...,Complètement différent des films de la série C...,1
159998,http://www.allocine.fr/film/fichefilm-268731/c...,Alors franchement pour le moment c'est le meil...,1


In [4]:
# Read the validation reviews and discard the 'Unnamed: 0' column in a single chain.
df_valid = pd.read_csv('../data/valid.csv').drop(columns='Unnamed: 0')
df_valid

Unnamed: 0,film-url,review,polarity
0,http://www.allocine.fr/film/fichefilm-51895/cr...,Ce film est tout ce qu'il y a de plus sympa. M...,0
1,http://www.allocine.fr/film/fichefilm-272/crit...,"The Wall a été réalisé par Alan Parker (Fame, ...",1
2,http://www.allocine.fr/film/fichefilm-60134/cr...,"Encore un film majeur tres mal distribué, comm...",1
3,http://www.allocine.fr/film/fichefilm-31396/cr...,L'idée est très bonne mais le film manque de r...,0
4,http://www.allocine.fr/film/fichefilm-135195/c...,Un petit nanar rigolo a regarder. A voir une f...,0
...,...,...,...
19995,http://www.allocine.fr/film/fichefilm-39142/cr...,Ce petit film tourné en 18 jours pour la somme...,1
19996,http://www.allocine.fr/film/fichefilm-8171/cri...,"Le roman de Forsyth, d'où est tiré le scénario...",0
19997,http://www.allocine.fr/film/fichefilm-228026/c...,"Qu'on aime ou pas ""Toni Erdmann"" - et au sorti...",1
19998,http://www.allocine.fr/film/fichefilm-219994/c...,"Un film qui a fait un certain buzz médiatique,...",1


Now, we can create a pipeline to train the polarity prediction model and make predictions with it.

## Creation of an experiment function

In [5]:
def build_model(
    dataset,
    pipeline,
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None,
    validation_set = None
):
    """
    Fit a sentiment analysis pipeline inside a new MLFlow run and log tags, parameters and
    (when a validation set is given) evaluation metrics to that run.
    @param: dataset: training data with a 'review' column (inputs) and a 'polarity' column (labels)
    @param: pipeline: scikit-learn pipeline that will be fitted on the training data
    @param: mlflow_run_tags: optional dict of tags stored in the MLFlow run (skipped when None or empty)
    @param: mlflow_run_parameters: optional dict of parameters stored in the MLFlow run (skipped when None or empty)
    @param: mlflow_run_description: optional textual description of the run
    @param: validation_set: optional data with the same 'review' / 'polarity' columns; when provided,
            accuracy, f1, precision, recall and roc_auc are computed on it and logged
    @return: None, everything is recorded in the MLFlow run
    """
    with mlflow.start_run(description=mlflow_run_description):
        pipeline.fit(dataset['review'], dataset['polarity'])

        # Both arguments default to None, and the MLFlow helpers expect a dict,
        # so only forward them when there is something to log.
        if mlflow_run_tags:
            mlflow.set_tags(mlflow_run_tags)

        if mlflow_run_parameters:
            mlflow.log_params(mlflow_run_parameters)

        if validation_set is not None:
            y_true = validation_set['polarity']
            y_pred = pipeline.predict(validation_set['review'])

            # Keep only the second probability column (index 1), used as the score for ROC AUC.
            y_pred_proba = pipeline.predict_proba(validation_set['review'])[:, 1]

            experiment_metrics = {
                "accuracy": accuracy_score(y_true, y_pred),
                "f1": f1_score(y_true, y_pred),
                "precision": precision_score(y_true, y_pred),
                "recall": recall_score(y_true, y_pred),
                "roc_auc": roc_auc_score(y_true, y_pred_proba)
            }
            mlflow.log_metrics(experiment_metrics)

## Experimenting with different models and hyperparameters

To get a good model we can try different models and hyperparameters.

First, we can try with a simple logistic regression model.

In [6]:
# Baseline: TF-IDF features (French stop words removed) feeding an L2-regularised logistic regression.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(french_stopwords))),
    ('model', LogisticRegression(penalty='l2', C=1.0, max_iter=500, random_state=42))
])

build_model(
    dataset=df,
    pipeline=pipeline,
    mlflow_run_tags={
        "vectorizer": "TfidfVectorizer",
        "classifier": "LogisticRegression",
        "mlflow.source.name": "model_design_3.ipynb",
        "mlflow.note.content": "Sentiment analysis on movies reviews",
        # check_output returns raw bytes ending with a newline; decode and strip
        # so the tag holds the bare commit hash instead of a bytes repr.
        "mlflow.source.git.commit": subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip(),
        "dataset": "train.csv",
    },
    # Keep these values in sync with the LogisticRegression arguments above.
    mlflow_run_parameters={
        "penalty": "l2",
        "C": 1.0,
        "max_iter": 500,
        "random_state": 42,
    },
    mlflow_run_description="Logistic regression model with TF-IDF vectorization",
    validation_set=df_valid
)

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(max_iter=...`


We can try other hyperparameters for a logistic regression model.

In [7]:
# Variant: L1 penalty with the liblinear solver and weaker regularisation (C=10).
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(french_stopwords))),
    ('model', LogisticRegression(penalty='l1', solver='liblinear', C=10, max_iter=500, random_state=42))
])

build_model(
    dataset=df,
    pipeline=pipeline,
    mlflow_run_tags={
        "vectorizer": "TfidfVectorizer",
        "classifier": "LogisticRegression",
        "mlflow.source.name": "model_design_3.ipynb",
        "mlflow.note.content": "Sentiment analysis on movies reviews",
        # check_output returns raw bytes ending with a newline; decode and strip
        # so the tag holds the bare commit hash instead of a bytes repr.
        "mlflow.source.git.commit": subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip(),
        "dataset": "train.csv",
    },
    # The logged values must describe the model actually fitted above
    # (penalty='l1', C=10), otherwise the run cannot be compared reliably.
    mlflow_run_parameters={
        "penalty": "l1",
        "solver": "liblinear",
        "C": 10,
        "max_iter": 500,
        "random_state": 42,
    },
    mlflow_run_description="Logistic regression model with TF-IDF vectorization",
    validation_set=df_valid
)

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(C=10, max...`


Let's also try a multinomial naive Bayes model, first with the default hyperparameter alpha = 1.

In [8]:
# Multinomial naive Bayes on the same TF-IDF features, with smoothing alpha = 1.0.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(french_stopwords))),
    ('model', MultinomialNB(alpha=1.0))
])

build_model(
    dataset=df,
    pipeline=pipeline,
    mlflow_run_tags={
        "vectorizer": "TfidfVectorizer",
        "classifier": "MultinomialNB",
        "mlflow.source.name": "model_design_3.ipynb",
        "mlflow.note.content": "Sentiment analysis on movies reviews",
        # check_output returns raw bytes ending with a newline; decode and strip
        # so the tag holds the bare commit hash instead of a bytes repr.
        "mlflow.source.git.commit": subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip(),
        "dataset": "train.csv",
    },
    mlflow_run_parameters={
        "alpha": 1.0,
        "force_alpha": "warn",
    },
    mlflow_run_description="Multinomial Naive Bayes model with TF-IDF vectorization",
    validation_set=df_valid
)



And then with another value for alpha : 0.1.

In [9]:
# Multinomial naive Bayes on the same TF-IDF features, with lighter smoothing (alpha = 0.1).
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(french_stopwords))),
    ('model', MultinomialNB(alpha=0.1))
])

build_model(
    dataset=df,
    pipeline=pipeline,
    mlflow_run_tags={
        "vectorizer": "TfidfVectorizer",
        "classifier": "MultinomialNB",
        "mlflow.source.name": "model_design_3.ipynb",
        "mlflow.note.content": "Sentiment analysis on movies reviews",
        # check_output returns raw bytes ending with a newline; decode and strip
        # so the tag holds the bare commit hash instead of a bytes repr.
        "mlflow.source.git.commit": subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip(),
        "dataset": "train.csv",
    },
    mlflow_run_parameters={
        "alpha": 0.1,
        "force_alpha": "warn",
    },
    mlflow_run_description="Multinomial Naive Bayes model with TF-IDF vectorization",
    validation_set=df_valid
)



By comparing the results of the different models in the MLFlow application, we can see that the best model among the ones we tried is the logistic regression model with the hyperparameter C = 1 and the penalty "l2". Indeed, the accuracy on the validation set is:
* 0.919 for the logistic regression model with the hyperparameter C = 1 and the penalty "l2";
* 0.911 for the logistic regression model with the hyperparameter C = 10 and the penalty "l1";
* 0.897 for the multinomial naive Bayes model with the hyperparameter alpha = 1;
* 0.891 for the multinomial naive Bayes model with the hyperparameter alpha = 0.1.

## Hyperparameters optimization

To be able to find the best hyperparameters for a logistic regression model with our data, we can use the hyperopt library.

In [10]:
def build_optimized_model(
    params,
    dataset=None,
    validation_set = None,
    mlflow_run_tags = None,
    mlflow_run_parameters = None,
    mlflow_run_description = None
):
    """
    Fit a TF-IDF + logistic regression pipeline with the given hyperparameters inside a new
    MLFlow run, log the validation metrics and return the loss to minimise.
    @param: params: dict of keyword arguments forwarded to LogisticRegression
            (max_iter=500 and random_state=42 are always set in addition)
    @param: dataset: training data with 'review' and 'polarity' columns;
            defaults to the notebook-level `df` when None
    @param: validation_set: validation data with 'review' and 'polarity' columns;
            defaults to the notebook-level `df_valid` when None
    @param: mlflow_run_tags: optional dict of tags stored in the MLFlow run (skipped when None or empty)
    @param: mlflow_run_parameters: optional dict of parameters stored in the MLFlow run (skipped when None or empty)
    @param: mlflow_run_description: optional textual description of the run
    @return: the validation loss, computed as 1 - accuracy
    """
    # Resolve the default frames at call time rather than at definition time, so that
    # re-running the data-loading cells is picked up without re-defining this function.
    if dataset is None:
        dataset = df
    if validation_set is None:
        validation_set = df_valid

    with mlflow.start_run(description=mlflow_run_description):
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=list(french_stopwords))),
            ('model', LogisticRegression(**params, max_iter=500, random_state=42))
        ])
        pipeline.fit(dataset['review'], dataset['polarity'])

        # Both arguments default to None, and the MLFlow helpers expect a dict,
        # so only forward them when there is something to log.
        if mlflow_run_tags:
            mlflow.set_tags(mlflow_run_tags)

        if mlflow_run_parameters:
            mlflow.log_params(mlflow_run_parameters)

        y_true = validation_set['polarity']
        y_pred = pipeline.predict(validation_set['review'])

        # Keep only the second probability column (index 1), used as the score for ROC AUC.
        y_pred_proba = pipeline.predict_proba(validation_set['review'])[:, 1]

        experiment_metrics = {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "roc_auc": roc_auc_score(y_true, y_pred_proba)
        }
        mlflow.log_metrics(experiment_metrics)

        # The optimiser minimises its objective, so turn accuracy into a loss.
        return 1 - experiment_metrics['accuracy']

In [11]:
def objective(params):
    """
    Objective function for the hyperopt optimization: trains and evaluates one candidate
    through build_optimized_model and tags the MLFlow run as a hyperopt candidate
    @param: params: dict of hyperparameters to test (also logged as the run parameters)
    @return: the value returned by build_optimized_model, used as the loss to minimise
    """
    # check_output returns raw bytes ending with a newline; decode and strip
    # so the tag holds the bare commit hash instead of a bytes repr.
    git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()

    return build_optimized_model(
        params=params,
        mlflow_run_tags={
            "vectorizer": "TfidfVectorizer",
            "classifier": "LogisticRegression",
            "mlflow.source.name": "model_design_3.ipynb",
            "mlflow.note.content": "Sentiment analysis on movies reviews",
            "mlflow.source.git.commit": git_commit,
            "dataset": "train.csv",
            "hyperopt_candidate": "True"
        },
        mlflow_run_parameters=params,
        mlflow_run_description="Logistic regression model with TF-IDF vectorization",
    )

In [12]:
# Search space: the hyperparameter C is drawn uniformly between 0.1 and 5.0.
space = dict(C=hp.uniform('C', 0.1, 5.0))

In [13]:
# Run 5 evaluations of the objective over the search space with the TPE algorithm;
# the best parameters found are displayed as the cell output.
fmin(objective, space, algo=tpe.suggest, max_evals=5)

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(C=2.98046...`





 20%|██        | 1/5 [00:33<02:14, 33.65s/trial, best loss: 0.08040000000000003]

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(C=4.60235...`





 40%|████      | 2/5 [01:07<01:41, 33.98s/trial, best loss: 0.08040000000000003]

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(C=0.51214...`





 60%|██████    | 3/5 [01:41<01:08, 34.02s/trial, best loss: 0.08040000000000003]

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(C=1.80140...`





 80%|████████  | 4/5 [02:15<00:33, 33.69s/trial, best loss: 0.08040000000000003]

                            'anterieur', 'dessous', 'ouvert', 'elle-même',
                            'etc', 'ouste', 'seules', 'ouverts', 'proche',
                            'vers', 'quoique', 'hé', 'enfin', 'siennes', 'elle',
                            'possible', 'ô', 'ci', 'ah', 'me', 'celle-la', "n'",
                            'd’', 'maint', 'pres', 'soit', ...])), ('model', LogisticRegression(C=0.15856...`





100%|██████████| 5/5 [02:48<00:00, 33.62s/trial, best loss: 0.08040000000000003]


{'C': 2.9804647537341}

By searching for the best hyperparameter combination with the hyperopt library, we can tune the logistic regression model to our data. Over the 5 trials, the best value found is C ≈ 2.98 (with the "l2" penalty), which gives a validation loss of 0.0804, i.e. an accuracy of about 0.920. All of our models can be found in the MLFlow application.