In [1]:
"""
"""
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

# Load the data
file = open('data/answers.json', 'r')
list_answers = json.load(file)
file.close()

In [2]:
import re
import numpy as np
import pandas as pd
from collections import Counter
def count_previous_answers_from_list(list_answer) -> int:
    """

    :param list_answer: dictionary containing the "created" value of an answer and the campaign id
    :return: the number of answers already validated in the campaign
    """
    answer_to_process = list_answer.pop(0)
    count = 0
    for item in list_answer:
        if item['campaign'] != answer_to_process['campaign']:
            break
        count += 1
    return count

def extract_features(answer_doc, get_status: bool=True, training: bool=False):
    """

    :param answer_doc: dict object of an answer
    :param get_status: True if we want to extract the answer status in order to train a model
    :param training: True if we are in training mode
    :return: a processed dict object used to create the dataframe
    """
    res = {}
    job_dict = {
        "UPPER-MANAGEMENT": ["CEO", "COO", "CFO", "CTO", "AVP", "Assistant Vice President", "General Manager",
                             "Country Manager", "Department Head", "Head of Innovation", "Executive Director",
                             "Director", "Director of R&D", "Innovation Management & Business Development",
                             "Technology Manager", "Managing Director", "Technical Director", "Engineering Manager",
                             "Leader of Innovation Committee", "Regulatory Affairs Director"],

        "SALES": ["Key Account Manager", "Innovations Markets Manager", "Tech Transfer Officer",
                  "Global Process Analytics", "CGA", "Business Developer", "Supply Chain"],

        "OPERATIONAL": ["Senior Manager", "Senior Scientist", "Supervisor", "Project Manager", "R&D Manager",
                        "R&D", "Research Engineer", "Engineer", "Chemist", "Ingénieur d'affaires",
                        "Environmental Health & Safety", "EHS", "Proposal Writer", "Engineering", "Analyst"],

        "ACADEMIC": ["Research", "PhD", "Professor", "Lecturer", "Graduate student", "Dean", "Associate Dean"],

        "ENTREPRENEUR": ["Founder", "Co-founder", "Owner", "Co-owner", "President", "Managing Director"],

        "CONSULTANT": ["Lead Consultant", "Senior Consultant", "Consultant", "Senior Advisor", "Advisor",
                       "Scientific Medical Advisor"]
    }

    res["time_elapsed"] = 0
    if "time_elapsed" in answer_doc:
        res["time_elapsed"] = answer_doc["time_elapsed"]
        

    res["interest"] = 0
    if "answers" in answer_doc:
        answer_count = 0
        index_answer = []
        for (question_type, answer) in answer_doc["answers"].items():
            if is_answered(answer):
                answer_count += 1
                index_answer.append(answer_count)
                if re.search(r"lead|partner", question_type, re.I):
                    if re.search(r"comment", question_type, re.I) and isinstance(answer, str) \
                            and (len("".join(w for w in answer if w.isdigit())) > 6
                                 or re.search(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,63}", answer)): # Has an email?
                        res["interest"] += 2
                    else:
                        res["interest"] += 1

        if not training:  # if we are in training mode we don't want to ditch any answers
            if index_answer == [1, 2, 3, 4]:
                raise Error("Only 4 questions answered - rejecting the answer...")

        res["completion_rate"] = answer_count / len(answer_doc["answers"])
    else:
        res["completion_rate"] = 0

    res["job_title"] = "UNK"
    if "job" in answer_doc and isinstance(answer_doc["job"], str):
        for job_titles in job_dict:
            for title in job_dict[job_titles]:
                if title.lower() in answer_doc["job"].lower():
                    res["job_title"] = job_titles
                    break

    
    if get_status:
        res["label"] = "VALIDATED" if answer_doc["status"] == "VALIDATED_NO_MAIL" else answer_doc["status"]
    res["previous_answers"] = answer_doc["previous_answer"]
    
    return res

def is_answered(question) -> bool:
    if question:
        if question is None or question == "None":
            return False
        elif isinstance(question, dict):
            if all(value is None for value in question.values()):
                return False
            else:
                return True

        elif isinstance(question, list):
            if all(value is None for value in question):
                return False
            else:
                return True
        else:
            return True
    else:
        return False

In [3]:
import time
def format_log(message):
    now = time.time()
    print("{}:\t{}".format(now, message))

In [4]:
# Configure the mlflow experiment
import mlflow
import time

def preprocess_data():
    label_encoder = LabelEncoder()
    data = []
    label = []  # VALIDATED / REJECTED
    res = {"status": "not ok"}
    for index, answer in enumerate(answers):
        previous_answer = count_previous_answers_from_list(answers[index:])
        if "answers" in answer and "time_elapsed" in answer and 0 < answer["time_elapsed"] < 5000:
            answer["previous_answer"] = previous_answer
            label.append(answer["status"])
            data.append(extract_features(answer, training=True))
    data = pd.DataFrame(data)
    data["time_elapsed"] = data["time_elapsed"] / 5000
    label_encoder.fit(data["job_title"])
    job_title = label_encoder.transform(data["job_title"])
    data["job_title"] = job_title
    return data

answers = list_answers
data = preprocess_data() # This prepares the data

In [5]:
# Let's start the training here
import mlflow
from mlflow.models import ModelSignature, infer_signature
from mlflow.types.schema import Schema, ColSpec

# Split the dataset
# First training/test
# Second training/validation
# So we have tree data sets
train, test = train_test_split(data, test_size=0.25, random_state=69)
train_x = train.drop(["label"], axis=1).values
train_y = train[["label"]].values.ravel()
test_x = test.drop(["label"], axis=1).values
test_y = test[["label"]].values.ravel()

label_encoder = LabelEncoder()
label_encoder.fit(train_y)

train_y_labels = label_encoder.transform(train_y)
test_y_labels = label_encoder.transform(test_y)

input_schema = Schema(
    [
        ColSpec("double", "time_elapsed"),
        ColSpec("integer", "interest"),
        ColSpec("double", "completion_rate"),
        ColSpec("integer", "job_title"),
        ColSpec("integer", "previous_answers"),
    ]
)
output_schema = Schema([ColSpec("string")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

In [6]:
# Weights and labels
def compute_weigts(train_y):
    # 0=rejected 1=validated -> shape: [{"0": x}, {"1": y}]
    class_weight = compute_class_weight(class_weight="balanced", classes=np.array(["REJECTED", "VALIDATED"]), y=train_y) 
    class_distrib = Counter(train_y)
    return (class_weight, class_distrib)

def train_model_(params, train_x, train_y, test_x, test_y):
    model = RandomForestClassifier(**params, 
        bootstrap=True, 
        max_features="sqrt", 
        min_samples_leaf=1, 
        min_samples_split=2,
        warm_start=True)
    model.fit(train_x, train_y_labels)
    return model
    
# Train
def train_model(params, train_x, train_y, test_x, test_y):
    with mlflow.start_run(nested=True):
        model = train_model_(params, train_x, train_y, test_x, test_y)
        scores_rf = model.predict_proba(test_x)[:, 1]
        predicted_rf = [*map(lambda x: 1 if x > 0.5 else 0, scores_rf)]
        accuracy = accuracy_score(y_true=test_y_labels, y_pred=predicted_rf)

        # Log parameters and results
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("loss", 1.0 - accuracy)

        return {"loss": 1.0 - accuracy, "status": STATUS_OK, "model": model}
    


In [7]:
def objective(params):
    # MLflow will track the parameters and results for each run
    result = train_model(
        params,
        train_x=train_x,
        train_y=train_y,
        test_x=test_x,
        test_y=test_y,
    )
    return result

In [8]:
import mlflow
from mlflow.models import infer_signature
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
class_weight, class_distrib = compute_weigts(train_y)
#This is for Hyperopt
space = {
    "n_estimators": hp.randint("n_estimators", 50, 600),
    "max_depth": hp.randint("max_depth", 10, 100),
    "class_weight": {0: class_weight[0], 1: class_weight[1]}
}

In [9]:

# Try to configure MLFlow experiment here
mlflow.set_tracking_uri(uri="http://mlflow-server:8080")
mlflow.set_experiment("/mail-answer-prediction")

with mlflow.start_run():
    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("NLP training test", "Answers classification")

    # Conduct the hyperparameter search using Hyperopt
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=4,
        trials=trials,
    )

    # Fetch the details of the best run
    best_run = sorted(trials.results, key=lambda x: x["loss"])[0]

    # Log the best parameters, loss, and model
    mlflow.log_params(best)
    mlflow.log_metric("accuracy", 1.0 - best_run["loss"])
    mlflow.log_metric("loss", best_run["loss"])
    

    # Print out the best parameters and corresponding loss
    print(f"Best parameters: {best}")
    print(f"Best eval accuracy: {1.0 - best_run['loss']}")

    model = train_model_(best, train_x, train_y, test_x, test_y)
    
    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="answers_classification",
        signature=signature,
        input_example=train_x,
        registered_model_name="answers_classification",
    )

2024/04/29 15:25:07 INFO mlflow.tracking.fluent: Experiment with name '/mail-answer-prediction' does not exist. Creating a new experiment.


100%|██████████| 4/4 [01:58<00:00, 29.69s/trial, best loss: 0.24651210504718912]
Best parameters: {'max_depth': 19, 'n_estimators': 149}
Best eval accuracy: 0.7534878949528109


Successfully registered model 'answers_classification'.
2024/04/29 15:27:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: answers_classification, version 1
Created version '1' of model 'answers_classification'.
