In [1]:
from final.cleaning import import_dataset, clean_whole_df, filter_df, clean_test_df, mongo_connect
from final.model_processing import partition, run_model
from final.dashboard import rate
import mlflow

In [2]:
# Connection settings shared by every data-access call in this cell
# (presumably a local MongoDB instance, given the helper name — confirm).
host, port, db_name = 'localhost', 27017, 'Movielens'

# The return value of mongo_connect is discarded here.
mongo_connect(host, port, db_name)

# Load the dataset with the same settings, then run the whole-frame cleaning step.
# `df` stays bound so the raw frame remains available alongside `cleaned_df`.
df = import_dataset(host, port, db_name)
cleaned_df = clean_whole_df(df)

In [3]:
# Run-level constants read by later cells.
# x is passed as "n_components" and y as "max_iter" in the model options,
# and both are interpolated into the MLflow tag and registered model name.
x, y = 10, 800

# Labels interpolated into the MLflow tag / registered model name.
split = 'split random'  # alternative value noted by the author: 'split time'
base_type = 'train base'

# Passed as `input_example` when the model is logged to MLflow.
# Author's note: change this when the data is split on time (the note says it is
# "defined in line 7" — presumably cell 7; confirm).
# NOTE(review): this is the literal string 'df_train', not a DataFrame — confirm intended.
input_data = 'df_train'

In [4]:
%%time

filtering_opts = {
    "min_mean_rating": 1.5,
    "max_mean_rating": 4.5,
    "movies_threshold": 35,
    "movies_few_notes": True,
    "users_threshold": 45,
    "users_few_notes": True,
    "users_no_discriminating": True,
    "users_constant_dt": True,
}

filtered_df = filter_df(cleaned_df, **filtering_opts)

Nombre de ratings par utilisateur :
count    6040.000000
mean      163.412417
std       188.350206
min        16.000000
25%        44.000000
50%        95.000000
75%       204.000000
max      1999.000000
Name: count, dtype: float64


CPU times: user 621 ms, sys: 88.7 ms, total: 709 ms
Wall time: 691 ms


In [5]:
# Split options handed to partition() as a single dict.
# NOTE(review): with test_size=0.8 the recorded output shows 42251 training rows
# versus 169007 test rows — confirm that this train/test ratio is intended.
partition_opts = dict(test_size=0.8, mini_size=0.03)

# partition() yields four frames: the full train/test pair and a "mini" pair.
train_df, test_df, train_mini, test_mini = partition(filtered_df, partition_opts)

# Display the shapes of the full train/test frames.
(train_df.shape, test_df.shape)

((42251, 4), (169007, 4))

In [6]:
# Model options: x and y come from the constants cell; "normalize" carries its
# own sub-options (a flag plus min/max bounds of 1 and 5).
opts = dict(
    n_components=x,
    max_iter=y,
    normalize=dict(should=True, min=1, max=5),
)

# run_model returns the fitted estimator together with a prediction table.
model, predict_matrix = run_model(train_df, opts)

# Display the estimator repr and the shape of the prediction table.
(model, predict_matrix.shape)

(NMF(max_iter=800, n_components=10), (10714678, 3))

In [7]:
# Build the test frame used for evaluation from the train/test pair.
# (Presumably aligns the test set with what was seen in training — confirm in final.cleaning.)
cleaned_test_df = clean_test_df(train_df, test_df)

In [8]:
%%time

options = {
    "mse": True,
    "top_10": True,
    "bottom_10": True,
    "ndcg" : True
}

rating_train = rate(predict_matrix, train_df, options)
rating_test = rate(predict_matrix, cleaned_test_df, options)
rating_train_mse, rating_train_top_10, rating_train_bottom_10, rating_train_ndcg  = rating_train
rating_test_mse, rating_test_top_10, rating_test_bottom_10, rating_test_ndcg  = rating_test
rating_train, rating_test


CPU times: user 2.39 s, sys: 369 ms, total: 2.76 s
Wall time: 1.82 s


([7.367767735434177,
  1.3322378477529808,
  1.6647814124121063,
  0.9699745173750138],
 [7.292930594959254,
  1.2952542684046264,
  1.6948779144483201,
  0.9737218527657426])

In [9]:
# Point the MLflow client at the tracking server on localhost:5000.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Select the experiment that subsequent runs are recorded under; the returned
# Experiment object is the cell's displayed value.
mlflow.set_experiment(experiment_name="reco_movies_analyses")

<Experiment: artifact_location='mlflow-artifacts:/515938476354317348', creation_time=1708096681051, experiment_id='515938476354317348', last_update_time=1708096681051, lifecycle_stage='active', name='reco_movies_analyses', tags={}>

In [10]:
# Record this experiment in MLflow: hyperparameters, train/test indicators,
# a descriptive tag, and the fitted model (which is also registered).
with mlflow.start_run():
    # Hyperparameters: the same x / y values that were passed to run_model.
    params = {
        'n_components': x,
        'max_iter': y,
    }
    mlflow.log_params(params)

    # Log all evaluation indicators in one call. Metric names are unchanged.
    # (The previous per-metric calls ended with stray commas, which turned each
    # statement into a discarded 1-tuple.)
    mlflow.log_metrics(
        {
            "mse_test": rating_test_mse,
            "top_10_test": rating_test_top_10,
            "worse_10_test": rating_test_bottom_10,
            "ndcg_test": rating_test_ndcg,
            "mse_train": rating_train_mse,
            "top_10_train": rating_train_top_10,
            "worse_10_train": rating_train_bottom_10,
            "ndcg_train": rating_train_ndcg,
        }
    )

    # Tag the run so its purpose is visible in the UI.
    # NOTE(review): the tag *key* is the split label plus a trailing space
    # (e.g. "split random "); kept as-is for continuity with existing runs —
    # consider a fixed key such as "split".
    mlflow.set_tag(f"{split} ", f"components {x}, iteration {y} ")

    # Log the fitted model and register it under a descriptive name.
    # NOTE(review): `input_data` is the string 'df_train', so the logged input
    # example is that literal string rather than sample rows — confirm intended.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="NMF_Model",
        signature=None,
        input_example=input_data,
        registered_model_name=f"NMF on {base_type} datas base {split}, {x} components and {y} iteration max ",
    )

Successfully registered model 'NMF on train base datas base split random, 10 components and 800 iteration max '.
2024/02/19 12:09:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NMF on train base datas base split random, 10 components and 800 iteration max , version 1
Created version '1' of model 'NMF on train base datas base split random, 10 components and 800 iteration max '.
