CP (Client Prospecting) Data Science

In [9]:
import pandas as pd
from mlm_wm_client_prospecting.data_loader import DataLoader
from mlm_wm_client_prospecting.all_models import RandomForest, XGBoost, ExtraTrees, PermutationImportance
import warnings
warnings.filterwarnings("ignore")

import mlflow
import mlflow.sklearn

# Number of cumulative data slices (training-set sizes) the experiment loop iterates over;
# also the divisor used to derive chunk_size from the number of rows.
N_CHUNKS = 7
# Passed as nb_features when asking a model for its feature importances.
NB_FEATURES = 30

### Create a function for model processing

In [10]:
def process_model(model, features, labels, features_test, labels_test, permutation=False):
    """
    Train a single model and log its results through the MLflow fluent API.

    The model is trained on ``features``/``labels``, asked for its feature
    importances (``nb_features=NB_FEATURES``) and for its metrics on
    ``features_test``/``labels_test``. The metrics, a ``features.json``
    artifact holding the feature importances, and the wrapped estimator
    (``model.model``) are then logged to MLflow.

    :param model: project model wrapper exposing ``train``,
        ``get_feature_importance``, ``get_metrics`` and a ``model`` attribute.
    :param features: features the model is trained on.
    :param labels: labels the model is trained on.
    :param features_test: features the metrics are computed on.
    :param labels_test: labels the metrics are computed on.
    :param permutation: currently unused; kept so existing calls keep working.
        NOTE(review): presumably meant to select permutation importance - confirm.
    :return: None; all results are logged to MLflow.
    """
    import os
    import tempfile

    model.train(features, labels)
    features_imp = model.get_feature_importance(features, nb_features=NB_FEATURES)
    metrics = model.get_metrics(features_test, labels_test)

    # MLflow metric name -> key in the dict returned by model.get_metrics().
    for mlflow_name, metrics_key in (("accuracy", "Accuracy"), ("f1", "F1"), ("precision", "Precision")):
        mlflow.log_metric(mlflow_name, metrics[metrics_key])

    importance_table = pd.DataFrame(features_imp).reset_index()
    # Write the artifact into a throw-away directory so that no features.json
    # is left behind in (or overwritten inside) the working directory.
    # The file name is kept, so the artifact is still stored as "features.json".
    with tempfile.TemporaryDirectory() as tmp_dir:
        artifact_path = os.path.join(tmp_dir, "features.json")
        importance_table.to_json(artifact_path, orient='values')
        mlflow.log_artifact(artifact_path)  # log features

    mlflow.sklearn.log_model(model.model, "model")  # save model

In [11]:
# Read the training data into a DataFrame.
# The path is relative to the working directory - change it to point at your own data.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [12]:
# Target column name and the threshold value handed to DataLoader.label_data().
target = (
    'AP005200_Likely_To_Switch_Investment_Provider_Fin_rank_base_20_AP005200'
)
threshold = 15

# Label the loaded frame, then split it. 0.0 is passed as the first argument
# (presumably the test fraction - TODO confirm); only the first and third
# returned items are kept, as features and labels.
dataLoader = DataLoader(df=df)
dataLoader.label_data(target, threshold)
features, _, labels, _ = dataLoader.split_data(0.0, target)

# Rows per data slice: integer division, so any remainder is not counted.
chunk_size = len(features) // N_CHUNKS

## Experiments
Run the training process for each model on increasing data sizes. For every run, log the metrics, the data size, the model, and the feature importances.

In [13]:
# mlflow.set_experiment() creates the experiment when it does not exist yet and
# makes it the active one. A separate create_experiment() call is unnecessary and
# raises once the experiment already exists, which made this cell fail on re-run.
mlflow.set_experiment("WM - Client Prospecting")

In [14]:
# Train every model on a growing prefix of the data and log one MLflow run
# per (model, data size) combination.
n_rows = features.shape[0]
for i in range(1, N_CHUNKS + 1):
    # The last step uses every row, so the remainder left over by the integer
    # division behind chunk_size is not silently dropped.
    data_size = n_rows if i == N_CHUNKS else i * chunk_size
    X_Features = features.iloc[:data_size]
    Y_Labels = labels.iloc[:data_size]

    # New model instances for every data size.
    rf = RandomForest()
    et = ExtraTrees()
    xgb = XGBoost()

    for model_name, candidate in (('Random Forest', rf), ('Extra Trees', et), ('XGBoost', xgb)):
        with mlflow.start_run():
            mlflow.log_param('model', model_name)
            mlflow.log_param('data_size', data_size)
            # NOTE(review): the same rows are passed as training and test data,
            # so the logged metrics are measured on the training set - confirm
            # this is intended or pass a held-out set here.
            process_model(candidate, X_Features, Y_Labels, X_Features, Y_Labels, permutation=False)