# Import Packages

In [1]:
from MLTrainer import MLTrainer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate Classification Dataset

In [2]:
# Synthetic 3-class problem: 10,000 samples with 5 features, of which 2 are
# informative, and a single cluster per class. The fixed random_state keeps
# the generated data identical across runs.
X, y = make_classification(
    n_samples=10000,
    n_features=5,
    n_informative=2,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=42,
)

# Hold out a test split using train_test_split's default split size; the seed
# makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Train Multiple Models Using MLTrainer

Multiple scikit-learn models will be trained, either with or without GridSearchCV. The trained models, their parameters and their cross-validation scores will be stored.

Explanations of each method's parameters can be found in the source code itself. In this notebook, every parameter is spelled out explicitly whenever a method is called.

The model parameters used for grid search are defined in the source code; the default model parameters are used when grid search is not enabled.

The `.cv_scores` attribute contains a pandas DataFrame with the model names, parameters, mean cross-validation scores for each batch, and remarks. The `remarks` column only has an entry if the model could not be trained.

The `.models` attribute contains a list of the trained model objects.

The `.fit` method trains multiple models on the training set and saves their mean cross-validation scores in a pandas DataFrame.

The `.evaluate` method generates test-set scores for each model, then saves the classification reports, confusion matrices and label probabilities as CSV files. Each model gets its own folder in the current directory.

In [3]:
# Switch on all six model-family flags passed here; `seed` is handed to the
# trainer (presumably for reproducibility -- confirm in the MLTrainer source).
models = MLTrainer(
    ensemble=True,
    linear=True,
    naive_bayes=True,
    neighbors=True,
    svm=True,
    decision_tree=True,
    seed=100,
)

# 5-fold cross-validation scored on accuracy with grid search enabled.
# param_grids is left empty here; the notebook text states that the grids used
# for grid search live in the MLTrainer source.
# NOTE(review): n_jobs=-1 conventionally means "use all CPU cores" -- confirm
# that MLTrainer forwards it unchanged.
models.fit(
    X=X_train,
    Y=y_train,
    n_folds=5,
    scoring="accuracy",
    n_jobs=-1,
    gridsearchcv=True,
    param_grids={},
    greater_is_better=True,
)


<MLTrainer.models.MLTrainer at 0x7f8b4a218eb8>

In [4]:
# Per-model cross-validation summary (model name, parameters, mean CV score).
# Models that could not be trained have an entry in `remarks` instead of a score.
models.cv_scores

Unnamed: 0,model,parameters,mean_cv_accuracy,remarks
0,adaboost,"{'algorithm': 'SAMME', 'base_estimator': None,...",0.9172,
1,bagging,"{'base_estimator': None, 'bootstrap': True, 'b...",0.943733,
2,extratrees,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.942,
3,gradientboosting,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.9524,
4,randomforest,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.9464,
5,logreg,,,Solver newton-cg supports only 'l2' or 'none' ...
6,bernoulli,"{'alpha': 0.1, 'binarize': 0.0, 'class_prior':...",0.9044,
7,gaussian,"{'priors': None, 'var_smoothing': 1e-10}",0.943733,
8,multinomial,,,Negative values in data passed to MultinomialN...
9,complement,,,Negative values in data passed to ComplementNB...


In [5]:
# List of the model objects held by the trainer after fitting.
models.models

[AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.1,
                    n_estimators=50, random_state=None),
 BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                   max_features=1.0, max_samples=1.0, n_estimators=50,
                   n_jobs=None, oob_score=False, random_state=None, verbose=0,
                   warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False),
 GradientBoostingClassifier(ccp_alpha=0.0, crite

In [6]:
# Predict test-set labels with every model; the result is a dict keyed by
# model name. For models that failed to train, the value is the exception
# object raised at prediction time rather than an array of labels.
preds = models.predict(X_test)
preds

{'adaboost': array([2, 0, 0, ..., 1, 1, 1]),
 'bagging': array([2, 0, 0, ..., 1, 1, 1]),
 'extratrees': array([2, 0, 0, ..., 1, 1, 1]),
 'gradientboosting': array([2, 0, 0, ..., 1, 1, 1]),
 'randomforest': array([2, 0, 0, ..., 1, 1, 1]),
 'logreg': sklearn.exceptions.NotFittedError("This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."),
 'bernoulli': array([2, 0, 0, ..., 1, 1, 1]),
 'gaussian': array([2, 0, 0, ..., 1, 1, 1]),
 'multinomial': AttributeError("'MultinomialNB' object has no attribute 'feature_log_prob_'"),
 'complement': AttributeError("'ComplementNB' object has no attribute 'feature_log_prob_'"),
 'knn': array([2, 0, 0, ..., 1, 1, 1]),
 'nu': array([2, 0, 0, ..., 1, 1, 1]),
 'svc': array([2, 0, 0, ..., 1, 1, 1]),
 'decision': array([2, 0, 0, ..., 1, 1, 1]),
 'extra': array([2, 0, 0, ..., 1, 1, 1])}

In [7]:
# Predict label probabilities on the test set with every model; the result is
# a dict keyed by model name whose values are 2-D arrays with one row per
# sample (columns presumably correspond to the three classes -- confirm).
pred_probas = models.predict_proba(X_test)
pred_probas

{'adaboost': array([[0.34281552, 0.2800355 , 0.37714899],
        [0.37135809, 0.28016314, 0.34847877],
        [0.37135809, 0.28016314, 0.34847877],
        ...,
        [0.31941884, 0.40163403, 0.27894713],
        [0.31941884, 0.40163403, 0.27894713],
        [0.32305279, 0.39777039, 0.27917682]]),
 'bagging': array([[0.  , 0.  , 1.  ],
        [0.98, 0.  , 0.02],
        [1.  , 0.  , 0.  ],
        ...,
        [0.  , 1.  , 0.  ],
        [0.  , 1.  , 0.  ],
        [0.  , 1.  , 0.  ]]),
 'extratrees': array([[0.  , 0.  , 1.  ],
        [0.94, 0.  , 0.06],
        [1.  , 0.  , 0.  ],
        ...,
        [0.  , 1.  , 0.  ],
        [0.  , 1.  , 0.  ],
        [0.  , 1.  , 0.  ]]),
 'gradientboosting': array([[0.00719011, 0.00333348, 0.98947641],
        [0.97954192, 0.00689448, 0.0135636 ],
        [0.97954192, 0.00689448, 0.0135636 ],
        ...,
        [0.00603196, 0.98996448, 0.00400356],
        [0.00603196, 0.98996448, 0.00400356],
        [0.0078834 , 0.98772158, 0.00439503

In [8]:
# Score every model on the held-out split and save the classification report,
# confusion matrix and label probabilities as CSV files under the given names.
models.evaluate(
    test_X=X_test,
    test_Y=y_test,
    idx_label_dic=None,
    class_report="classf_report.csv",
    con_mat="confusion_matrix.csv",
    pred_proba="predictions_proba.csv",
)