# Import Packages

In [1]:
from MLTrainer import MLTrainer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate Classification Dataset

In [2]:
# Build a synthetic 3-class dataset: 10,000 samples with 5 features, 2 of
# which are informative, and a single cluster per class.
X, y = make_classification(
    n_samples=10000,
    n_features=5,
    n_informative=2,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=42,
)

# Split into train and test portions; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train Multiple Models Using MLTrainer

Multiple scikit-learn models will be trained, either with or without GridSearchCV. The trained models, their parameters, and their cross-validation scores will be stored.

Explanations of each method's parameters can be found in the source code itself. In this notebook, every parameter is spelled out explicitly when a method is called.

The model parameters used for grid search are defined in the source code; default model parameters are used when grid search is not enabled.

The `.cv_scores` attribute contains a pandas DataFrame with the model names, parameters, mean cross-validation scores for each batch, and remarks. A remark will only have an entry if the model could not be trained.

The `.models` attribute contains a list of trained model objects.

The `.fit` method trains multiple models on the training set and saves their mean cross-validation scores in a pandas DataFrame.

The `.evaluate` method generates test-set scores for each model, then saves the classification reports, confusion matrices, and label probabilities as CSV files. Each model will have its own folder in the current directory.

In [3]:
# Switch on the ensemble, linear, naive Bayes, neighbors, SVM and decision-tree
# model groups, and pass a fixed seed to the trainer.
models = MLTrainer(
    ensemble=True,
    linear=True,
    naive_bayes=True,
    neighbors=True,
    svm=True,
    decision_tree=True,
    seed=100,
)

# Train with n_folds=5 and accuracy scoring. Grid search is disabled, so the
# param_grids mapping is left empty.
# The call is the last expression of the cell, so its return value is displayed.
models.fit(
    X=X_train,
    Y=y_train,
    n_folds=5,
    scoring="accuracy",
    n_jobs=-1,
    gridsearchcv=False,
    param_grids={},
    greater_is_better=True,
)






ValueError: Negative values in data passed to MultinomialNB (input X)

ValueError: Negative values in data passed to MultinomialNB (input X)

ValueError: Negative values in data passed to MultinomialNB (input X)

ValueError: Negative values in data passed to MultinomialNB (input X)

ValueError: Negative values in data passed to MultinomialNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)



<MLTrainer.models.MLTrainer at 0x7f99dbdd0cd0>

In [4]:
# Show the cross-validation summary table: one row per model with its
# parameters, mean CV accuracy, and a remark for models that could not be trained.
models.cv_scores

Unnamed: 0,model,parameters,mean_cv_accuracy,remarks
0,adaboost,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.892667,
1,bagging,"{'base_estimator': None, 'bootstrap': True, 'b...",0.943333,
2,extratrees,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.941333,
3,gradientboosting,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.950267,
4,randomforest,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.946933,
5,xgboost,"{'objective': 'multi:softprob', 'use_label_enc...",0.945467,
6,logreg,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.938533,
7,bernoulli,"{'alpha': 1.0, 'binarize': 0.0, 'class_prior':...",0.9044,
8,gaussian,"{'priors': None, 'var_smoothing': 1e-09}",0.943733,
9,multinomial,,,Negative values in data passed to MultinomialN...


In [5]:
# Show the list of model objects held by the trainer.
models.models

[AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                    n_estimators=50, random_state=100),
 BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                   max_features=1.0, max_samples=1.0, n_estimators=10, n_jobs=-1,
                   oob_score=False, random_state=100, verbose=0,
                   warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=100, verbose=0,
                      warm_start=False),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='frie

In [6]:
# Predict test-set labels with every model; the result is a dict keyed by model name.
preds = models.predict(X_test)
# Models that could not be trained (multinomial, complement) map to the raised
# exception object instead of an array of labels.
preds

{'adaboost': array([2, 0, 0, ..., 1, 1, 1]),
 'bagging': array([2, 0, 0, ..., 1, 1, 1]),
 'extratrees': array([2, 0, 0, ..., 1, 1, 1]),
 'gradientboosting': array([2, 0, 0, ..., 1, 1, 1]),
 'randomforest': array([2, 0, 0, ..., 1, 1, 1]),
 'xgboost': array([2, 0, 0, ..., 1, 1, 1]),
 'logreg': array([2, 0, 0, ..., 1, 1, 1]),
 'bernoulli': array([2, 0, 0, ..., 1, 1, 1]),
 'gaussian': array([2, 0, 0, ..., 1, 1, 1]),
 'multinomial': AttributeError("'MultinomialNB' object has no attribute 'feature_log_prob_'"),
 'complement': AttributeError("'ComplementNB' object has no attribute 'feature_log_prob_'"),
 'knn': array([2, 0, 0, ..., 1, 1, 1]),
 'nu': array([2, 0, 0, ..., 1, 1, 1]),
 'svc': array([2, 0, 0, ..., 1, 1, 1]),
 'decision': array([2, 0, 0, ..., 1, 1, 1]),
 'extra': array([2, 0, 0, ..., 1, 1, 1])}

In [7]:
# Predict per-class probabilities on the test set; the result is a dict keyed
# by model name, each value holding one row of class probabilities per sample.
pred_probas = models.predict_proba(X_test)
pred_probas

{'adaboost': array([[0.34227098, 0.28903583, 0.36869319],
        [0.35026398, 0.32208636, 0.32764966],
        [0.35432213, 0.32600699, 0.31967088],
        ...,
        [0.31837592, 0.37471306, 0.30691102],
        [0.31837592, 0.37471306, 0.30691102],
        [0.31893436, 0.35985944, 0.3212062 ]]),
 'bagging': array([[0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]]),
 'extratrees': array([[0.  , 0.  , 1.  ],
        [0.97, 0.  , 0.03],
        [1.  , 0.  , 0.  ],
        ...,
        [0.  , 1.  , 0.  ],
        [0.  , 1.  , 0.  ],
        [0.  , 1.  , 0.  ]]),
 'gradientboosting': array([[0.00376862, 0.00122177, 0.99500961],
        [0.9857674 , 0.00364433, 0.01058827],
        [0.98765278, 0.00348545, 0.00886177],
        ...,
        [0.0015049 , 0.99731264, 0.00118245],
        [0.0016525 , 0.99720732, 0.00114018],
        [0.00281662, 0.99520107, 0.00198231]]),
 'randomforest': array([[0., 0.

In [8]:
# Evaluate every model on the test split. The three file names are the CSV
# outputs for the classification report, the confusion matrix and the
# predicted label probabilities.
models.evaluate(
    test_X=X_test,
    test_Y=y_test,
    idx_label_dic=None,
    class_report="classf_report.csv",
    con_mat="confusion_matrix.csv",
    pred_proba="predictions_proba.csv",
)