In [36]:
from create_lda_datasets import *
import scipy.sparse as sp
from classifiers import *

In [37]:
def evaluate_model(clf, X_train, X_test, y_train, y_test):
    y_true, y_pred = y_train, clf.predict(X_train)

    print("Detailed classification report:\n")
    print("Scores on training set.\n")
    print(classification_report(y_true, y_pred))

    y_true, y_pred = y_test, clf.predict(X_test)
    print("Scores on test set.\n")
    print(classification_report(y_true, y_pred))

In [38]:
u = 166576483

In [39]:
X_train, X_valid, X_test, y_train, y_valid, y_test = load_small_validation_dataframe(u)

In [40]:
X_train_lda, X_valid_lda, X_test_lda, y_train, y_valid, y_test = load_lda_dataset(u)

In [41]:
X_train_combined = sp.hstack((X_train, X_train_lda))
X_valid_combined = sp.hstack((X_valid, X_valid_lda))
X_test_combined = sp.hstack((X_test, X_test_lda))

In [42]:
w1 = sum(y_train)/len(y_train)
w0 = 1 - w1
sample_weights = np.array([w0 if x==0 else w1 for x in y_train])

In [43]:
ds_comb = (X_train_combined, X_valid_combined, y_train, y_valid)

In [44]:
ds_sna = (X_train, X_valid, y_train, y_valid)

In [45]:
clf_sna = model_select_rdf((X_train, X_valid, y_train, y_valid))

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'max_features': 50, 'n_estimators': 100, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.99      0.96      0.98      3238
        1.0       0.67      0.91      0.77       262

avg / total       0.97      0.96      0.96      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.98      0.95      0.97       466
        1.0       0.56      0.79      0.66        34

avg / total       0.96      0.94      0.95       500




In [51]:
clf_sna2 = model_select_sgd((X_train, X_valid, y_train, y_valid))

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'penalty': 'l1', 'alpha': 1e-05, 'n_iter': 80, 'loss': 'hinge'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.98      1.00      0.99      3238
        1.0       1.00      0.77      0.87       262

avg / total       0.98      0.98      0.98      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98       466
        1.0       0.85      0.65      0.73        34

avg / total       0.97      0.97      0.97       500




In [52]:
clf_sna3 = model_select_svc((X_train, X_valid, y_train, y_valid))

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'kernel': 'rbf', 'C': 0.01, 'gamma': 10}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.98      1.00      0.99      3238
        1.0       1.00      0.77      0.87       262

avg / total       0.98      0.98      0.98      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.97      1.00      0.99       466
        1.0       1.00      0.65      0.79        34

avg / total       0.98      0.98      0.97       500




In [46]:
clf_sna_ = RandomForestClassifier()
clf_sna_.fit(X_train, y_train, sample_weight=sample_weights)
evaluate_model(clf_sna_, X_train, X_valid, y_train, y_valid)

Detailed classification report:

Scores on training set.

             precision    recall  f1-score   support

        0.0       0.99      1.00      0.99      3238
        1.0       0.97      0.82      0.89       262

avg / total       0.98      0.98      0.98      3500

Scores on test set.

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98       466
        1.0       0.81      0.65      0.72        34

avg / total       0.96      0.97      0.96       500



In [47]:
clf_comb = model_select_rdf((X_train_combined, X_valid_combined, y_train, y_valid))

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'max_features': 'auto', 'n_estimators': 30, 'min_samples_leaf': 3, 'max_depth': None, 'class_weight': 'balanced'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.99      0.83      0.90      3238
        1.0       0.30      0.87      0.44       262

avg / total       0.94      0.84      0.87      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.98      0.70      0.82       466
        1.0       0.16      0.76      0.26        34

avg / total       0.92      0.70      0.78       500




In [48]:
clf_comb2 = model_select_sgd((X_train_combined, X_valid_combined, y_train, y_valid))

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'penalty': 'l2', 'alpha': 1e-05, 'n_iter': 80, 'loss': 'hinge'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.98      1.00      0.99      3238
        1.0       0.96      0.81      0.88       262

avg / total       0.98      0.98      0.98      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.98      0.99      0.98       466
        1.0       0.79      0.68      0.73        34

avg / total       0.96      0.97      0.96       500




In [49]:
clf_comb3 = model_select_svc((X_train_combined, X_valid_combined, y_train, y_valid))

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'kernel': 'rbf', 'C': 1, 'gamma': 10}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.99      1.00      0.99      3238
        1.0       1.00      0.83      0.91       262

avg / total       0.99      0.99      0.99      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.98      1.00      0.99       466
        1.0       0.96      0.68      0.79        34

avg / total       0.98      0.98      0.97       500


