In [19]:
from create_lda_datasets import *
import scipy.sparse as sp
from classifiers import *
# Explicit numpy binding for the `np.` calls below; placed after the star
# imports so none of them can rebind `np`.
import numpy as np

In [4]:
def evaluate_model(clf, X_train, X_test, y_train, y_test):
    """Print classification reports for `clf` on the training and test splits.

    Args:
        clf: object exposing ``predict(X)``; it is only used for prediction here.
        X_train, X_test: feature inputs passed unchanged to ``clf.predict``.
        y_train, y_test: reference labels compared against the predictions.

    Returns:
        None. The reports are written to stdout.

    Note:
        ``classification_report`` is not imported in this cell; it is expected
        to already be in the notebook namespace (presumably through one of the
        star imports -- confirm).
    """
    # Training predictions are computed before anything is printed.
    train_predictions = clf.predict(X_train)
    print("Detailed classification report:\n")
    print("Scores on training set.\n")
    print(classification_report(y_train, train_predictions))

    test_predictions = clf.predict(X_test)
    print("Scores on test set.\n")
    print(classification_report(y_test, test_predictions))

In [5]:
# Identifier passed to both dataset loaders in the next cell
# (presumably a user id -- confirm against the loaders' definitions).
u = 83119297

In [6]:
# Load the two feature sets for `u`. Each loader's result is unpacked as
# (train, valid, test) features followed by (train, valid, test) labels.
# The first loader's labels get their own names so they are not silently
# overwritten by the second call.
X_train, X_valid, X_test, y_train_sna, y_valid_sna, y_test_sna = load_small_validation_dataframe(u)

X_train_lda, X_valid_lda, X_test_lda, y_train, y_valid, y_test = load_lda_dataset(u)

# The two feature sets are stacked side by side below and are both paired
# with y_train / y_valid / y_test later on, so the loaders must agree on the
# labels of every split; fail loudly here instead of training on misaligned rows.
assert np.array_equal(y_train_sna, y_train), "train labels differ between the two loaders"
assert np.array_equal(y_valid_sna, y_valid), "validation labels differ between the two loaders"
assert np.array_equal(y_test_sna, y_test), "test labels differ between the two loaders"

# Column-wise concatenation: first loader's features followed by the LDA features.
X_train_combined = sp.hstack((X_train, X_train_lda))
X_valid_combined = sp.hstack((X_valid, X_valid_lda))
X_test_combined = sp.hstack((X_test, X_test_lda))

In [9]:
# Class frequencies of the training labels (the recorded reports show the
# labels are 0.0 / 1.0): w1 is the share of positive labels, w0 the share of
# zero labels. np.mean keeps the ratio fractional even for integer labels.
w1 = np.mean(y_train)
w0 = 1 - w1
# Weight each sample by the frequency of the *opposite* class so that the
# rarer class receives the larger weight (inverse class-frequency balancing).
sample_weights = np.where(np.asarray(y_train) == 0, w1, w0)

In [10]:
# Train/validation bundle built from the horizontally stacked feature
# matrices, in the (X_train, X_valid, y_train, y_valid) order.
ds_comb = (
    X_train_combined,
    X_valid_combined,
    y_train,
    y_valid,
)

In [11]:
# Train/validation bundle using only the first loader's features,
# in the (X_train, X_valid, y_train, y_valid) order.
ds_sna = (
    X_train,
    X_valid,
    y_train,
    y_valid,
)

In [12]:
# Run model_select_rdf on the split already bundled in ds_sna
# (the same four objects, in the same order, as the inline tuple).
clf_sna = model_select_rdf(ds_sna)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'max_features': None, 'n_estimators': 100, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced_subsample'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      3345
        1.0       0.93      0.99      0.96       155

avg / total       1.00      1.00      1.00      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       1.00      0.95      0.97       480
        1.0       0.43      1.00      0.61        20

avg / total       0.98      0.95      0.96       500




In [13]:
# Run model_select_sgd on the split already bundled in ds_sna
# (the same four objects, in the same order, as the inline tuple).
clf_sna2 = model_select_sgd(ds_sna)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'penalty': 'l2', 'alpha': 1e-05, 'n_iter': 80, 'loss': 'log'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.96      1.00      0.98      3345
        1.0       0.50      0.01      0.03       155

avg / total       0.94      0.96      0.94      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.96      1.00      0.98       480
        1.0       1.00      0.05      0.10        20

avg / total       0.96      0.96      0.95       500




In [14]:
# Run model_select_svc on the split already bundled in ds_sna
# (the same four objects, in the same order, as the inline tuple).
clf_sna3 = model_select_svc(ds_sna)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'kernel': 'rbf', 'C': 0.05, 'gamma': 10}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      3345
        1.0       1.00      0.95      0.97       155

avg / total       1.00      1.00      1.00      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       480
        1.0       1.00      0.95      0.97        20

avg / total       1.00      1.00      1.00       500




In [16]:
# Run model_select_rdf on the stacked-feature split already bundled in
# ds_comb (the same four objects, in the same order, as the inline tuple).
clf_comb = model_select_rdf(ds_comb)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'max_features': 'auto', 'n_estimators': 10, 'min_samples_leaf': 3, 'max_depth': 5, 'class_weight': 'balanced_subsample'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.97      0.76      0.85      3345
        1.0       0.09      0.50      0.15       155

avg / total       0.93      0.75      0.82      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.96      0.89      0.93       480
        1.0       0.07      0.20      0.10        20

avg / total       0.93      0.86      0.89       500




In [17]:
# Run model_select_sgd on the stacked-feature split already bundled in
# ds_comb (the same four objects, in the same order, as the inline tuple).
clf_comb2 = model_select_sgd(ds_comb)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'penalty': 'l1', 'alpha': 1e-05, 'n_iter': 80, 'loss': 'hinge'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      3345
        1.0       0.98      0.97      0.97       155

avg / total       1.00      1.00      1.00      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       1.00      0.95      0.97       480
        1.0       0.45      1.00      0.62        20

avg / total       0.98      0.95      0.96       500




In [18]:
# Run model_select_svc on the stacked-feature split already bundled in
# ds_comb (the same four objects, in the same order, as the inline tuple).
clf_comb3 = model_select_svc(ds_comb)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'kernel': 'rbf', 'C': 1, 'gamma': 10}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      3345
        1.0       1.00      0.95      0.98       155

avg / total       1.00      1.00      1.00      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       480
        1.0       1.00      0.95      0.97        20

avg / total       1.00      1.00      1.00       500


