In [144]:
from create_lda_datasets import *
import scipy.sparse as sp
from classifiers import *

In [50]:
def evaluate_model(clf, X_train, X_test, y_train, y_test):
    """Print classification reports for ``clf`` on two data splits.

    The training split is scored first, then the test split.  Each report is
    ``classification_report(true_labels, clf.predict(features))``.  The
    function only prints; it returns nothing.
    """
    def _show(heading, truth, predicted):
        # One section: heading line followed by the report for that split.
        print(heading)
        print(classification_report(truth, predicted))

    # Predict on the training split before anything is printed.
    train_predictions = clf.predict(X_train)

    print("Detailed classification report:\n")
    _show("Scores on training set.\n", y_train, train_predictions)
    _show("Scores on test set.\n", y_test, clf.predict(X_test))

In [119]:
# Identifier passed to both dataset loaders in the following cells.
# NOTE(review): presumably a user id — confirm against the loader definitions.
u = 37226353

In [120]:
# Load the train / validation / test feature sets and labels for `u`.
# NOTE(review): the loader is pulled in by a star import; confirm which module defines it.
X_train, X_valid, X_test, y_train, y_valid, y_test = load_small_validation_dataframe(u)

In [121]:
# Load the LDA feature matrices for the same `u`.
# This rebinds y_train, y_valid and y_test to the labels returned by this loader.
# NOTE(review): stacking these features next to X_train/X_valid/X_test assumes both
# loaders return the same rows in the same order — confirm.
X_train_lda, X_valid_lda, X_test_lda, y_train, y_valid, y_test = load_lda_dataset(u)

In [122]:
# Column-wise concatenation of the base features with the LDA features,
# done split by split (train, validation, test) in that order.
X_train_combined, X_valid_combined, X_test_combined = (
    sp.hstack(feature_pair)
    for feature_pair in (
        (X_train, X_train_lda),
        (X_valid, X_valid_lda),
        (X_test, X_test_lda),
    )
)

In [123]:
# Display the repr of the LDA training matrix (shape, dtype, sparsity).
X_train_lda

<3500x488 sparse matrix of type '<type 'numpy.float64'>'
	with 27449 stored elements in Compressed Sparse Column format>

In [124]:
# Display the repr of the LDA validation matrix (shape, dtype, sparsity).
X_valid_lda

<500x488 sparse matrix of type '<type 'numpy.float64'>'
	with 4189 stored elements in Compressed Sparse Column format>

In [130]:
# Per-sample weights that balance the two classes.
#
# w1 is the fraction of samples labelled 1 and w0 the fraction labelled 0.
# Each sample is weighted by the frequency of the *other* class, so that both
# classes carry the same total weight (n0 * w1 == n1 * w0).  Weighting a class
# by its own frequency would instead amplify the majority class.
y_train_arr = np.asarray(y_train)
# Mean of 0/1 labels; NumPy's mean is a float even for integer labels, which
# avoids integer division truncating the ratio to 0 under Python 2.
w1 = y_train_arr.mean()
w0 = 1 - w1
sample_weights = np.where(y_train_arr == 0, w1, w0)

In [126]:
# Combined-feature dataset bundle, ordered (train X, validation X, train y, validation y)
# — the same order in which the model_select_* helpers are given their data.
ds_comb = (X_train_combined, X_valid_combined, y_train, y_valid)

In [127]:
# Base-feature dataset bundle, ordered (train X, validation X, train y, validation y)
# — the same order in which the model_select_* helpers are given their data.
ds_sna = (X_train, X_valid, y_train, y_valid)

In [145]:
# Model selection on the base features; ds_sna is (X_train, X_valid, y_train, y_valid).
clf_sna = model_select_rdf(ds_sna)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'max_features': 'auto', 'n_estimators': 100, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced_subsample'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.99      0.97      0.98      2245
        1.0       0.94      0.98      0.96      1255

avg / total       0.97      0.97      0.97      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.97      0.90      0.93       320
        1.0       0.84      0.94      0.89       180

avg / total       0.92      0.92      0.92       500




In [142]:
# Model selection on the base features; ds_sna is (X_train, X_valid, y_train, y_valid).
clf_sna2 = model_select_sgd(ds_sna)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 10, 'loss': 'hinge'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.98      0.96      0.97      2245
        1.0       0.93      0.96      0.95      1255

avg / total       0.96      0.96      0.96      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.95      0.89      0.92       320
        1.0       0.83      0.92      0.87       180

avg / total       0.91      0.90      0.90       500




In [143]:
# Model selection on the base features; ds_sna is (X_train, X_valid, y_train, y_valid).
clf_sna3 = model_select_svc(ds_sna)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'kernel': 'rbf', 'C': 1, 'gamma': 10}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.99      0.97      0.98      2245
        1.0       0.95      0.98      0.96      1255

avg / total       0.97      0.97      0.97      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.93      0.95      0.94       320
        1.0       0.91      0.88      0.89       180

avg / total       0.92      0.92      0.92       500




In [137]:
# Model selection on the combined features;
# ds_comb is (X_train_combined, X_valid_combined, y_train, y_valid).
clf_comb = model_select_rdf(ds_comb)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'max_features': None, 'n_estimators': 100, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced_subsample'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      2245
        1.0       1.00      1.00      1.00      1255

avg / total       1.00      1.00      1.00      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.94      0.89      0.91       320
        1.0       0.82      0.89      0.85       180

avg / total       0.89      0.89      0.89       500




In [139]:
# Model selection on the combined features;
# ds_comb is (X_train_combined, X_valid_combined, y_train, y_valid).
clf_comb2 = model_select_sgd(ds_comb)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 10, 'loss': 'hinge'}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.98      0.96      0.97      2245
        1.0       0.93      0.97      0.95      1255

avg / total       0.96      0.96      0.96      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.96      0.89      0.92       320
        1.0       0.83      0.93      0.87       180

avg / total       0.91      0.90      0.91       500




In [140]:
# Model selection on the combined features;
# ds_comb is (X_train_combined, X_valid_combined, y_train, y_valid).
clf_comb3 = model_select_svc(ds_comb)

# Tuning hyper-parameters for f1

Best parameters set found on training set:

{'kernel': 'rbf', 'C': 1, 'gamma': 10}
Detailed classification report:

Scores on training set.
             precision    recall  f1-score   support

        0.0       0.99      0.97      0.98      2245
        1.0       0.95      0.98      0.96      1255

avg / total       0.97      0.97      0.97      3500


Scores on test set.

             precision    recall  f1-score   support

        0.0       0.93      0.95      0.94       320
        1.0       0.91      0.88      0.89       180

avg / total       0.92      0.92      0.92       500




# ⬆ Overfitting again! Training scores (0.96–1.00) are clearly above the held-out scores (0.89–0.92). :/