In [7]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

imdb_train = load_files('aclImdb/train/', categories=['neg', 'pos'])
imdb_test = load_files('aclImdb/test/', categories=['neg', 'pos'])

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(imdb_train.data)
x_test_counts = count_vect.transform(imdb_test.data)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_test_tfidf = tfidf_transformer.transform(x_test_counts)

In [8]:
import classifiers

parameters = {
    'max_iter':[100, 500, 1000],
    'solver':['lbfgs', 'liblinear', 'newton-cg']
}

classifiers.lr_fold(x_train_tfidf, x_test_tfidf, imdb_train, imdb_test, parameters)

Best Parameters: {'max_iter': 100, 'solver': 'liblinear'}
Best Score: 0.8882
Parameters: {'max_iter': 100, 'solver': 'lbfgs'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 100, 'solver': 'liblinear'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 100, 'solver': 'newton-cg'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 500, 'solver': 'lbfgs'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 500, 'solver': 'liblinear'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 500, 'solver': 'newton-cg'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 1000, 'solver': 'lbfgs'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 1000, 'solver': 'liblinear'}, Score: 0.888 (+/- 0.003)
Parameters: {'max_iter': 1000, 'solver': 'newton-cg'}, Score: 0.888 (+/- 0.003)
Test Accuracy: 0.8831


In [9]:
import classifiers

parameters = {
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

classifiers.decision_tree_fold(x_train_tfidf, x_test_tfidf, imdb_train, imdb_test, parameters)

Best Parameters: {'criterion': 'log_loss', 'splitter': 'random'}
Best Score: 0.7125999999999999
Parameters: {'criterion': 'gini', 'splitter': 'best'}, Score: 0.705 (+/- 0.005)
Parameters: {'criterion': 'gini', 'splitter': 'random'}, Score: 0.711 (+/- 0.007)
Parameters: {'criterion': 'entropy', 'splitter': 'best'}, Score: 0.700 (+/- 0.006)
Parameters: {'criterion': 'entropy', 'splitter': 'random'}, Score: 0.707 (+/- 0.004)
Parameters: {'criterion': 'log_loss', 'splitter': 'best'}, Score: 0.698 (+/- 0.006)
Parameters: {'criterion': 'log_loss', 'splitter': 'random'}, Score: 0.713 (+/- 0.008)
Test Accuracy: 0.7108


In [14]:
import classifiers

parameters = {
    'max_iter':[100, 500, 1000]
}

classifiers.svc_fold(x_train_tfidf, x_test_tfidf, imdb_train, imdb_test, parameters)

Best Parameters: {'max_iter': 100}
Best Score: 0.8928800000000001
Parameters: {'max_iter': 100}, Score: 0.893 (+/- 0.006)
Parameters: {'max_iter': 500}, Score: 0.893 (+/- 0.006)
Parameters: {'max_iter': 1000}, Score: 0.893 (+/- 0.006)
Test Accuracy: 0.8772


In [11]:
import classifiers

parameters = {
    'n_estimators':[10, 25, 50],
    'learning_rate':[0.01, 0.1, 1]
}

classifiers.ada_boost_fold(x_train_tfidf, x_test_tfidf, imdb_train, imdb_test, parameters)

Best Parameters: {'learning_rate': 1, 'n_estimators': 50}
Best Score: 0.80456
Parameters: {'learning_rate': 0.01, 'n_estimators': 10}, Score: 0.615 (+/- 0.006)
Parameters: {'learning_rate': 0.01, 'n_estimators': 25}, Score: 0.653 (+/- 0.009)
Parameters: {'learning_rate': 0.01, 'n_estimators': 50}, Score: 0.653 (+/- 0.009)
Parameters: {'learning_rate': 0.1, 'n_estimators': 10}, Score: 0.681 (+/- 0.010)
Parameters: {'learning_rate': 0.1, 'n_estimators': 25}, Score: 0.696 (+/- 0.009)
Parameters: {'learning_rate': 0.1, 'n_estimators': 50}, Score: 0.729 (+/- 0.010)
Parameters: {'learning_rate': 1, 'n_estimators': 10}, Score: 0.728 (+/- 0.008)
Parameters: {'learning_rate': 1, 'n_estimators': 25}, Score: 0.776 (+/- 0.008)
Parameters: {'learning_rate': 1, 'n_estimators': 50}, Score: 0.805 (+/- 0.004)
Test Accuracy: 0.8034


In [12]:
import classifiers

parameters = {
    'n_estimators':[25, 50, 100],
    'max_depth':[10, 100, None],
    'n_jobs':[-1]
}

classifiers.random_forest_fold(x_train_tfidf, x_test_tfidf, imdb_train, imdb_test, parameters)

Best Parameters: {'max_depth': 100, 'n_estimators': 100, 'n_jobs': -1}
Best Score: 0.8352
Parameters: {'max_depth': 10, 'n_estimators': 25, 'n_jobs': -1}, Score: 0.774 (+/- 0.012)
Parameters: {'max_depth': 10, 'n_estimators': 50, 'n_jobs': -1}, Score: 0.799 (+/- 0.009)
Parameters: {'max_depth': 10, 'n_estimators': 100, 'n_jobs': -1}, Score: 0.813 (+/- 0.010)
Parameters: {'max_depth': 100, 'n_estimators': 25, 'n_jobs': -1}, Score: 0.794 (+/- 0.008)
Parameters: {'max_depth': 100, 'n_estimators': 50, 'n_jobs': -1}, Score: 0.820 (+/- 0.004)
Parameters: {'max_depth': 100, 'n_estimators': 100, 'n_jobs': -1}, Score: 0.835 (+/- 0.006)
Parameters: {'max_depth': None, 'n_estimators': 25, 'n_jobs': -1}, Score: 0.792 (+/- 0.004)
Parameters: {'max_depth': None, 'n_estimators': 50, 'n_jobs': -1}, Score: 0.817 (+/- 0.006)
Parameters: {'max_depth': None, 'n_estimators': 100, 'n_jobs': -1}, Score: 0.833 (+/- 0.004)
Test Accuracy: 0.8390
