In [3]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups; the same list is reused when the
# test split is fetched later in the notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split, shuffled with a fixed seed so the document order is repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [7]:
# CountVectorizer and TfidfTransformer are imported here because no other
# visible cell imports them; without this the cell raises NameError on a
# fresh kernel (Restart & Run All).
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Text-classification pipeline with three named steps. The step names
# ('vect', 'tfidf', 'clf') are the prefixes used by the hyper-parameter grid
# later in the notebook (e.g. 'clf__alpha'), so they must not be renamed.
#   vect  : raw documents -> token-count matrix
#   tfidf : counts -> tf-idf weighted features
#   clf   : linear model trained with SGD (hinge loss, L2 penalty)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     # tol=None turns off the tolerance-based stopping check,
                     # so training runs for max_iter=5 passes over the data.
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])

In [11]:
import numpy as np

# Held-out split, limited to the same categories as the training data.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Train on the full training split, then label every test document.
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)

# Accuracy: fraction of test documents whose predicted label matches the target.
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.9127829560585885

In [12]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid, keyed as '<pipeline step>__<parameter>'.
# Two values per parameter -> 2 * 2 * 2 = 8 candidate settings.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Exhaustive search over the grid; n_jobs=-1 requests parallel execution.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
)

In [14]:
# Run the grid search on the first 1500 training documents only (documents
# and targets are sliced identically). %time prints the CPU and wall-clock
# time of this single statement.
%time gs_clf = gs_clf.fit(twenty_train.data[:1500], twenty_train.target[:1500])

CPU times: user 626 ms, sys: 64.6 ms, total: 691 ms
Wall time: 19.4 s


In [18]:
# Display the score of the best candidate found by the grid search
# (per the scikit-learn docs, the mean cross-validated score of the best setting).
gs_clf.best_score_

0.95933333333333337

In [15]:
!pip install sklearn-deap

Collecting sklearn-deap
  Downloading https://files.pythonhosted.org/packages/c0/25/036417008f630f8c6f9ecde43030b525571e9a09c9267a039c47b3243270/sklearn-deap-0.2.2.tar.gz
Collecting deap>=1.0.2 (from sklearn-deap)
  Downloading https://files.pythonhosted.org/packages/af/29/e7f2ecbe02997b16a768baed076f5fc4781d7057cd5d9adf7c94027845ba/deap-1.2.2.tar.gz (936kB)
[K    100% |████████████████████████████████| 942kB 244kB/s ta 0:00:01
Building wheels for collected packages: sklearn-deap, deap
  Running setup.py bdist_wheel for sklearn-deap ... [?25ldone
[?25h  Stored in directory: /home/rafael/.cache/pip/wheels/f4/8c/dc/37182364e7eec60b7ca2f647f96fd94cca7e3af422a2d26913
  Running setup.py bdist_wheel for deap ... [?25ldone
[?25h  Stored in directory: /home/rafael/.cache/pip/wheels/22/ea/bf/dc7c8a2262025a0ab5da9ef02282c198be88902791ca0c6658
Successfully built sklearn-deap deap
Installing collected packages: deap, sklearn-deap
Successfully installed deap-1.2.2 sklearn-deap-0.2.2
[33mYou a

In [25]:
from evolutionary_search import EvolutionaryAlgorithmSearchCV
from sklearn.model_selection import StratifiedKFold

# Settings of the genetic algorithm itself, kept apart from the estimator and
# the search space so they are easy to tune in one place.
evolution_settings = dict(
    population_size=50,
    gene_mutation_prob=0.10,
    gene_crossover_prob=0.5,
    tournament_size=3,
    generations_number=5,
)

# Evolutionary search over the same pipeline and the same parameter grid
# that the exhaustive grid search uses; verbose=1 prints per-generation stats.
cv = EvolutionaryAlgorithmSearchCV(
    estimator=text_clf,
    params=parameters,
    verbose=1,
    n_jobs=4,
    **evolution_settings,
)



In [26]:
# Run the evolutionary search on the same 1500-document slice used for the
# grid search, timed with %time so the two wall-clock costs can be compared.
%time cv.fit(twenty_train.data[:1500], twenty_train.target[:1500])

Types [1, 2, 1] and maxint [1, 1, 1] detected
--- Evolve in 8 possible combinations ---
gen	nevals	avg     	min     	max     	std      
0  	50    	0.892013	0.793333	0.959333	0.0572212
1  	29    	0.929307	0.793333	0.959333	0.032214 
2  	34    	0.948373	0.886   	0.959333	0.0209991
3  	37    	0.957413	0.915333	0.959333	0.00835293
4  	30    	0.959333	0.959333	0.959333	0         
5  	38    	0.959333	0.959333	0.959333	0         
Best individual is: {'vect__ngram_range': (1, 1), 'clf__alpha': 0.001, 'tfidf__use_idf': True}
with fitness: 0.9593333333333334
CPU times: user 771 ms, sys: 89.8 ms, total: 861 ms
Wall time: 4min 28s
