Skip to content

Commit

Permalink
Merge branch 'master' into multiple_grid_search
Browse files Browse the repository at this point in the history
Conflicts:
	sklearn/cross_validation.py
	sklearn/feature_selection/rfe.py
	sklearn/grid_search.py
	sklearn/learning_curve.py
	sklearn/metrics/scorer.py
	sklearn/metrics/tests/test_score_objects.py
	sklearn/tests/test_grid_search.py
  • Loading branch information
mblondel committed Jan 16, 2014
2 parents 79656d5 + de2be61 commit f6a44a0
Show file tree
Hide file tree
Showing 32 changed files with 1,487 additions and 175 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ test-code: in
$(NOSETESTS) -s -v sklearn
test-doc:
$(NOSETESTS) -s -v doc/ doc/modules/ doc/datasets/ \
doc/developers doc/tutorial/basic doc/tutorial/statistical_inference
doc/developers doc/tutorial/basic doc/tutorial/statistical_inference \
doc/tutorial/text_analytics

test-coverage:
rm -rf coverage .coverage
Expand Down
109 changes: 54 additions & 55 deletions doc/documentation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Documentation of scikit-learn 0.15
.. raw:: html

<!-- Block section -->
<!-- row -->
<div class="row-fluid">
<div class="span4 box">
<h2><a href="tutorial/basic/tutorial.html">Quick Start</a></h2>
Expand All @@ -17,10 +18,10 @@ Documentation of scikit-learn 0.15
</blockquote>
</div>
<div class="span4 box">
<h2><a href="user_guide.html">User Guide</a></h2>
<h2><a href="user_guide.html">User Guide</a></h2>
<blockquote>The main documentation. This contains an
in-depth description of all algorithms and how
to apply them.
to apply them.
</blockquote>
</div>
<div class="span4 box">
Expand All @@ -32,69 +33,67 @@ Documentation of scikit-learn 0.15
</div>
</div>

<!-- row -->
<div class="row-fluid">
<div class="span8">
<!-- Documentation overview -->
<!-- Documentation overview -->
<div class="row-fluid">
<div class="span6 box">
<h2><a href="tutorial/index.html">Tutorial</a></h2>
<blockquote>A tutorial on statistical learning for
data analysis. Contains a more in-depth discussion
of important concepts.
<div class="span4 box">
<h2><a href="tutorial/index.html">Tutorials</a></h2>
<blockquote>Useful tutorials for developing a feel
for some of scikit-learn's applications in the
machine learning field.
</blockquote>
</div>
<div class="span6 box">
<h2><a href="modules/classes.html">API</a></h2>
<blockquote>The exact API of all functions and classes, as given by the docstrings.
The API documents expected types and allowed features for all functions,
and all parameters available for the algorithms.
</blockquote>
<div class="span4 box">
<h2><a href="modules/classes.html">API</a></h2>
<blockquote>The exact API of all functions and classes, as given by the docstrings.
The API documents expected types and allowed features for all functions,
and all parameters available for the algorithms.
</blockquote>
</div>
<div class="span4 box">
<!-- doc versions -->
<h2>Other Versions</h2>
<ul>
<li><a href="http://scikit-learn.org/stable/user_guide.html">scikit-learn 0.14 (stable)</a></li>
<li>scikit-learn 0.15 (development)</li>
<li><a href="http://scikit-learn.org/0.13/user_guide.html">scikit-learn 0.13</a></li>
<li><a href="http://scikit-learn.org/0.12/user_guide.html">scikit-learn 0.12</a></li>
<li><a href="http://scikit-learn.org/0.11/user_guide.html">scikit-learn 0.11</a></li>
<li id="other-versions">Older versions
<a class="btn dropdown-toggle" data-toggle="dropdown">
<span class="caret"></span>
</a>
<ul class="dropdown-menu">
<li><a href="http://scikit-learn.org/0.10/user_guide.html">scikit-learn 0.10</a></li>
<li><a href="http://scikit-learn.org/0.9/user_guide.html">scikit-learn 0.9</a></li>
<li><a href="http://scikit-learn.org/0.8/user_guide.html">scikit-learn 0.8</a></li>
<li><a href="http://scikit-learn.org/0.7/user_guide.html">scikit-learn 0.7</a></li>
<li><a href="http://scikit-learn.org/0.6/user_guide.html">scikit-learn 0.6</a></li>
<li><a href="http://scikit-learn.org/0.5/user_guide.html">scikit-learn 0.5</a></li>
</ul>
</li>
</ul>

</div>
<!-- row -->
</div>
</div>

<div class="row-fluid">
<div class="span6 box">
<h2><a href="developers/index.html">Contributing</a></h2>
<blockquote>Information on how to contribute. This also
contains useful information for advanced users, for example
how to build their own estimators.
</blockquote>
</div>
<div class="span6 box">
<h2><a href="presentations.html">Additional Resources</a></h2>
<blockquote>Talks given, slide-sets and other information relevant to scikit-learn.
</blockquote>
</div>
<!-- row -->
</div>

<!-- doc overview -->
<!-- row -->
<div class="row-fluid">
<div class="span4 box">
<h2><a href="presentations.html">Additional Resources</a></h2>
<blockquote>Talks given, slide-sets and other information relevant to scikit-learn.
</blockquote>
</div>

<!-- doc versions -->

<div class="span4 box">
<h2>Other Versions</h2>
<ul>
<li><a href="http://scikit-learn.org/stable/user_guide.html">scikit-learn 0.14 (stable)</a></li>
<li>scikit-learn 0.15 (development)</li>

<li><a href="http://scikit-learn.org/0.13/user_guide.html">scikit-learn 0.13</a></li>
<li><a href="http://scikit-learn.org/0.12/user_guide.html">scikit-learn 0.12</a></li>
<li><a href="http://scikit-learn.org/0.11/user_guide.html">scikit-learn 0.11</a></li>
<li><a href="http://scikit-learn.org/0.10/user_guide.html">scikit-learn 0.10</a></li>
<li><a href="http://scikit-learn.org/0.9/user_guide.html">scikit-learn 0.9</a></li>
<li><a href="http://scikit-learn.org/0.8/user_guide.html">scikit-learn 0.8</a></li>
<li><a href="http://scikit-learn.org/0.7/user_guide.html">scikit-learn 0.7</a></li>
<li><a href="http://scikit-learn.org/0.6/user_guide.html">scikit-learn 0.6</a></li>
<li><a href="http://scikit-learn.org/0.5/user_guide.html">scikit-learn 0.5</a></li>
</ul>

<h2><a href="developers/index.html">Contributing</a></h2>
<blockquote>Information on how to contribute. This also
contains useful information for advanced users, for example
how to build their own estimators.
</blockquote>
</div>


<!-- row -->

</div>
</div>

1 change: 0 additions & 1 deletion doc/themes/scikit-learn/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@
</a>
<ul class="dropdown-menu">
<li class="link-title">Scikit-learn 0.14 (stable)</li>
<li><a href="{{ pathto('tutorial/basic/tutorial') }}">Quick Start</a></li>
<li><a href="{{ pathto('tutorial/index') }}">Tutorials</a></li>
<li><a href="{{ pathto('user_guide') }}">User guide</a></li>
<li><a href="{{ pathto('modules/classes') }}">API</a></li>
Expand Down
31 changes: 30 additions & 1 deletion doc/themes/scikit-learn/static/nature.css_t
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ div.buttonPrevious, div.buttonNext {

div.buttonPrevious {
border-top-right-radius: .8em;
right: 1;
left: 0;
}

div.buttonNext {
Expand Down Expand Up @@ -1110,6 +1110,35 @@ a.btn.dropdown-toggle, a.btn.dropdown-toggle:hover{
vertical-align: baseline;
}

/* Styling for the list item with id "other-versions" (the "Older versions"
   entry that carries a dropdown toggle and a nested dropdown menu). */
li#other-versions {
position: absolute;
left: inherit;
right: inherit;
top: inherit;
}

/* The toggle button placed next to the "Older versions" label. */
#other-versions a.btn.dropdown-toggle {
margin-left: 0.5em;
padding: 4px 6px 4px 11px;
}

/* Caret inside the toggle: only the bottom border is coloured, the side
   borders are transparent and the top border is removed.
   NOTE(review): presumably this renders an upward-pointing triangle on a
   zero-sized element -- confirm against the base .caret rule. */
#other-versions .caret {
border-left: 5px solid transparent;
border-right: 6px solid transparent;
border-bottom: 8px solid black;
border-top: 0px;
margin-right: 5px;
margin-top: 0px;
vertical-align: middle;
}

/* Nested menu listing the older releases.
   NOTE(review): the percentage offsets look hand-tuned to position the menu
   relative to the toggle -- verify visually if the list length changes. */
#other-versions .dropdown-menu {
position: absolute;
right: -60%;
top: -805%;
left: initial;
}

.navbar .btn, .navbar .open>.btn, .navbar .btn:hover{
display: inline-block;
padding: 4px 12px;
Expand Down
20 changes: 18 additions & 2 deletions doc/tutorial/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,31 @@

.. include:: ../includes/big_toc_css.rst

Tutorial: Statistical Learning for scientific data analysis
=======================================================================
======================
scikit-learn Tutorials
======================

|
.. toctree::
:maxdepth: 2

basic/tutorial.rst


.. toctree::
:maxdepth: 2

statistical_inference/index.rst


.. toctree::
:maxdepth: 2

text_analytics/working_with_text_data.rst

|
.. note:: **Doctest Mode**

The code-examples in the above tutorials are written in a
Expand Down
25 changes: 25 additions & 0 deletions doc/tutorial/text_analytics/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# cruft
.*.swp
*.pyc
.DS_Store
*.pdf

# folder to be used for working on the exercises
workspace

# output of the sphinx build of the documentation
tutorial/_build

# datasets to be fetched from the web and cached locally
data/twenty_newsgroups/20news-bydate.tar.gz
data/twenty_newsgroups/20news-bydate-train
data/twenty_newsgroups/20news-bydate-test

data/movie_reviews/txt_sentoken
data/movie_reviews/poldata.README.2.0

data/languages/paragraphs
data/languages/short_paragraphs
data/languages/html

data/labeled_faces_wild/lfw_preprocessed/
105 changes: 105 additions & 0 deletions doc/tutorial/text_analytics/data/languages/fetch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@

# Simple Python script to collect text paragraphs in various languages on the
# same topic, namely the Wikipedia encyclopedia itself.

import os
try:
# Python 2 compat
from urllib2 import Request, build_opener
except ImportError:
# Python 3
from urllib.request import Request, build_opener

import lxml.html
from lxml.etree import ElementTree
import numpy as np

# Wikipedia article about Wikipedia itself, keyed by language code.
pages = {
    u'ar': u'http://ar.wikipedia.org/wiki/%D9%88%D9%8A%D9%83%D9%8A%D8%A8%D9%8A%D8%AF%D9%8A%D8%A7',
    u'de': u'http://de.wikipedia.org/wiki/Wikipedia',
    u'en': u'http://en.wikipedia.org/wiki/Wikipedia',
    u'es': u'http://es.wikipedia.org/wiki/Wikipedia',
    u'fr': u'http://fr.wikipedia.org/wiki/Wikip%C3%A9dia',
    u'it': u'http://it.wikipedia.org/wiki/Wikipedia',
    u'ja': u'http://ja.wikipedia.org/wiki/Wikipedia',
    u'nl': u'http://nl.wikipedia.org/wiki/Wikipedia',
    u'pl': u'http://pl.wikipedia.org/wiki/Wikipedia',
    u'pt': u'http://pt.wikipedia.org/wiki/Wikip%C3%A9dia',
    u'ru': u'http://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F',
    # u'zh': u'http://zh.wikipedia.org/wiki/Wikipedia',
}

# Output folders (relative to the current working directory).
html_folder = u'html'
text_folder = u'paragraphs'
short_text_folder = u'short_paragraphs'
# Approximate number of words in each generated "short" paragraph.
n_words_per_short_text = 5


if not os.path.exists(html_folder):
    os.makedirs(html_folder)

for lang, page in pages.items():

    text_lang_folder = os.path.join(text_folder, lang)
    if not os.path.exists(text_lang_folder):
        os.makedirs(text_lang_folder)

    short_text_lang_folder = os.path.join(short_text_folder, lang)
    if not os.path.exists(short_text_lang_folder):
        os.makedirs(short_text_lang_folder)

    opener = build_opener()
    html_filename = os.path.join(html_folder, lang + '.html')
    if not os.path.exists(html_filename):
        print("Downloading %s" % page)
        request = Request(page)
        # change the User Agent to avoid being blocked by Wikipedia;
        # downloading a couple of articles once should not be abusive
        request.add_header('User-Agent', 'OpenAnything/1.0')
        response = opener.open(request)
        try:
            html_content = response.read()
        finally:
            response.close()
        with open(html_filename, 'wb') as html_file:
            html_file.write(html_content)

    # Read the cached page back as raw bytes and decode the payload explicitly
    # as UTF-8 since lxml is confused for some reason. Reading in binary mode
    # keeps the result independent of the platform's default text encoding
    # (a text-mode read on Python 3 would decode with the locale encoding).
    with open(html_filename, 'rb') as html_file:
        html_content = html_file.read().decode('utf-8')
    tree = ElementTree(lxml.html.document_fromstring(html_content))
    i = 0  # index of the next full-paragraph file for this language
    j = 0  # index of the next short-paragraph file for this language
    for p in tree.findall('//p'):
        content = p.text_content()
        if len(content) < 100:
            # skip paragraphs that are too short - probably too noisy and not
            # representative of the actual language
            continue

        text_filename = os.path.join(text_lang_folder,
                                     '%s_%04d.txt' % (lang, i))
        print("Writing %s" % text_filename)
        with open(text_filename, 'wb') as text_file:
            text_file.write(content.encode('utf-8', 'ignore'))
        i += 1

        # split the paragraph into fake smaller paragraphs to make the
        # problem harder e.g. more similar to tweets
        if lang in ('zh', 'ja'):
            # FIXME: whitespace tokenizing does not work on chinese and
            # japanese
            continue
        words = content.split()
        # Floor division: np.array_split needs an integer number of sections
        # ("/" yields a float on Python 3).
        n_groups = len(words) // n_words_per_short_text
        if n_groups < 1:
            continue
        groups = np.array_split(words, n_groups)

        for group in groups:
            small_content = u" ".join(group)

            short_text_filename = os.path.join(short_text_lang_folder,
                                               '%s_%04d.txt' % (lang, j))
            print("Writing %s" % short_text_filename)
            with open(short_text_filename, 'wb') as short_text_file:
                short_text_file.write(
                    small_content.encode('utf-8', 'ignore'))
            j += 1
            if j >= 1000:
                # stop splitting the current paragraph once 1000 short
                # texts have been written for this language
                break

27 changes: 27 additions & 0 deletions doc/tutorial/text_analytics/data/movie_reviews/fetch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Script to download the movie review dataset"""

import os
import tarfile
try:
from urllib import urlopen
except ImportError:
from urllib.request import urlopen


# Location of the polarity dataset archive.
URL = ("http://www.cs.cornell.edu/people/pabo/"
       "movie-review-data/review_polarity.tar.gz")

# Local file name of the downloaded archive (last component of the URL).
ARCHIVE_NAME = URL.rsplit('/', 1)[1]
# Folder whose presence means the dataset has already been extracted.
DATA_FOLDER = "txt_sentoken"


if not os.path.exists(DATA_FOLDER):

    if not os.path.exists(ARCHIVE_NAME):
        print("Downloading dataset from %s (3 MB)" % URL)
        opener = urlopen(URL)
        try:
            archive_bytes = opener.read()
        finally:
            # release the connection even if the download fails midway
            opener.close()
        with open(ARCHIVE_NAME, 'wb') as archive_file:
            archive_file.write(archive_bytes)

    print("Decompressing %s" % ARCHIVE_NAME)
    archive = tarfile.open(ARCHIVE_NAME, "r:gz")
    try:
        archive.extractall(path='.')
    finally:
        # close the archive before deleting it; an open handle can make
        # os.remove fail on some platforms
        archive.close()
    os.remove(ARCHIVE_NAME)
Loading

0 comments on commit f6a44a0

Please sign in to comment.