Merge branch 'master' into multiple_grid_search

Conflicts: sklearn/cross_validation.py sklearn/feature_selection/rfe.py sklearn/grid_search.py sklearn/learning_curve.py sklearn/metrics/scorer.py sklearn/metrics/tests/test_score_objects.py sklearn/tests/test_grid_search.py
mblondel · Jan 16, 2014 · f6a44a0 · f6a44a0
2 parents 79656d5 + de2be61
commit f6a44a0
Show file tree

Hide file tree

Showing 32 changed files with 1,487 additions and 175 deletions.
diff --git a/Makefile b/Makefile
@@ -24,7 +24,8 @@ test-code: in
 	$(NOSETESTS) -s -v sklearn
 test-doc:
 	$(NOSETESTS) -s -v doc/ doc/modules/ doc/datasets/ \
-	doc/developers doc/tutorial/basic doc/tutorial/statistical_inference
+	doc/developers doc/tutorial/basic doc/tutorial/statistical_inference \
+	doc/tutorial/text_analytics
 
 test-coverage:
 	rm -rf coverage .coverage

diff --git a/doc/documentation.rst b/doc/documentation.rst
@@ -8,6 +8,7 @@ Documentation of scikit-learn 0.15
 .. raw:: html
 
         <!-- Block section -->
+          <!-- row -->
             <div class="row-fluid">
                 <div class="span4 box">
                     <h2><a href="tutorial/basic/tutorial.html">Quick Start</a></h2>
@@ -17,10 +18,10 @@ Documentation of scikit-learn 0.15
                     </blockquote>
                 </div>
                 <div class="span4 box">
-                    <h2><a href="user_guide.html">User Guide</a></h2>
+		    <h2><a href="user_guide.html">User Guide</a></h2>
                     <blockquote>The main documentation. This contains an
                         in-depth description of all algorithms and how
-                        to apply them.
+                      	to apply them.
                     </blockquote>
                 </div>
                 <div class="span4 box">
@@ -32,69 +33,67 @@ Documentation of scikit-learn 0.15
                 </div>
             </div>
 
+          <!-- row -->
             <div class="row-fluid">
-                <div class="span8">
-                    <!-- Documentation overview -->
+                <!-- Documentation overview -->
                     <div class="row-fluid">
-                        <div class="span6 box">
-                            <h2><a href="tutorial/index.html">Tutorial</a></h2>
-                            <blockquote>A tutorial on statistical learning for
-                            data analysis. Contains a more in-depth discussion
-                            of important concepts.
+                        <div class="span4 box">
+                            <h2><a href="tutorial/index.html">Tutorials</a></h2>
+                            <blockquote>Useful tutorials for developing a feel
+			    for some of scikit-learn's applications in the
+			    machine learning field.
                             </blockquote>
                         </div>
-                        <div class="span6 box">
-                            <h2><a href="modules/classes.html">API</a></h2>
-                            <blockquote>The exact API of all functions and classes, as given by the docstrings.
-                                The API documents expected types and allowed features for all functions,
-                                and all parameters available for the algorithms.
-                            </blockquote>
+                        <div class="span4 box">
+			    <h2><a href="modules/classes.html">API</a></h2>
+                    	    <blockquote>The exact API of all functions and classes, as given by the docstrings.
+                    	    The API documents expected types and allowed features for all functions,
+                    	    and all parameters available for the algorithms.
+                    	    </blockquote>
                         </div>
+			<div class="span4 box">
+			<!-- doc versions -->
+			    <h2>Other Versions</h2>
+                        <ul>
+                            <li><a href="http://scikit-learn.org/stable/user_guide.html">scikit-learn 0.14 (stable)</a></li>
+                            <li>scikit-learn 0.15 (development)</li>
+                            <li><a href="http://scikit-learn.org/0.13/user_guide.html">scikit-learn 0.13</a></li>
+                            <li><a href="http://scikit-learn.org/0.12/user_guide.html">scikit-learn 0.12</a></li>
+                            <li><a href="http://scikit-learn.org/0.11/user_guide.html">scikit-learn 0.11</a></li>
+			    <li id="other-versions">Older versions
+			    	<a class="btn dropdown-toggle" data-toggle="dropdown">
+			           <span class="caret"></span>
+			    	   </a>
+		      	    <ul class="dropdown-menu">
+			        <li><a href="http://scikit-learn.org/0.10/user_guide.html">scikit-learn 0.10</a></li>
+                            	<li><a href="http://scikit-learn.org/0.9/user_guide.html">scikit-learn 0.9</a></li>
+				<li><a href="http://scikit-learn.org/0.8/user_guide.html">scikit-learn 0.8</a></li>
+                        	<li><a href="http://scikit-learn.org/0.7/user_guide.html">scikit-learn 0.7</a></li>
+                        	<li><a href="http://scikit-learn.org/0.6/user_guide.html">scikit-learn 0.6</a></li>
+                        	<li><a href="http://scikit-learn.org/0.5/user_guide.html">scikit-learn 0.5</a></li>
+		      	    </ul>
+			</li>
+                    </ul>
 
+			</div>
                         <!-- row -->
                     </div>
+	    </div>
 
-                    <div class="row-fluid">
-                        <div class="span6 box">
-                            <h2><a href="developers/index.html">Contributing</a></h2>
-                            <blockquote>Information on how to contribute. This also
-                            contains useful information for advanced users, for example
-                            how to build their own estimators.
-                            </blockquote>
-                        </div>
-                        <div class="span6 box">
-                            <h2><a href="presentations.html">Additional Resources</a></h2>
-                            <blockquote>Talks given, slide-sets and other information relevant to scikit-learn.
-                            </blockquote>
-                        </div>
-                        <!-- row -->
-                    </div>
-
-                    <!-- doc overview -->
+          <!-- row -->
+            <div class="row-fluid">
+                <div class="span4 box">
+		    <h2><a href="presentations.html">Additional Resources</a></h2>
+                        <blockquote>Talks given, slide-sets and other information relevant to scikit-learn.
+                        </blockquote>
                 </div>
-
-                <!-- doc versions -->
-
                 <div class="span4 box">
-                    <h2>Other Versions</h2>
-                    <ul>
-                        <li><a href="http://scikit-learn.org/stable/user_guide.html">scikit-learn 0.14 (stable)</a></li>
-                        <li>scikit-learn 0.15 (development)</li>
-
-                        <li><a href="http://scikit-learn.org/0.13/user_guide.html">scikit-learn 0.13</a></li>
-                        <li><a href="http://scikit-learn.org/0.12/user_guide.html">scikit-learn 0.12</a></li>
-                        <li><a href="http://scikit-learn.org/0.11/user_guide.html">scikit-learn 0.11</a></li>
-                        <li><a href="http://scikit-learn.org/0.10/user_guide.html">scikit-learn 0.10</a></li>
-                        <li><a href="http://scikit-learn.org/0.9/user_guide.html">scikit-learn 0.9</a></li>
-                        <li><a href="http://scikit-learn.org/0.8/user_guide.html">scikit-learn 0.8</a></li>
-                        <li><a href="http://scikit-learn.org/0.7/user_guide.html">scikit-learn 0.7</a></li>
-                        <li><a href="http://scikit-learn.org/0.6/user_guide.html">scikit-learn 0.6</a></li>
-                        <li><a href="http://scikit-learn.org/0.5/user_guide.html">scikit-learn 0.5</a></li>
-                    </ul>
-
+		    <h2><a href="developers/index.html">Contributing</a></h2>
+                    <blockquote>Information on how to contribute. This also
+                    contains useful information for advanced users, for example
+                    how to build their own estimators.
+                    </blockquote>
                 </div>
-
-
-                <!-- row -->
+
             </div>
-    </div>
+
diff --git a/doc/themes/scikit-learn/layout.html b/doc/themes/scikit-learn/layout.html
@@ -79,7 +79,6 @@
 		      </a>
 		      <ul class="dropdown-menu">
 			<li class="link-title">Scikit-learn 0.14 (stable)</li>
-			<li><a href="{{ pathto('tutorial/basic/tutorial') }}">Quick Start</a></li>
 			<li><a href="{{ pathto('tutorial/index') }}">Tutorials</a></li>
 			<li><a href="{{ pathto('user_guide') }}">User guide</a></li>
 			<li><a href="{{ pathto('modules/classes') }}">API</a></li>

diff --git a/doc/themes/scikit-learn/static/nature.css_t b/doc/themes/scikit-learn/static/nature.css_t
@@ -294,7 +294,7 @@ div.buttonPrevious, div.buttonNext {
 
 div.buttonPrevious {
     border-top-right-radius: .8em;
-    right: 1;
+    left: 0;
 }
 
 div.buttonNext {
@@ -1110,6 +1110,35 @@ a.btn.dropdown-toggle,  a.btn.dropdown-toggle:hover{
   vertical-align: baseline;
 }
 
+li#other-versions {
+  position: absolute;
+  left: inherit;
+  right: inherit;
+  top: inherit;
+}
+
+#other-versions a.btn.dropdown-toggle {
+  margin-left: 0.5em;
+  padding: 4px 6px 4px 11px;
+}
+
+#other-versions .caret {
+  border-left: 5px solid transparent;
+  border-right: 6px solid transparent;
+  border-bottom: 8px solid black;
+  border-top: 0px;
+  margin-right: 5px;
+  margin-top: 0px;
+  vertical-align: middle;
+}
+
+#other-versions .dropdown-menu {
+  position: absolute;
+  right: -60%;
+  top: -805%;
+  left: initial;
+}
+
 .navbar .btn, .navbar .open>.btn, .navbar .btn:hover{
   display: inline-block;
   padding: 4px 12px;

diff --git a/doc/tutorial/index.rst b/doc/tutorial/index.rst
@@ -4,15 +4,31 @@
 
 .. include:: ../includes/big_toc_css.rst
 
-Tutorial: Statistical Learning for scientific data analysis
-=======================================================================
+======================
+scikit-learn Tutorials
+======================
+
+|
+
+.. toctree::
+   :maxdepth: 2
+
+   basic/tutorial.rst
+
 
 .. toctree::
    :maxdepth: 2
 
    statistical_inference/index.rst
 
 
+.. toctree::
+   :maxdepth: 2
+
+   text_analytics/working_with_text_data.rst
+
+|
+
 .. note:: **Doctest Mode**
 
    The code-examples in the above tutorials are written in a

diff --git a/doc/tutorial/text_analytics/.gitignore b/doc/tutorial/text_analytics/.gitignore
@@ -0,0 +1,25 @@
+# cruft
+.*.swp
+*.pyc
+.DS_Store
+*.pdf
+
+# folder to be used for working on the exercises
+workspace
+
+# output of the sphinx build of the documentation
+tutorial/_build
+
+# datasets to be fetched from the web and cached locally
+data/twenty_newsgroups/20news-bydate.tar.gz
+data/twenty_newsgroups/20news-bydate-train
+data/twenty_newsgroups/20news-bydate-test
+
+data/movie_reviews/txt_sentoken
+data/movie_reviews/poldata.README.2.0
+
+data/languages/paragraphs
+data/languages/short_paragraphs
+data/languages/html
+
+data/labeled_faces_wild/lfw_preprocessed/
diff --git a/doc/tutorial/text_analytics/data/languages/fetch_data.py b/doc/tutorial/text_analytics/data/languages/fetch_data.py
@@ -0,0 +1,130 @@
+
+# Simple Python script to collect text paragraphs written in various languages
+# on the same topic, namely the Wikipedia article about Wikipedia itself.
+#
+# For each language in ``pages`` the script:
+#   1. downloads the article once and caches it as ``html/<lang>.html``;
+#   2. writes every paragraph of at least 100 characters to
+#      ``paragraphs/<lang>/<lang>_NNNN.txt``;
+#   3. splits those paragraphs into groups of about
+#      ``n_words_per_short_text`` words and writes at most
+#      ``max_short_texts`` of them to ``short_paragraphs/<lang>/``.
+
+import os
+try:
+    # Python 2 compat
+    from urllib2 import Request, build_opener
+except ImportError:
+    # Python 3
+    from urllib.request import Request, build_opener
+
+import lxml.html
+from lxml.etree import ElementTree
+import numpy as np
+
+pages = {
+    u'ar': u'http://ar.wikipedia.org/wiki/%D9%88%D9%8A%D9%83%D9%8A%D8%A8%D9%8A%D8%AF%D9%8A%D8%A7',
+    u'de': u'http://de.wikipedia.org/wiki/Wikipedia',
+    u'en': u'http://en.wikipedia.org/wiki/Wikipedia',
+    u'es': u'http://es.wikipedia.org/wiki/Wikipedia',
+    u'fr': u'http://fr.wikipedia.org/wiki/Wikip%C3%A9dia',
+    u'it': u'http://it.wikipedia.org/wiki/Wikipedia',
+    u'ja': u'http://ja.wikipedia.org/wiki/Wikipedia',
+    u'nl': u'http://nl.wikipedia.org/wiki/Wikipedia',
+    u'pl': u'http://pl.wikipedia.org/wiki/Wikipedia',
+    u'pt': u'http://pt.wikipedia.org/wiki/Wikip%C3%A9dia',
+    u'ru': u'http://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F',
+#    u'zh': u'http://zh.wikipedia.org/wiki/Wikipedia',
+}
+
+html_folder = u'html'
+text_folder = u'paragraphs'
+short_text_folder = u'short_paragraphs'
+n_words_per_short_text = 5
+# upper bound on the number of short texts written for each language
+max_short_texts = 1000
+
+
+if not os.path.exists(html_folder):
+    os.makedirs(html_folder)
+
+for lang, page in pages.items():
+
+    text_lang_folder = os.path.join(text_folder, lang)
+    if not os.path.exists(text_lang_folder):
+        os.makedirs(text_lang_folder)
+
+    short_text_lang_folder = os.path.join(short_text_folder, lang)
+    if not os.path.exists(short_text_lang_folder):
+        os.makedirs(short_text_lang_folder)
+
+    html_filename = os.path.join(html_folder, lang + '.html')
+    if not os.path.exists(html_filename):
+        print("Downloading %s" % page)
+        request = Request(page)
+        # change the User Agent to avoid being blocked by Wikipedia:
+        # downloading a couple of articles once should not be abusive
+        request.add_header('User-Agent', 'OpenAnything/1.0')
+        response = build_opener().open(request)
+        try:
+            html_content = response.read()
+        finally:
+            response.close()
+        # write the cache only after a complete read so that a failed
+        # download does not leave an empty file behind
+        with open(html_filename, 'wb') as html_file:
+            html_file.write(html_content)
+
+    # decode the payload explicitly as UTF-8 since lxml is confused for some
+    # reason; read raw bytes so that the result does not depend on the
+    # platform default encoding under Python 3
+    with open(html_filename, 'rb') as html_file:
+        html_content = html_file.read().decode('utf-8')
+    tree = ElementTree(lxml.html.document_fromstring(html_content))
+    i = 0  # index of the next full paragraph file
+    j = 0  # index of the next short paragraph file
+    for p in tree.findall('//p'):
+        content = p.text_content()
+        if len(content) < 100:
+            # skip paragraphs that are too short - probably too noisy and not
+            # representative of the actual language
+            continue
+
+        text_filename = os.path.join(text_lang_folder,
+                                     '%s_%04d.txt' % (lang, i))
+        print("Writing %s" % text_filename)
+        with open(text_filename, 'wb') as text_file:
+            text_file.write(content.encode('utf-8', 'ignore'))
+        i += 1
+
+        # split the paragraph into fake smaller paragraphs to make the
+        # problem harder e.g. more similar to tweets
+        if lang in ('zh', 'ja'):
+            # FIXME: whitespace tokenizing does not work on chinese and
+            # japanese
+            continue
+        if j >= max_short_texts:
+            # quota reached: keep writing full paragraphs only (a ``break``
+            # in the inner loop alone would let every later paragraph add
+            # one more short text)
+            continue
+        words = content.split()
+        # floor division keeps the number of groups an integer on both
+        # Python 2 and Python 3
+        n_groups = len(words) // n_words_per_short_text
+        if n_groups < 1:
+            continue
+        groups = np.array_split(words, n_groups)
+
+        for group in groups:
+            small_content = u" ".join(group)
+
+            short_text_filename = os.path.join(short_text_lang_folder,
+                                               '%s_%04d.txt' % (lang, j))
+            print("Writing %s" % short_text_filename)
+            with open(short_text_filename, 'wb') as short_text_file:
+                short_text_file.write(small_content.encode('utf-8', 'ignore'))
+            j += 1
+            if j >= max_short_texts:
+                break
+
diff --git a/doc/tutorial/text_analytics/data/movie_reviews/fetch_data.py b/doc/tutorial/text_analytics/data/movie_reviews/fetch_data.py
@@ -0,0 +1,43 @@
+"""Script to download the movie review dataset.
+
+Fetch the polarity archive from ``URL`` unless ``DATA_FOLDER`` already
+exists, extract it into the current directory and delete the archive.
+"""
+
+import os
+import tarfile
+from contextlib import closing
+try:
+    # Python 2
+    from urllib import urlopen
+except ImportError:
+    # Python 3
+    from urllib.request import urlopen
+
+
+URL = ("http://www.cs.cornell.edu/people/pabo/"
+       "movie-review-data/review_polarity.tar.gz")
+
+ARCHIVE_NAME = URL.rsplit('/', 1)[1]
+DATA_FOLDER = "txt_sentoken"
+
+
+if not os.path.exists(DATA_FOLDER):
+
+    if not os.path.exists(ARCHIVE_NAME):
+        print("Downloading dataset from %s (3 MB)" % URL)
+        # closing() is used because the Python 2 response object is not a
+        # context manager
+        with closing(urlopen(URL)) as response:
+            payload = response.read()
+        # write the archive only after a complete read so that a failed
+        # download does not leave a truncated file that blocks later runs
+        with open(ARCHIVE_NAME, 'wb') as archive_file:
+            archive_file.write(payload)
+
+    print("Decompressing %s" % ARCHIVE_NAME)
+    # NOTE(review): extractall() trusts the member paths stored in the
+    # archive; only run this against the trusted URL above
+    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
+        archive.extractall(path='.')
+    os.remove(ARCHIVE_NAME)