# Introduction
This is a very rough first draft at importing and cleansing the data. Solution if heavily inspired by (okay... Completely ripped off) from https://gist.github.com/mbforbes/cee3fd5bb3a797b059524fe8c8ccdc2b


## Getting the content
Start by downloading the repository of (english) books. This is done in bash. Only tested on Ubuntu, but mac should work the same

```
wget -m -H -nd "http://www.gutenberg.org/robot/harvest?filetypes[]=txt&langs[]=en"

                http://www.gutenberg.org/robot/harvest?offset=40532&filetypes[]=txt&langs[]=en
```
Takes a few hours to run, and is stored in a folder called rawContent. 
This is then copied to another folder, and we can start to clean up the mess

First we delete some dublications of the same books:
```
ls | grep "\-8.zip" | xargs rm
ls | grep "\-0.zip" | xargs rm
```
We can then unzip the files, and remove the zip files
```
unzip "*zip"
rm *.zip
```

Next we take care of some nested foldering
```
mv */*.txt ./
```
And finally, we remove all rubbish that isn't a real book:

```
ls | grep -v "\.txt" | xargs rm -rf
```


# Start the data cleansing

## Start with all imports at one place

In [1]:
from __future__ import absolute_import
from builtins import str
import os
from six import u

from os import listdir
from os.path import isfile, join

import nltk
import re
from operator import itemgetter    
import pandas as pd
from functools import reduce

## Next we define some constants
Much more are probably needed. only been testing on a few books at a time

In [2]:
file_path = "processedData"

TEXT_START_MARKERS = frozenset((u(_) for _ in (
    "*END*THE SMALL PRINT",
    "*** START OF THE PROJECT GUTENBERG",
    "*** START OF THIS PROJECT GUTENBERG",
    "This etext was prepared by",
    "E-text prepared by",
    "Produced by",
    "Distributed Proofreading Team",
    "Proofreading Team at http://www.pgdp.net",
    "http://gallica.bnf.fr)",
    "      http://archive.org/details/",
    "http://www.pgdp.net",
    "by The Internet Archive)",
    "by The Internet Archive/Canadian Libraries",
    "by The Internet Archive/American Libraries",
    "public domain material from the Internet Archive",
    "Internet Archive)",
    "Internet Archive/Canadian Libraries",
    "Internet Archive/American Libraries",
    "material from the Google Print project",
    "*END THE SMALL PRINT",
    "***START OF THE PROJECT GUTENBERG",
    "This etext was produced by",
    "*** START OF THE COPYRIGHTED",
    "The Project Gutenberg",
    "http://gutenberg.spiegel.de/ erreichbar.",
    "Project Runeberg publishes",
    "Beginning of this Project Gutenberg",
    "Project Gutenberg Online Distributed",
    "Gutenberg Online Distributed",
    "the Project Gutenberg Online Distributed",
    "Project Gutenberg TEI",
    "This eBook was prepared by",
    "http://gutenberg2000.de erreichbar.",
    "This Etext was prepared by",
    "This Project Gutenberg Etext was prepared by",
    "Gutenberg Distributed Proofreaders",
    "Project Gutenberg Distributed Proofreaders",
    "the Project Gutenberg Online Distributed Proofreading Team",
    "**The Project Gutenberg",
    "*SMALL PRINT!",
    "More information about this book is at the top of this file.",
    "tells you about restrictions in how the file may be used.",
    "l'authorization à les utilizer pour preparer ce texte.",
    "of the etext through OCR.",
    "*****These eBooks Were Prepared By Thousands of Volunteers!*****",
    "We need your donations more than ever!",
    " *** START OF THIS PROJECT GUTENBERG",
    "****     SMALL PRINT!",
    '["Small Print" V.',
    '      (http://www.ibiblio.org/gutenberg/',
    'and the Project Gutenberg Online Distributed Proofreading Team',
    'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading',
    '                this Project Gutenberg edition.',
)))


TEXT_END_MARKERS = frozenset((u(_) for _ in (
    "*** END OF THE PROJECT GUTENBERG",
    "*** END OF THIS PROJECT GUTENBERG",
    "***END OF THE PROJECT GUTENBERG",
    "End of the Project Gutenberg",
    "End of The Project Gutenberg",
    "Ende dieses Project Gutenberg",
    "by Project Gutenberg",
    "End of Project Gutenberg",
    "End of this Project Gutenberg",
    "Ende dieses Projekt Gutenberg",
    "        ***END OF THE PROJECT GUTENBERG",
    "*** END OF THE COPYRIGHTED",
    "End of this is COPYRIGHTED",
    "Ende dieses Etextes ",
    "Ende dieses Project Gutenber",
    "Ende diese Project Gutenberg",
    "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**",
    "Fin de Project Gutenberg",
    "The Project Gutenberg Etext of ",
    "Ce document fut presente en lecture",
    "Ce document fut présenté en lecture",
    "More information about this book is at the top of this file.",
    "We need your donations more than ever!",
    "END OF PROJECT GUTENBERG",
    " End of the Project Gutenberg",
    " *** END OF THIS PROJECT GUTENBERG",
)))


LEGALESE_START_MARKERS = frozenset((u(_) for _ in (
    "<<THIS ELECTRONIC VERSION OF",
)))


LEGALESE_END_MARKERS = frozenset((u(_) for _ in (
    "SERVICE THAT CHARGES FOR DOWNLOAD",
)))

TITLE_MARKERS = frozenset((u(_) for _ in (
    "Title:",
)))

AUTHOR_MARKERS = frozenset((u(_) for _ in (
    "Author:",
)))
DATE_MARKERS = frozenset((u(_) for _ in (
    "Release Date:","Release Date:"
)))
LANGUAGE_MARKERS = frozenset((u(_) for _ in (
    "Language:",
)))
ENCODING_MARKERS = frozenset((u(_) for _ in (
    "Character set encoding:",
)))


# Define functions for preprocessing

## Read a single file

In [3]:
def read_file(file_name):
    file = open(file_name, encoding="ISO-8859-1")
    file_content = file.read()

    lines = file_content.splitlines()
    sep = str(os.linesep)

    # Initialize results for single book
    content_lines = []
    i = 0
    footer_found = False
    ignore_section = False

    title = ""
    author = ""
    date = ""
    language = ""
    encoding = ""
    year = 0

    # Reset flags for each book
    title_found = False
    author_found = False
    date_found = False
    language_found = False
    encoding_found = False

    for line in lines:
            reset = False

            #print(line)
            if i <= 600:
                # Shamelessly stolen
                if any(line.startswith(token) for token in TEXT_START_MARKERS):
                    reset = True

                # Extract Metadata
                if title_found == False:
                    if any(line.startswith(token) for token in TITLE_MARKERS):
                        title_found = True
                        title = line
                if author_found == False:
                    if any(line.startswith(token) for token in AUTHOR_MARKERS):
                        author_found = True
                        author = line
                if date_found == False:
                    if any(line.startswith(token) for token in DATE_MARKERS):
                        date_found = True
                        date = line
                        year = int(re.findall(r'\d{4}', date)[0])
                if language_found == False:
                    if any(line.startswith(token) for token in LANGUAGE_MARKERS):
                        language_found = True
                        language = line
                if encoding_found == False:
                    if any(line.startswith(token) for token in ENCODING_MARKERS):
                        encoding_found = True
                        encoding = line

                # More theft from above
                if reset:
                    content_lines = []
                    continue

            # I feel like a criminal by now. Guess what? Also stolen
            if i >= 100:
                if any(line.startswith(token) for token in TEXT_END_MARKERS):
                    footer_found = True

                if footer_found:
                    break

            if any(line.startswith(token) for token in LEGALESE_START_MARKERS):
                ignore_section = True
                continue
            elif any(line.startswith(token) for token in LEGALESE_END_MARKERS):
                ignore_section = False
                continue

            if not ignore_section:
                if line != "": # Screw the blank lines
                    content_lines.append(line.rstrip(sep))
                i += 1

            sep.join(content_lines)

    # Do more cleaning
    for token in TITLE_MARKERS:
        title = title.replace(token, '').lstrip().rstrip()
    for token in AUTHOR_MARKERS:
        author = author.replace(token, '').lstrip().rstrip()
    for token in LANGUAGE_MARKERS:
        language = language.replace(token, '').lstrip().rstrip()
    for token in DATE_MARKERS:
        date = date.replace(token, '').lstrip().rstrip()
    for token in ENCODING_MARKERS:
        encoding = encoding.replace(token, '').lstrip().rstrip()
    return title, author, date, year, language, encoding, content_lines
    

## Return list of all words
Currently quite an empty function. However, I assume that some cleaning of cases etc. will be done here

In [4]:
def get_words(content_lines):
    all_text_lower = " ".join(content_lines).lower()
    words = re.findall(r'(\b[A-Za-z][a-z]{2,9}\b)', all_text_lower)

    # Do more cleansing. E.g. cases and stuff
    
    return words

## First attempt at actually creating statistics
Currently only a simple counting

In [5]:
def get_word_frequencies(words):
    frequency = {}
    for word in words:
        count = frequency.get(word,0)
        frequency[word] = count + 1

    word_count = len(words)
    unique_word_count = 0
    word_list = []
    word_list_count = []
    for key, value in reversed(sorted(frequency.items(), key = itemgetter(1))):
        word_list.append(key)
        word_list_count.append(value)
        unique_word_count = unique_word_count + 1
    
    word_list_freq = [freq / word_count for freq in word_list_count]
    
    word_freq = pd.DataFrame(list(zip(word_list, word_list_count, word_list_freq))
                             , columns = ['Word', 'count', 'freq'])
    
    word_freq['rank'] = word_freq['count'].rank(ascending = False, method="dense")

    return(word_freq, unique_word_count)

# Read all files, and do preprocessing
Well... Only five files currently

In [19]:
import random
    
# Get all filenames
files = [f for f in listdir(file_path) if isfile(join(file_path, f))]
files = list(filter(lambda file: file[0].isdigit(), files))
random.shuffle(files)


# Do only subset
files = files[0:500]

list_of_file = []
list_of_title = []
list_of_author = []
list_of_date = []
list_of_year = []
list_of_language = []
list_of_encoding = []
list_of_word_count = []
list_of_unique_word_count = []
list_of_word_frequencies = []
iter_ = 0

for file in files:
    print("FILENAME:" + file)
    # Read in basic information from file
    title, author, date, year, language, encoding, content_lines = read_file(file_path + "/" + file)
    line_count = len(content_lines)

    # Not sure if we want this for later:
    #content_all = " ".join(content_lines)
    
    # Split into words (and do various cleaning)
    #words = get_words(content_lines)
    #word_count = len(words)

    # First analysis, but should do something proper
    #word_frequencies_table, unique_word_count = get_word_frequencies(words)

    
    # Append to results
    list_of_file.append(file)
    list_of_title.append(title)
    list_of_author.append(author)
    list_of_date.append(date)
    list_of_year.append(year)
    list_of_language.append(language)
    list_of_encoding.append(encoding)
    #list_of_word_count.append(word_count)
    #list_of_unique_word_count.append(unique_word_count)
    #list_of_word_frequencies.append(word_frequencies_table)
    
    
    # Show basic information
    print(iter_)
    iter_ = iter_ + 1
    #print("################################")
    #print("################################")
    #print("Filename: " + str(file))
    #print("Title: " + str(title))
    #print("Author(s): " + str(author))
    #print("Date: " + str(date))
    #print("Year: " + str(year))
    #print("Language: " + str(language))
    #print("Encoding: " + str(encoding))
    #print("################################")
    #print("Words in book: " + str(word_count))
    #print("Unique words in book: " + str(unique_word_count))
    #print("################################")
    #print(word_frequencies_table)

# Feel free to change to dict? list? separate files?
## nested dataframes works, but looks super ungly when printing
### Fuck it - This is tooo useless killing it again
#all_res = pd.DataFrame(list(zip(list_of_file
#                                , list_of_title
#                                , list_of_author
#                                , list_of_date
#                                , list_of_language
#                                , list_of_encoding
#                                , list_of_word_count
#                                , list_of_unique_word_count
#                                , list_of_word_frequencies
#                                ))
#                             , columns = ['file'
#                                          , 'title'
#                                          , 'author'
#                                          , 'date'
#                                          , 'language'
#                                          , 'encoding'
#                                          , 'word_count'
#                                          , 'unique_word_count'
#                                          , 'word_frequencies'
#                                         ]
#                      )
                

FILENAME:5403.txt
0
FILENAME:34934.txt
1
FILENAME:31991.txt
2
FILENAME:18587.txt
3
FILENAME:28091.txt
4
FILENAME:29983.txt
5
FILENAME:29614.txt
6
FILENAME:12038.txt
7
FILENAME:9212.txt
8
FILENAME:15829.txt
9
FILENAME:668.txt
10
FILENAME:60698.txt
11
FILENAME:15709.txt
12
FILENAME:60309.txt
13
FILENAME:33464.txt
14
FILENAME:37557.txt
15
FILENAME:3926.txt
16
FILENAME:11261.txt
17
FILENAME:36538.txt
18
FILENAME:25747.txt
19
FILENAME:37429.txt
20
FILENAME:5120.txt
21
FILENAME:40675.txt
22
FILENAME:11219.txt
23
FILENAME:22236.txt
24
FILENAME:22211.txt
25
FILENAME:32706.txt
26
FILENAME:4979.txt
27
FILENAME:7513.txt
28
FILENAME:28442.txt
29
FILENAME:39918.txt
30
FILENAME:46846.txt
31
FILENAME:10873.txt
32
FILENAME:31790.txt
33
FILENAME:36585.txt
34
FILENAME:60846.txt
35
FILENAME:9591.txt
36
FILENAME:28156.txt
37
FILENAME:18309.txt
38
FILENAME:45842.txt
39
FILENAME:31752.txt
40
FILENAME:41698.txt
41
FILENAME:13634.txt
42
FILENAME:11921.txt
43
FILENAME:35545.txt
44
FILENAME:17866.txt
45
FILENAM

364
FILENAME:18579.txt
365
FILENAME:10801.txt
366
FILENAME:35396.txt
367
FILENAME:38936.txt
368
FILENAME:64012.txt
369
FILENAME:43530.txt
370
FILENAME:5534.txt
371
FILENAME:26090.txt
372
FILENAME:10800.txt
373
FILENAME:10121.txt
374
FILENAME:15919.txt
375
FILENAME:46889.txt
376
FILENAME:37458.txt
377
FILENAME:15145.txt
378
FILENAME:8378.txt
379
FILENAME:9373.txt
380
FILENAME:2193.txt
381
FILENAME:38534.txt
382
FILENAME:44724.txt
383
FILENAME:20070.txt
384
FILENAME:44089.txt
385
FILENAME:4057.txt
386
FILENAME:286.txt
387
FILENAME:14301.txt
388
FILENAME:47006.txt
389
FILENAME:46437.txt
390
FILENAME:40404.txt
391
FILENAME:5296.txt
392
FILENAME:39742.txt
393
FILENAME:17125.txt
394
FILENAME:28391.txt
395
FILENAME:26119.txt
396
FILENAME:18421.txt
397
FILENAME:31397.txt
398
FILENAME:28226.txt
399
FILENAME:38869.txt
400
FILENAME:25659.txt
401
FILENAME:46581.txt
402
FILENAME:23432.txt
403
FILENAME:13587.txt
404
FILENAME:32356.txt
405
FILENAME:42641.txt
406
FILENAME:7497.txt
407
FILENAME:13242.t

# Compare Word ranking between titles

In [7]:
list_count= []
list_freq = []
list_rank = []


col_names = list_of_title.copy()
col_names.insert(0,'Word')



for df in list_of_word_frequencies:
    list_count.append(df[['Word', 'count']])
    list_freq.append(df[['Word', 'freq']])
    list_rank.append(df[['Word', 'rank']])
    
df_count = reduce(lambda left, right: pd.merge(left, right, on="Word", how='outer'), list_count)
df_count.columns = col_names
df_count['Sum'] = df_count.drop('Word', axis=1).apply(lambda x: x.sum(), axis=1)
df_count = df_count.sort_values(ascending = False, by=['Sum'])

df_freq = reduce(lambda left, right: pd.merge(left, right, on="Word", how='outer'), list_freq)
df_freq.columns = col_names
df_freq['Avg'] = df_freq.drop('Word', axis=1).apply(lambda x: x.mean(), axis=1)
df_freq = df_freq.sort_values(ascending = False, by=['Avg'])

df_rank = reduce(lambda left, right: pd.merge(left, right, on="Word", how='outer'), list_rank)
df_rank.columns = col_names
df_rank['Avg'] = df_rank.drop('Word', axis=1).apply(lambda x: x.mean(), axis=1)
df_rank = df_rank.sort_values(by=['Avg'])


In [8]:
pd.set_option('display.max_rows', None)
df_rank.head(30)

Unnamed: 0,Word,The Simpkins Plot,The Expositor's Bible: The Pastoral Epistles,A Lecture on the Study of History,Antwerp to Gallipoli,Heroines of Mormondom,Avg
0,the,1.0,1.0,1.0,1.0,1.0,1.0
2,and,3.0,2.0,2.0,2.0,2.0,2.2
3,that,4.0,3.0,3.0,5.0,10.0,5.0
6,meldon,7.0,,,,,7.0
5,was,6.0,11.0,19.0,3.0,3.0,8.4
7,for,8.0,6.0,11.0,10.0,7.0,8.4
11,with,12.0,12.0,17.0,4.0,11.0,11.2
14,his,15.0,7.0,15.0,14.0,9.0,12.0
15,not,16.0,5.0,12.0,15.0,17.0,13.0
13,but,14.0,9.0,21.0,18.0,13.0,15.0


In [9]:
#df_freq['Avg'] = df_freq.drop('Word', axis=1).apply(lambda x: x.mean(), axis=1)
df_freq = df_freq.sort_values(ascending = False, by=['Avg'])

df_freq.head(20)

Unnamed: 0,Word,The Simpkins Plot,The Expositor's Bible: The Pastoral Epistles,A Lecture on the Study of History,Antwerp to Gallipoli,Heroines of Mormondom,Avg
0,the,0.064162,0.094727,0.0635,0.088397,0.079708,0.078099
2,and,0.029723,0.045328,0.030223,0.051238,0.047835,0.040869
3,that,0.026043,0.021651,0.012532,0.01207,0.010768,0.016613
5,was,0.013752,0.008932,0.004528,0.013272,0.020509,0.012199
6,meldon,0.0116,,,,,0.0116
1,you,0.035737,0.001204,0.001211,0.004967,0.006223,0.009868
7,for,0.011337,0.010683,0.006318,0.008099,0.011364,0.00956
11,with,0.009053,0.008472,0.004949,0.012149,0.009307,0.008786
14,his,0.008232,0.0103,0.005581,0.005932,0.010877,0.008184
15,not,0.008067,0.012599,0.006213,0.005742,0.006494,0.007823


In [10]:
df_count.head(20)

Unnamed: 0,Word,The Simpkins Plot,The Expositor's Bible: The Pastoral Epistles,A Lecture on the Study of History,Antwerp to Gallipoli,Heroines of Mormondom,Sum
0,the,3905.0,8654.0,1206.0,5588.0,1473.0,20826.0
2,and,1809.0,4141.0,574.0,3239.0,884.0,10647.0
3,that,1585.0,1978.0,238.0,763.0,199.0,4763.0
5,was,837.0,816.0,86.0,839.0,379.0,2957.0
1,you,2175.0,110.0,23.0,314.0,115.0,2737.0
7,for,690.0,976.0,120.0,512.0,210.0,2508.0
11,with,551.0,774.0,94.0,768.0,172.0,2359.0
15,not,491.0,1151.0,118.0,363.0,120.0,2243.0
14,his,501.0,941.0,106.0,375.0,201.0,2124.0
38,which,221.0,1222.0,132.0,298.0,83.0,1956.0


# I tried something completely different
This definately needs some proper refactoring, but Was curious whether we get anything decent from reading a bunch of random books in

Requires an additional folder "decades" in the root directory

## Edit:
Don't know which file it failed at, and too tired to care right now... Got a small GB before crashing; should be plenty for the first test

In [11]:
import math


# Get all filenames
files = [f for f in listdir(file_path) if isfile(join(file_path, f))]

# Do only subset
files = files[0:5000]

counter = 0
for file in files:
    counter = counter + 1
    #print(str(counter) + ": FILENAME:" + file)
    # Read in basic information from file
    title, author, date, year, language, encoding, content_lines = read_file(file_path + "/" + file)
    #line_count = len(content_lines)
    decade = math.floor(year / 10) * 10
    decade_file = "decades/" + str(decade) + ".txt"
    # Not sure if we want this for later:
    content_all = " ".join(content_lines)
    
    if os.path.exists(decade_file):
        append_write = 'a' # append if already exists
    else:
        append_write = 'w' # make a new file if not

    fileWriter = open(decade_file,append_write)
    fileWriter.write(content_all + '\n')
    fileWriter.close()

IndexError: list index out of range

In [15]:
# Get all filenames
files = [f for f in listdir("decades") if isfile(join("decades", f))]
print(files)
files.sort(reverse=True)


col_names = []
col_names.append("Word")

tables = []

for file_name in files:
    print(file_name)
    
    file = open("decades/" + file_name, encoding="ISO-8859-1")
    file_content = file.read()
    
    # Split into words (and do various cleaning)
    all_text_lower = file_content.lower()
    words = re.findall(r'(\b[A-Za-z][a-z]{2,9}\b)', all_text_lower)

    # First analysis, but should do something proper
    word_frequencies_table, unique_word_count = get_word_frequencies(words)
    tables.append(word_frequencies_table)
    col_names.append(file_name)


['00.txt', '0.txt', '2010.txt', '2000.txt', '2020.txt', '1990.txt']
2020.txt
2010.txt
2000.txt
1990.txt
00.txt
0.txt


In [16]:
list_count= []
list_freq = []
list_rank = []



for df in tables:
    #list_count.append(df[['Word', 'count']])
    #list_freq.append(df[['Word', 'freq']])
    list_rank.append(df[['Word', 'rank']])
    
#df_count = reduce(lambda left, right: pd.merge(left, right, on="Word", how='outer'), list_count)
#df_count.columns = col_names

#df_freq = reduce(lambda left, right: pd.merge(left, right, on="Word", how='outer'), list_freq)
#df_freq.columns = col_names

df_rank = reduce(lambda left, right: pd.merge(left, right, on="Word", how='outer'), list_rank)
df_rank.columns = col_names


In [17]:
df_rank.head(100)

Unnamed: 0,Word,2020.txt,2010.txt,2000.txt,1990.txt,00.txt,0.txt
0,the,1.0,1.0,1.0,1.0,1.0,1.0
1,and,2.0,2.0,2.0,2.0,2.0,2.0
2,that,3.0,3.0,3.0,3.0,4.0,3.0
3,was,4.0,4.0,4.0,4.0,23.0,5.0
4,you,5.0,9.0,8.0,5.0,163.0,19.0
5,with,6.0,6.0,6.0,7.0,3.0,6.0
6,for,7.0,7.0,7.0,11.0,13.0,8.0
7,his,8.0,5.0,5.0,6.0,5.0,4.0
8,not,9.0,10.0,12.0,12.0,11.0,9.0
9,had,10.0,8.0,9.0,10.0,40.0,14.0


Ngrams, Zipf's law... alright
Model idea: https://www.quora.com/Can-NLTK-be-used-for-multinomial-Naive-Bayes-classification

In [99]:
from sklearn.feature_extraction.text import TfidfTransformer

file_contents = []
targets = []

files = [f for f in listdir(file_path) if isfile(join(file_path, f))]
files = list(filter(lambda file: file[0].isdigit(), files))
random.shuffle(files)

targets_=['70','80','90','00','10']
iter_ = 0

for f in files[:120]:
    file = open("processedData/" + f, encoding="ISO-8859-1")
    file_contents.append(file.read())
    iter_ = iter_+1
    targets.append(targets_[iter_%5])

In [109]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectKBest, chi2

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('kbest', SelectKBest(chi2, k=100)),
    ('nb', MultinomialNB()),
])

parameters = {
    #'vect__max_df': [1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    #'clf__max_iter': (20),
    #'clf__alpha': (0.00001),
    #'clf__penalty': ('l2'),
    # 'clf__max_iter': (10, 50, 80),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=1)

grid_search.fit(file_contents, targets)
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.8s finished
