# Importing our preprocessed data

In [28]:
import pandas as pd
import numpy as np

# Dataset URL:
# https://www.kaggle.com/datasets/athu1105/book-genre-prediction

# Read the data into dataframe
df = pd.read_csv('../data/book_genre_dataset.csv')

# Create a column with the combined title and summary
df['combined'] = df['title'] + '. ' + df['summary']

# Feature engineering

In [29]:
from sklearn.base import TransformerMixin, BaseEstimator

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.loc[:,self.variables]

In [30]:
import langdetect

class LangDetection(BaseEstimator, TransformerMixin):
    def __init__(self, lang='en'):
        self.lang = lang
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_ = X.copy()
        X_['lang'] = X_.apply(lambda x: langdetect.detect(x))
        X_lang_only = X[X_['lang'] == self.lang]
        return X_lang_only

In [31]:
class LowercaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_lower = X.apply(lambda x: x.lower())
        return X_lower

In [32]:
import re

class RemovePunctuation(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_no_punct = X.apply(lambda x: re.sub(r'[^\w\s]|_', '', x))
        return X_no_punct

In [33]:
class DropDataEntries(BaseEstimator, TransformerMixin):
    def __init__(self, ids):
        self.ids = ids
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_ = X.copy()
        for i in self.ids:
            X_ = X_.drop(i)
        return X_     

In [34]:
from sklearn.pipeline import Pipeline

nonsense_summaries_idx = [338, 588, 834, 1574, 1772, 2410, 2485]

clean_text_pipeline = Pipeline([
    ('get_combined_text', FeatureSelector('combined')),
    ('detect_lang', LangDetection()),
    ('lowercase',    LowercaseTransformer()),
    ('remove_punctuation', RemovePunctuation()),
    ('drop_nonsense_summaries', DropDataEntries(nonsense_summaries_idx))
])

In [35]:
clean_text_pipeline

In [36]:
X_combined_clean = clean_text_pipeline.fit_transform(df)

#'338' in X_combined_clean
df.drop([i for i in range(len(df)) if i not in X_combined_clean])
X = df[['title','summary','combined']]
y = df['genre']
X.shape, y.shape

((4657, 3), (4657,))

In [37]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

class RemoveStopwords(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords):
        self.stopwords = stopwords
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_no_stopwords = X.apply(lambda x: ' '.join([w for w in x.split() if w not in self.stopwords]))
        return X_no_stopwords

[nltk_data] Downloading package stopwords to /home/mia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
from nltk.stem import WordNetLemmatizer

class Lemmatizer(BaseEstimator, TransformerMixin):
    def __init__(self, lemmatizer):
        self.lemmatizer = lemmatizer
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        lemmatize_func = lambda x: ' '.join(
            [self.lemmatizer.lemmatize(w) for w in x.split()]
        )
        X_lemmatized = X.apply(lemmatize_func)
        return X_lemmatized

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import DenseTransformer

preprocess_text_pipeline = Pipeline([
    ('remove_stopwords', RemoveStopwords(stop_words)),
    ('lemmatize', Lemmatizer(WordNetLemmatizer())),
    ('tfidf', TfidfVectorizer()),
    ('select_k_best', SelectKBest(k=5000))
])

In [41]:
from sklearn.pipeline import FeatureUnion

text_pipeline = Pipeline([
    ('select_combined', FeatureSelector('combined')),
    ('preprocess', preprocess_text_pipeline),
])

In [42]:
def stopword_count(text):
    stopwords_in_text = [w for w in text.split() if w in stop_words]
    return len(stopwords_in_text)

In [43]:
class TitleSummaryFE(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_ = X.copy()
        feats = pd.DataFrame()
        feats['char_count'] = X_.apply(lambda x: len(x))
        feats['word_count'] = X_.apply(lambda x: len(x.split()))
        feats['avg_word_len'] = feats['char_count'] / feats['word_count']
        feats['stopword_count'] = X_.apply(stopword_count)
        return feats

In [44]:
from sklearn.preprocessing import FunctionTransformer

title_features = Pipeline([
    ('select_title', FeatureSelector('title')),
    ('title_features', TitleSummaryFE()),
])

In [45]:
summary_features = Pipeline([
    ('select_summary', FeatureSelector('summary')),
    ('summary_features', TitleSummaryFE()),
])

In [46]:
from nltk.sentiment import SentimentIntensityAnalyzer

class CombinedFE(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        feats = pd.DataFrame()
        sia = SentimentIntensityAnalyzer()
        X_ = X.copy()
        feats['unique_word_count'] = X_.apply(lambda x: len(set(x.split())))
        feats['unique_word_ratio'] = feats['unique_word_count'] / X_.apply(lambda x: len(x.split()))
        feats['sentiment_score'] = X_.apply(lambda x: sia.polarity_scores(x)['compound'])
        return feats

In [47]:
combined_features = Pipeline([
    ('select_combined', FeatureSelector('combined')),
    ('combined_features', CombinedFE()),
])

In [48]:
features = FeatureUnion([
    ('text', text_pipeline),
    ('title', title_features),
    ('summary', summary_features),
    ('combined', combined_features)
])

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

def run_experiment(X, y, pipeline, num_expts=5):
    scores = list()
    for i in range(num_expts):
        X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2)
        model = pipeline.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_true, y_pred)
        scores.append(score)
    
    print(f"Average accuracy over {num_expts} experiments: {sum(scores) / num_expts} \n")
    print("Classification report for the last experiment:\n")
    print(classification_report(y_true, y_pred))

In [50]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

def model_pipeline(model):
    pipeline = Pipeline([
        ('features', features),
        ('model', model)
    ])
    return pipeline

In [51]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

svc = LinearSVC(dual=False)
knn = KNeighborsClassifier()
dtree = DecisionTreeClassifier()

run_experiment(X, y, model_pipeline(svc), num_expts=1)
run_experiment(X, y, model_pipeline(knn), num_expts=1)
run_experiment(X, y, model_pipeline(dtree), num_expts=1)

Average accuracy over 1 experiments: 0.6706008583690987 

Classification report for the last experiment:

              precision    recall  f1-score   support

       crime       0.71      0.66      0.69       106
     fantasy       0.66      0.71      0.69       168
     history       0.67      0.80      0.73       137
      horror       0.83      0.48      0.61       119
  psychology       0.68      0.57      0.62        23
     romance       0.27      0.13      0.18        23
     science       0.67      0.77      0.72       120
      sports       0.93      0.72      0.81        18
    thriller       0.60      0.71      0.65       196
      travel       0.90      0.41      0.56        22

    accuracy                           0.67       932
   macro avg       0.69      0.60      0.62       932
weighted avg       0.68      0.67      0.66       932

Average accuracy over 1 experiments: 0.20493562231759657 

Classification report for the last experiment:

              precision    r

In [63]:
from sklearn.model_selection import GridSearchCV

svc = LinearSVC(dual=False)

params = {
    # tf-idf params
    "features__text__preprocess__tfidf__max_df" : [0.6, 0.8, 1.0],
    "features__text__preprocess__tfidf__min_df" : [1, 3, 5],
    "features__text__preprocess__tfidf__ngram_range" : [(1, 1), (1, 2)],
    "features__text__preprocess__tfidf__norm" : ["l1", "l2"],    
    # select k-best params
    "features__text__preprocess__select_k_best__k" : [100,1000,5000],
    # svc model params
    "model__C" : [0.1, 1],
}

grid = GridSearchCV(
    estimator = model_pipeline(svc),
    param_grid = params,
    cv = 3,
    verbose = 2
)

In [64]:
X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  25.7s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  31.7s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  32.3s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__t

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  26.8s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  27.2s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  27.4s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, fe

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  16.7s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  17.3s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  17.4s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, fe

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  18.9s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  18.9s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  18.8s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, fe

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  29.1s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  27.5s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  26.6s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, fe

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  18.9s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  17.1s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  16.7s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, fe

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  16.9s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  17.4s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  17.7s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, fe

[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  17.4s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  17.5s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  17.8s
[CV] END features__text__preprocess__select_k_best__k=100, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, fe

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  19.1s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  19.1s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  17.2s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, 

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  19.8s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  19.9s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  19.9s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, fe

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  20.3s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  20.4s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  20.1s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=1, 

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  20.6s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  20.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  20.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  17.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  17.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  17.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  18.4s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  16.3s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  16.3s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3

[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  16.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  16.5s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  16.9s
[CV] END features__text__preprocess__select_k_best__k=1000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, 

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  21.5s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  21.6s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  21.4s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=1, fe

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  21.3s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  21.2s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  21.2s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=3, 

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  22.3s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  21.9s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  22.7s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.6, features__text__preprocess__tfidf__min_df=5

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  20.4s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  20.2s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  17.7s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  20.6s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  21.3s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  21.6s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=0.8, features__text__preprocess__tfidf__min_df=5

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  21.5s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 1), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  21.4s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=0.1; total time=  22.4s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=1, 

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  21.3s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  21.4s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l1, model__C=1; total time=  21.4s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=3, fe

[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=0.1; total time=  22.5s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  23.0s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, features__text__preprocess__tfidf__ngram_range=(1, 2), features__text__preprocess__tfidf__norm=l2, model__C=1; total time=  23.0s
[CV] END features__text__preprocess__select_k_best__k=5000, features__text__preprocess__tfidf__max_df=1.0, features__text__preprocess__tfidf__min_df=5, 

In [66]:
grid.best_params_, grid.best_score_

({'features__text__preprocess__select_k_best__k': 5000,
  'features__text__preprocess__tfidf__max_df': 0.6,
  'features__text__preprocess__tfidf__min_df': 5,
  'features__text__preprocess__tfidf__ngram_range': (1, 1),
  'features__text__preprocess__tfidf__norm': 'l2',
  'model__C': 1},
 0.6953072319303386)