In [1]:
import os

from nltk.stem import PorterStemmer
import numpy as np
from sklearn import datasets, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
poems_dt = datasets.load_files(
    '/opt/data', description=None,
    load_content=True, encoding='utf-8', shuffle=True
)

In [3]:
x_train, x_test, y_train, y_test = train_test_split(
    poems_dt.data, poems_dt.target, test_size=0.2
)

In [47]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
}

## No preprocessing

### Best params

In [48]:
text_clf_bp = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf_bp, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [11]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


## Snowball  stemmer

In [12]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class SnowballStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [51]:
text_snow_clf_bp = Pipeline([
    ('vect', SnowballStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [13]:
text_clf = Pipeline([
    ('vect', SnowballStemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


## Porter's stemmer

In [14]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
class PorterStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [54]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [15]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


## WordNet lemmatizer with stop words

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([lemma.lemmatize(w) for w in analyzer(doc)])

### Best params

In [None]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

In [18]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
import os

from nltk.stem import PorterStemmer
import numpy as np
from sklearn import datasets, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
poems_dt = datasets.load_files(
    '/opt/data', description=None,
    load_content=True, encoding='utf-8', shuffle=True
)

In [3]:
x_train, x_test, y_train, y_test = train_test_split(
    poems_dt.data, poems_dt.target, test_size=0.2
)

In [47]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
}

## No preprocessing

### Best params

In [48]:
text_clf_bp = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf_bp, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [11]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


## Snowball  stemmer

In [12]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class SnowballStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [51]:
text_snow_clf_bp = Pipeline([
    ('vect', SnowballStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [13]:
text_clf = Pipeline([
    ('vect', SnowballStemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


## Porter's stemmer

In [14]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
class PorterStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [54]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [15]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))


## WordNet lemmatizer with stop words

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([lemma.lemmatize(w) for w in analyzer(doc)])

### Best params

In [None]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

In [18]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.00      0.00      0.00       142
   1781-1900       0.50      0.00      0.01       233
   1901-1950       0.00      0.00      0.00       212
1951-present       0.78      1.00      0.88      2066

    accuracy                           0.78      2653
   macro avg       0.32      0.25      0.22      2653
weighted avg       0.65      0.78      0.68      2653



  _warn_prf(average, modifier, msg_start, len(result))
