In [1]:
import os

from nltk.stem import PorterStemmer
import numpy as np
from sklearn import datasets, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
poems_dt = datasets.load_files(
    '/opt/data', description=None,
    load_content=True, encoding='utf-8', shuffle=True
)

In [3]:
x_train, x_test, y_train, y_test = train_test_split(
    poems_dt.data, poems_dt.target, test_size=0.2
)

In [4]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
    'clf__tol': (1e-2, 1e-3, 1e-4), 
}

## No preprocessing

### Best params

In [5]:
text_clf_bp = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
gs_clf = GridSearchCV(text_clf_bp, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 1e-06,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}

### Model evaluation

In [4]:
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(alpha=1e-06, tol=1e-02)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.78      0.77      0.77       150
   1781-1900       0.72      0.69      0.70       220
   1901-1950       0.69      0.18      0.28       229
1951-present       0.91      0.99      0.95      2054

    accuracy                           0.88      2653
   macro avg       0.77      0.66      0.68      2653
weighted avg       0.87      0.88      0.86      2653



## Snowball  stemmer

In [6]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class SnowballStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [8]:
text_snow_clf_bp = Pipeline([
    ('vect', SnowballStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 1e-06,
 'clf__tol': 0.001,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}

### Model evaluation

In [7]:
text_clf = Pipeline([
    ('vect', SnowballStemmedCountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(alpha=1e-06)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.76      0.83      0.79       150
   1781-1900       0.77      0.64      0.69       220
   1901-1950       0.63      0.20      0.30       229
1951-present       0.91      0.99      0.95      2054

    accuracy                           0.88      2653
   macro avg       0.77      0.66      0.69      2653
weighted avg       0.87      0.88      0.86      2653



## Porter's stemmer

In [9]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
class PorterStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [11]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 1e-06,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}

### Model evaluation

In [10]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(alpha=1e-06, tol=1e-02)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.75      0.82      0.78       150
   1781-1900       0.74      0.64      0.69       220
   1901-1950       0.56      0.20      0.29       229
1951-present       0.91      0.99      0.95      2054

    accuracy                           0.88      2653
   macro avg       0.74      0.66      0.68      2653
weighted avg       0.86      0.88      0.86      2653



## WordNet lemmatizer with stop words

In [11]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([lemma.lemmatize(w) for w in analyzer(doc)])

### Best params

In [14]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__alpha': 1e-06,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}

In [12]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(alpha=1e-06, tol=1e-02)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.71      0.82      0.76       150
   1781-1900       0.70      0.63      0.67       220
   1901-1950       0.65      0.14      0.24       229
1951-present       0.91      0.99      0.95      2054

    accuracy                           0.88      2653
   macro avg       0.74      0.65      0.65      2653
weighted avg       0.86      0.88      0.86      2653

