In [3]:
import os

from nltk.stem import PorterStemmer
import numpy as np
from sklearn import datasets, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [4]:
poems_dt = datasets.load_files(
    '/opt/data', description=None,
    load_content=True, encoding='utf-8', shuffle=True
)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    poems_dt.data, poems_dt.target, test_size=0.2
)

In [4]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
}

## No preprocessing

### Best params

In [5]:
text_clf_bp = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])
gs_clf = GridSearchCV(text_clf_bp, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [6]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.55      0.63      0.58       132
   1781-1900       0.40      0.38      0.39       248
   1901-1950       0.28      0.07      0.12       242
1951-present       0.87      0.94      0.90      2031

    accuracy                           0.79      2653
   macro avg       0.52      0.50      0.50      2653
weighted avg       0.75      0.79      0.77      2653



## Snowball  stemmer

In [7]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class SnowballStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [8]:
text_snow_clf_bp = Pipeline([
    ('vect', SnowballStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [16]:
text_clf = Pipeline([
    ('vect', SnowballStemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.57      0.64      0.60       132
   1781-1900       0.44      0.44      0.44       248
   1901-1950       0.18      0.05      0.07       242
1951-present       0.87      0.94      0.90      2031

    accuracy                           0.80      2653
   macro avg       0.51      0.52      0.50      2653
weighted avg       0.75      0.80      0.77      2653



## Porter's stemmer

In [9]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
class PorterStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [11]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': 'english'}

### Model evaluation

In [14]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.57      0.64      0.60       132
   1781-1900       0.44      0.44      0.44       248
   1901-1950       0.18      0.05      0.07       242
1951-present       0.87      0.94      0.90      2031

    accuracy                           0.80      2653
   macro avg       0.51      0.52      0.50      2653
weighted avg       0.75      0.80      0.77      2653



## WordNet lemmatizer with stop words

In [11]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([lemma.lemmatize(w) for w in analyzer(doc)])

### Best params

In [None]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

In [15]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier()),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.55      0.62      0.58       132
   1781-1900       0.41      0.42      0.41       248
   1901-1950       0.25      0.06      0.09       242
1951-present       0.87      0.94      0.90      2031

    accuracy                           0.80      2653
   macro avg       0.52      0.51      0.50      2653
weighted avg       0.76      0.80      0.77      2653

