In [2]:
import os

from nltk.stem import PorterStemmer
import numpy as np
from sklearn import datasets, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
poems_dt = datasets.load_files(
    '/opt/data', description=None,
    load_content=True, encoding='utf-8', shuffle=True
)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(
    poems_dt.data, poems_dt.target, test_size=0.2
)

In [5]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
    'clf__tol': (1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
    'clf__dual': (True, False), 
}

## No preprocessing

### Best params

In [6]:
text_clf_bp = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
gs_clf = GridSearchCV(text_clf_bp, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__dual': True,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None}

### Model evaluation

In [5]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(tol=1e-2)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.88      0.77      0.82       140
   1781-1900       0.74      0.65      0.69       258
   1901-1950       0.56      0.20      0.29       242
1951-present       0.90      0.99      0.94      2013

    accuracy                           0.87      2653
   macro avg       0.77      0.65      0.69      2653
weighted avg       0.85      0.87      0.85      2653



## Snowball  stemmer

In [6]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class SnowballStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [10]:
text_snow_clf_bp = Pipeline([
    ('vect', SnowballStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__dual': True,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None}

### Model evaluation

In [7]:
text_clf = Pipeline([
    ('vect', SnowballStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(tol=1e-2)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.86      0.79      0.82       140
   1781-1900       0.75      0.66      0.70       258
   1901-1950       0.56      0.24      0.33       242
1951-present       0.90      0.99      0.94      2013

    accuracy                           0.88      2653
   macro avg       0.77      0.67      0.70      2653
weighted avg       0.86      0.88      0.86      2653



## Porter's stemmer

In [8]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
class PorterStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

### Best params

In [13]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__dual': False,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None}

### Model evaluation

In [9]:
text_clf = Pipeline([
    ('vect', PorterStemmedCountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(tol=1e-2, dual=False)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.87      0.78      0.82       140
   1781-1900       0.75      0.64      0.69       258
   1901-1950       0.53      0.23      0.32       242
1951-present       0.90      0.99      0.94      2013

    accuracy                           0.87      2653
   macro avg       0.76      0.66      0.69      2653
weighted avg       0.85      0.87      0.85      2653



## WordNet lemmatizer with stop words

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([lemma.lemmatize(w) for w in analyzer(doc)])

### Best params

In [16]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(poems_dt.data, poems_dt.target)
gs_clf.best_params_

{'clf__dual': False,
 'clf__tol': 0.01,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None}

In [11]:
text_clf = Pipeline([
    ('vect', LemmaCountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(tol=1e-2, dual=False)),
])
text_clf.fit(x_train, y_train)
predicted = text_clf.predict(x_test)
print(metrics.classification_report(y_test, predicted, target_names=poems_dt.target_names))

              precision    recall  f1-score   support

   1550-1780       0.86      0.75      0.80       140
   1781-1900       0.73      0.66      0.69       258
   1901-1950       0.57      0.23      0.33       242
1951-present       0.90      0.99      0.94      2013

    accuracy                           0.87      2653
   macro avg       0.77      0.66      0.69      2653
weighted avg       0.85      0.87      0.86      2653

