# lab8 - Text classification

The task concentrates on content-based text the classification.

In [22]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np
import dataclasses as dc
from typing import List
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import sklearn.metrics as skm
from fasttext import supervised as fasttext
from flair.data import TaggedCorpus, Sentence
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pprint import pprint

NameError: name '_C' is not defined

In [3]:
@dc.dataclass
class Dataset:
    X_train: List
    y_train: List
    X_val: List
    y_val: List    
    X_test: List
    y_test: List
        
    def preprocess(self, fn):
        return dc.replace(
            self,
            X_train = [fn(x) for x in self.X_train],
            X_val = [fn(x) for x in self.X_val],
            X_test = [fn(x) for x in self.X_test],
         )
    
    @property
    def X(self):
        return self.X_train + self.X_val + self.X_test
    
    @property
    def y(self):
        return self.y_train + self.y_val + self.y_test

## Tasks

1. Divide the set of bills into two exclusive sets:
   1. the set of bills amending other bills (their title starts with `o zmianie ustawy`),
   1. the set of bills not amending other bills.

In [4]:
data_files = Path("../data").glob("*.txt")
bills = [f.open().read().replace("  ", " ").lower() for f in data_files]

In [5]:
def is_changing_bill(text):
    return "o zmianie ustawy" in text.replace("\n", " ")

In [6]:
positives_raw = [b for b in bills if is_changing_bill(b)]
negatives_raw = [b for b in bills if not is_changing_bill(b)]

len(positives_raw), len(negatives_raw)

(692, 488)

2. Change the contents of the bill by removing the date of publication and the title (so the words `o zmianie ustawy`
   are removed).
   

In [7]:
positives, negatives = [["".join(b.split("art. 1")[1:]) for b in ds] for ds in [positives_raw, negatives_raw]]
X_ = positives + negatives
y_ = ([1] * len(positives)) + ([0] * len(negatives))

X = [x for x in X_ if len(x) > 0]
y = [y for (x, y) in zip(X_, y_) if len(x) > 0 ]

3. Split the sets of documents into the following groups by randomly selecting the documents:
   1. 60% training
   1. 20% validation
   1. 20% testing
   

In [8]:
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.4)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5)

original_ds = Dataset(
    X_train,
    y_train,
    X_val,
    y_val,
    X_test,
    y_test
)

4. Do not change these groups during the following experiments.
5. Prepare the following variants of the documents:

   a. full text of the document
   

In [9]:
def full_text(text):
    return text

   b. randomly selected 10% of the lines of the document
   

In [10]:
def tenpercentlines(text):
    lines = [l for l in text.split("\n") if l != ""]
    return "\n".join(np.random.choice(lines, (len(lines) // 10) + 1))

   c. randomly selected 10 lines of the document
   

In [11]:
def tenlines(text):
    lines = [l for l in text.split("\n") if l != ""]
    return " ".join(np.random.choice(lines, 10))

   d. randomly selected 1 line of the document
   

In [12]:
def oneline(text):
    lines = [l for l in text.split("\n") if l != ""]
    return " ".join(np.random.choice(lines, 1))

6. Train the following classifiers on the documents:

   a. SVM with TF•IDF

In [13]:
def tfidf_svm(ds: Dataset):
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer().fit(ds.X)),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
    ])
    parameters = {
        'tfidf__max_df': (0.25, 0.5, 0.75),
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        "clf__estimator__C": [0.01, 0.1, 0.5, 1],
        "clf__estimator__class_weight": ['balanced', None],
    }
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=12, verbose=10)
    grid_search_tune.fit(ds.X_train, ds.y_train)
    
    return grid_search_tune.best_estimator_.predict(ds.X_val)

b. Fasttext

In [14]:
def fasttext_ds(ds: Dataset):
    out = Path("/tmp") / str(np.random.randn())
    out.mkdir()
    for d in ["train", "val", "test"]:
        X_d, y_d = getattr(ds, f"X_{d}"), getattr(ds, f"y_{d}")
        txt = "\n".join([
          f"__label__{y} {x}"  
            for (x,y) in zip(X_d, y_d)
        ])
        with (out / d).open("w") as f:
            f.write(txt)
    return out

In [15]:
def fasttext_supervised(ds: Dataset):
    ds_path = fasttext_ds(original_ds.preprocess(oneline))
    ft = fasttext(
        str(ds_path / "train"), str(ds_path / "model"),
        epoch=10, 
#         word_ngrams=2, #kills the kernel
    )
    return [int(y[0]) for y in ft.predict(ds.X_val)]

c. Flair with Polish language model
   

In [16]:
def flair_model(ds: Dataset):

    data_folder = fasttext_ds(ds)
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        str(data_folder),
        test_file='test',
        dev_file='val',
        train_file='train'
    )


    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('pl'), 
        FlairEmbeddings('polish-forward'),
       FlairEmbeddings('polish-backward')
    ]
        

    # 4. initialize document embedding by passing list of word embeddings
    # Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
        )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(
        classifier, 
        corpus, 
#         optimizer=torch.optim.Adam()
    )

    # 7. start the training
    model_path =  str(data_folder / "model"/ "best-model.pt")
    trainer.train( 
        model_path,
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=5,
        max_epochs=1,
        monitor_train=False
    )    
    return [
        y.labels[0]
        for y in 
        trainer.model.predict([Sentence(x) for x in ds.X_val])
    ]

7. Report Precision, Recall and F1 for each variant of the experiment (12 variants altogether).

In [17]:
def metrics(y_gt, y_pred):
    return {
        fn.__name__: fn(y_gt, y_pred)
        for fn in [
            skm.accuracy_score,
            skm.precision_score,
            skm.recall_score,
            skm.f1_score
        ]
    }

In [18]:
results = {
    ds_name: {
        model_fn.__name__: metrics(ds.y_val, model_fn(ds))
        for model_fn in [
            tfidf_svm,
            fasttext_supervised,
#             flair_model
        ]
    }
    for (ds_name, ds) in {
        fn.__name__: original_ds.preprocess(fn) 
        for fn in
        [
            oneline,
            tenlines, 
            tenpercentlines, 
            full_text, 
        ]
    }.items()
}

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:    2.8s
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:    3.2s
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:    3.4s
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:    3.6s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    3.9s
[Parallel(n_jobs=12)]: Done 121 tasks      | elapsed:    4.2s
[Parallel(n_jobs=12)]: Done 136 out of 144 | elapsed:    4.4s remaining:    0.3s
[Parallel(n_jobs=12)]: Done 144 out of 144 | elapsed:    4.5s finished


Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.8s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    1.6s
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:    3.1s
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:    3.9s
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:    4.9s
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:    6.1s
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:    7.3s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    8.3s
[Parallel(n_jobs=12)]: Done 121 tasks      | elapsed:    9.8s
[Parallel(n_jobs=12)]: Done 136 out of 144 | elapsed:   10.9s remaining:    0.6s
[Parallel(n_jobs=12)]: Done 144 out of 144 | elapsed:   11.4s finished


Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    7.3s
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   10.5s
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:   15.1s
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:   19.0s
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:   23.5s
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:   28.4s
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:   33.9s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:   38.8s
[Parallel(n_jobs=12)]: Done 121 tasks      | elapsed:   45.8s
[Parallel(n_jobs=12)]: Done 136 out of 144 | elapsed:   51.1s remaining:    3.0s
[Parallel(n_jobs=12)]: Done 144 out of 144 | elapsed:   53.6s finished


Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   14.4s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   38.9s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  2.4min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  3.8min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  4.5min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  5.4min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  6.2min
[Parallel(n_jobs=12)]: Done 121 tasks      | elapsed:  7.1min
[Parallel(n_jobs=12)]: Done 136 out of 144 | elapsed:  7.9min remaining:   27.8s
[Parallel(n_jobs=12)]: Done 144 out of 144 | elapsed:  8.4min finished


NameError: name 'pprint' is not defined

In [20]:
results

{'oneline': {'tfidf_svm': {'accuracy_score': 0.591304347826087,
   'precision_score': 0.5714285714285714,
   'recall_score': 0.943089430894309,
   'f1_score': 0.7116564417177914},
  'fasttext_supervised': {'accuracy_score': 0.5826086956521739,
   'precision_score': 0.5678391959798995,
   'recall_score': 0.9186991869918699,
   'f1_score': 0.7018633540372671}},
 'tenlines': {'tfidf_svm': {'accuracy_score': 0.8,
   'precision_score': 0.7851851851851852,
   'recall_score': 0.8617886178861789,
   'f1_score': 0.8217054263565892},
  'fasttext_supervised': {'accuracy_score': 0.5478260869565217,
   'precision_score': 0.5418502202643172,
   'recall_score': 1.0,
   'f1_score': 0.7028571428571428}},
 'tenpercentlines': {'tfidf_svm': {'accuracy_score': 0.782608695652174,
   'precision_score': 0.7588652482269503,
   'recall_score': 0.8699186991869918,
   'f1_score': 0.8106060606060606},
  'fasttext_supervised': {'accuracy_score': 0.6,
   'precision_score': 0.5720930232558139,
   'recall_score': 1.0,

## Hints


1. Application of SVM classifier with TF•IDF is described in 
   [David Batista](http://www.davidsbatista.net/blog/2017/04/01/document_classification/) blog post.
1. [Fasttext](https://fasttext.cc/) is a popular basline classifier. Don't report the Precision/Recall/F1 provided by
   Fasttext since they might be [wrong](https://github.com/facebookresearch/fastText/issues/261).
1. [Flair](https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f) 
   is another library for text processing. Flair classification is based on a language model.