# lab8 - Text classification

The task concentrates on content-based text classification.

In [36]:
import dataclasses as dc
import tempfile
from pathlib import Path
from pprint import pprint
from typing import List

import numpy as np
import sklearn.metrics as skm
import torch
from fasttext import supervised as fasttext
from flair.data import TaggedCorpus, Sentence
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
@dc.dataclass
class Dataset:
    """Train / validation / test split of documents and their labels.

    The ``X_*`` fields hold the documents and the ``y_*`` fields the labels
    of the corresponding part of the split.
    """

    X_train: List
    y_train: List
    X_val: List
    y_val: List
    X_test: List
    y_test: List

    def preprocess(self, fn):
        """Return a copy of the dataset with ``fn`` applied to every document.

        The documents are transformed in train, validation, test order; the
        label fields are carried over to the copy unchanged.
        """
        transformed = {
            split: [fn(doc) for doc in getattr(self, split)]
            for split in ("X_train", "X_val", "X_test")
        }
        return dc.replace(self, **transformed)

    @property
    def X(self):
        """All documents: train first, then validation, then test."""
        train_and_val = self.X_train + self.X_val
        return train_and_val + self.X_test

    @property
    def y(self):
        """All labels, in the same order as ``X``."""
        train_and_val = self.y_train + self.y_val
        return train_and_val + self.y_test

## Tasks

1. Divide the set of bills into two exclusive sets:
   1. the set of bills amending other bills (their title starts with `o zmianie ustawy`),
   1. the set of bills not amending other bills.

In [3]:
# Load every bill once. The paths are sorted so that the document order (and
# with it any seeded split made later) does not depend on the order in which
# the file system happens to list the directory.
data_files = sorted(Path("../data").glob("*.txt"))

# read_text() opens, reads and closes each file; the previous bare
# open().read() never closed the handles explicitly.
# NOTE(review): UTF-8 is assumed for the Polish texts - confirm against the
# data set before relying on it.
bills = [
    f.read_text(encoding="utf-8").replace("  ", " ").lower()
    for f in data_files
]

In [4]:
def is_changing_bill(text):
    """Tell whether a bill amends another bill.

    A bill counts as amending when the phrase "o zmianie ustawy" occurs in
    its text. Every run of whitespace (newlines, tabs, repeated spaces) is
    collapsed to a single space before the search, so the phrase is still
    found when it is wrapped across lines or followed by indentation.

    NOTE(review): the task statement says the *title* starts with the phrase,
    while the whole text is searched here (as before) - confirm that matching
    anywhere in the document is intended.

    Returns True when the phrase is present, False otherwise.
    """
    normalized = " ".join(text.split())
    return "o zmianie ustawy" in normalized

In [5]:
# Partition the bills in one pass: amending bills form the positive class,
# all remaining bills the negative class.
positives_raw, negatives_raw = [], []
for bill in bills:
    bucket = positives_raw if is_changing_bill(bill) else negatives_raw
    bucket.append(bill)

len(positives_raw), len(negatives_raw)

(692, 488)

2. Change the contents of the bill by removing the date of publication and the title (so the words `o zmianie ustawy`
   are removed).
   

In [6]:
def _strip_title(bill):
    """Return the part of ``bill`` that follows the first "art. 1" marker.

    Everything up to and including the first marker (publication date and
    title) is dropped. Only the first occurrence is cut: the previous
    split/join removed *every* "art. 1" from the body, including the prefix
    of "art. 10" ... "art. 19". Returns "" when the marker is absent.
    """
    return bill.partition("art. 1")[2]


positives = [_strip_title(b) for b in positives_raw]
negatives = [_strip_title(b) for b in negatives_raw]
X_ = positives + negatives
y_ = ([1] * len(positives)) + ([0] * len(negatives))

# Drop the documents that are empty after stripping, keeping each remaining
# document paired with its label.
non_empty = [(doc, label) for (doc, label) in zip(X_, y_) if len(doc) > 0]
X = [doc for (doc, _) in non_empty]
y = [label for (_, label) in non_empty]

3. Split the sets of documents into the following groups by randomly selecting the documents:
   1. 60% training
   1. 20% validation
   1. 20% testing
   

In [7]:
# The groups must stay the same for all experiments (task item 4), so both
# splits are seeded; without a seed every re-run of this cell reshuffles them.
SPLIT_SEED = 42

# 60% training, then the remaining 40% is halved into validation and test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.4, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=SPLIT_SEED
)

original_ds = Dataset(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
)

4. Do not change these groups during the following experiments.
5. Prepare the following variants of the documents:

   a. full text of the document
   

In [8]:
def full_text(text: str) -> str:
    """Variant (a): the document is used as-is, so ``text`` is returned unchanged."""
    return text

   b. randomly selected 10% of the lines of the document
   

In [9]:
def tenpercentlines(text):
    """Variant (b): a random sample of about 10% of the lines of ``text``.

    Empty lines are ignored. ``len(lines) // 10 + 1`` lines are drawn, so a
    document with at least one non-empty line always yields at least one
    line. The lines are drawn without replacement, so the same line position
    is never picked twice (``np.random.choice`` samples with replacement
    unless told otherwise).

    Returns the sampled lines joined with newlines, or "" when ``text`` has
    no non-empty line.
    """
    lines = [line for line in text.split("\n") if line != ""]
    if not lines:
        # np.random.choice raises on an empty population.
        return ""
    sample_size = (len(lines) // 10) + 1
    return "\n".join(np.random.choice(lines, sample_size, replace=False))

   c. randomly selected 10 lines of the document
   

In [10]:
def tenlines(text):
    """Variant (c): ten randomly selected lines of ``text``.

    Empty lines are ignored. The lines are drawn without replacement, so no
    line position is picked twice; a document with fewer than ten non-empty
    lines contributes all of its lines (in random order) instead of being
    padded with repeats.

    Returns the sampled lines joined with single spaces, or "" when ``text``
    has no non-empty line.
    """
    lines = [line for line in text.split("\n") if line != ""]
    if not lines:
        # np.random.choice raises on an empty population.
        return ""
    sample_size = min(10, len(lines))
    return " ".join(np.random.choice(lines, sample_size, replace=False))

   d. randomly selected 1 line of the document
   

In [11]:
def oneline(text):
    """Variant (d): one randomly selected line of ``text``.

    Empty lines are ignored. Returns the chosen line, or "" when ``text`` has
    no non-empty line.
    """
    lines = [line for line in text.split("\n") if line != ""]
    if not lines:
        # np.random.choice raises on an empty population.
        return ""
    return " ".join(np.random.choice(lines, 1))

6. Train the following classifiers on the documents:

   a. SVM with TF•IDF

In [12]:
def tfidf_svm(ds: Dataset):
    """Tune a TF-IDF + linear SVM pipeline on ``ds`` and predict the validation set.

    A grid search (2-fold cross-validation on the training split) is run over
    the vectorizer's ``max_df`` / ``ngram_range`` and the SVM's ``C`` /
    ``class_weight``.

    Returns the predictions of the best pipeline found for ``ds.X_val``.
    """
    pipeline = Pipeline([
        # The vectorizer is passed unfitted. It used to be fitted on
        # train + validation + test documents first, but GridSearchCV clones
        # the pipeline before every fit, so that fit was thrown away - it only
        # cost time and looked like leakage from the held-out splits.
        ('tfidf', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
    ])
    parameters = {
        'tfidf__max_df': (0.25, 0.5, 0.75),
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        "clf__estimator__C": [0.01, 0.1, 0.5, 1],
        "clf__estimator__class_weight": ['balanced', None],
    }
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=12, verbose=10)
    grid_search_tune.fit(ds.X_train, ds.y_train)

    return grid_search_tune.best_estimator_.predict(ds.X_val)

b. Fasttext

In [13]:
def fasttext_ds(ds: "Dataset"):
    """Write the splits of ``ds`` to disk in the ``__label__<y> <text>`` format.

    One file per split (``train``, ``val``, ``test``) is created inside a
    fresh temporary directory; each example occupies exactly one line.

    Returns the ``Path`` of the directory that holds the three files.
    """
    # mkdtemp() creates a new, uniquely named directory. The previous name was
    # a random float under a hard-coded /tmp, which could collide and also
    # consumed a draw from the global NumPy random state.
    out = Path(tempfile.mkdtemp(prefix="fasttext_ds_"))
    for d in ["train", "val", "test"]:
        X_d, y_d = getattr(ds, f"X_{d}"), getattr(ds, f"y_{d}")
        rows = []
        for (x, y) in zip(X_d, y_d):
            # The format is line-based, so a newline inside a document would
            # start a new, unlabelled example. Collapse all whitespace runs to
            # single spaces to keep every document on one line.
            single_line = " ".join(x.split())
            rows.append(f"__label__{y} {single_line}")
        (out / d).write_text("\n".join(rows), encoding="utf-8")
    return out

In [14]:
def fasttext_supervised(ds: Dataset):
    """Train a fastText classifier on ``ds`` and predict its validation set.

    The training split of ``ds`` is exported with ``fasttext_ds`` and a
    supervised model is trained on it for 10 epochs.

    Returns one int label per document of ``ds.X_val``.
    """
    # Train on the variant that was passed in. Training used to run on a
    # freshly sampled one-line variant of the global original_ds regardless of
    # the argument, so every reported variant shared the same kind of model.
    ds_path = fasttext_ds(ds)
    ft = fasttext(
        str(ds_path / "train"), str(ds_path / "model"),
        epoch=10,
        # word_ngrams=2,  # kills the kernel
    )
    # Keep each document on a single line, as in the line-based training file.
    val_docs = [" ".join(x.split()) for x in ds.X_val]
    # predict() yields a list of labels per document; the first one is used.
    return [int(y[0]) for y in ft.predict(val_docs)]

c. Flair with Polish language model
   

In [61]:
def flair_model(ds: Dataset):
    """Train a Flair text classifier on ``ds`` and predict its validation set.

    The splits are exported with ``fasttext_ds`` and read back as a Flair
    classification corpus. Documents are embedded with Polish word embeddings
    plus forward and backward Polish Flair embeddings, combined by a recurrent
    document embedding, and a single-label classifier is trained for one epoch.

    Returns one int label per document of ``ds.X_val``.
    """
    # 1-2. export the splits and load them as a corpus
    data_folder = fasttext_ds(ds)
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        str(data_folder),
        test_file='test',
        dev_file='val',
        train_file='train'
    )

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]

    # 4. initialize document embedding by passing list of word embeddings
    # Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
    # DocumentRNNEmbeddings is the class the notebook imports; the previously
    # referenced DocumentLSTMEmbeddings is not imported and raised a NameError
    # on a fresh kernel.
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # 5. create the text classifier
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False,
    )

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(
        classifier,
        corpus,
        # optimizer=torch.optim.Adam()
    )

    # 7. start the training
    # NOTE(review): train() is given the output *directory*; the old code
    # passed ".../model/best-model.pt", which Flair would treat as a folder
    # name - confirm against the installed Flair version.
    model_dir = str(data_folder / "model")
    trainer.train(
        model_dir,
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=5,
        max_epochs=1,
        monitor_train=False
    )

    # 8. predict the validation documents. The predicted label objects are
    # converted to ints so that they are comparable with the int ground-truth
    # labels in ds.y_val (the label objects themselves were returned before).
    predicted = trainer.model.predict([Sentence(x) for x in ds.X_val])
    return [int(sentence.labels[0].value) for sentence in predicted]

7. Report Precision, Recall and F1 for each variant of the experiment (12 variants altogether).

In [63]:
def metrics(y_gt, y_pred):
    """Score predictions against the ground truth.

    Returns a dict that maps the name of each scikit-learn scoring function
    (accuracy, precision, recall, F1) to its value for ``y_gt`` / ``y_pred``.
    """
    scorers = (
        skm.accuracy_score,
        skm.precision_score,
        skm.recall_score,
        skm.f1_score,
    )
    report = {}
    for scorer in scorers:
        report[scorer.__name__] = scorer(y_gt, y_pred)
    return report

In [None]:
# Document variants and models that make up the experiment grid; the
# commented-out entries are currently disabled.
variant_fns = [
    oneline,
    # tenlines,
    # tenpercentlines,
    # full_text,
]
model_fns = [
    # tfidf_svm,
    # fasttext_supervised,
    flair_model,
]

# Build every document variant first, then evaluate each model on each one.
variants = {fn.__name__: original_ds.preprocess(fn) for fn in variant_fns}

results = {}
for ds_name, ds in variants.items():
    results[ds_name] = {
        model_fn.__name__: metrics(ds.y_val, model_fn(ds))
        for model_fn in model_fns
    }
pprint(results)

2019-05-12 22:21:58,214 Reading data from /tmp/0.8919283111595587
2019-05-12 22:21:58,215 Train: /tmp/0.8919283111595587/train
2019-05-12 22:21:58,217 Dev: /tmp/0.8919283111595587/val
2019-05-12 22:21:58,218 Test: /tmp/0.8919283111595587/test
2019-05-12 22:21:58,446 this function is deprecated, use smart_open.open instead
2019-05-12 22:22:15,332 ----------------------------------------------------------------------------------------------------
2019-05-12 22:22:15,333 Evaluation method: MICRO_F1_SCORE
2019-05-12 22:22:15,336 ----------------------------------------------------------------------------------------------------




2019-05-12 22:22:17,974 epoch 1 - iter 0/21 - loss 0.02180532
2019-05-12 22:22:23,465 epoch 1 - iter 2/21 - loss 0.02127855
2019-05-12 22:22:28,253 epoch 1 - iter 4/21 - loss 0.02142913
2019-05-12 22:22:33,335 epoch 1 - iter 6/21 - loss 0.02142570
2019-05-12 22:22:38,547 epoch 1 - iter 8/21 - loss 0.02142534
2019-05-12 22:22:42,854 epoch 1 - iter 10/21 - loss 0.02119910
2019-05-12 22:22:48,316 epoch 1 - iter 12/21 - loss 0.02100137
2019-05-12 22:22:53,847 epoch 1 - iter 14/21 - loss 0.02089274
2019-05-12 22:22:58,177 epoch 1 - iter 16/21 - loss 0.02095952
2019-05-12 22:23:02,833 epoch 1 - iter 18/21 - loss 0.02100788
2019-05-12 22:23:07,041 epoch 1 - iter 20/21 - loss 0.02179562
2019-05-12 22:23:07,063 ----------------------------------------------------------------------------------------------------
2019-05-12 22:23:07,065 EPOCH 1 done: loss 0.0218 - lr 0.1000 - bad epochs 0
2019-05-12 22:23:22,328 DEV  : loss 0.02183958 - f-score 0.6179 - acc 0.4471
2019-05-12 22:23:37,858 TEST : lo

## Hints


1. Application of SVM classifier with TF•IDF is described in 
   [David Batista](http://www.davidsbatista.net/blog/2017/04/01/document_classification/) blog post.
1. [Fasttext](https://fasttext.cc/) is a popular baseline classifier. Don't report the Precision/Recall/F1 provided by
   Fasttext since they might be [wrong](https://github.com/facebookresearch/fastText/issues/261).
1. [Flair](https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f) 
   is another library for text processing. Flair classification is based on a language model.