In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Experiments with the label classification method presented in "CLASSIFICATION MODELS FOR RST DISCOURSE PARSING OF TEXTS IN RUSSIAN"
http://www.dialog-21.ru/media/4595/chistovaevplusetal-076.pdf

### 1. Get code for feature extraction

In [None]:
%%bash

# Stop at the first failing command: a failed download should surface here
# instead of as a confusing "no such file" error from the sed patches below.
set -e

# Start from a clean directory; -f keeps the very first run (when the
# directory does not exist yet) from printing an error.
rm -rf rurst2019
mkdir rurst2019
cd rurst2019
wget -q http://nlp.isa.ru/paper_dialog2019/utils/meaningfulwords_v3.py
wget -q http://nlp.isa.ru/paper_dialog2019/utils/language_features.py
wget -q http://nlp.isa.ru/paper_dialog2019/utils/features_processor.py

# The external resources are structurally the same here but live under other
# paths, so rewrite the hard-coded locations inside the downloaded module.
sed -i "s|utils/tf_idf_pipeline.save|models/tf_idf/pipeline.pkl|g" features_processor.py  # tf-idf pipeline
sed -i "s|models_w2v/model2_tokenized|models/w2v/segmentator/model2_tokenized|g" features_processor.py  # w2v model

# Fixes to the feature extractor itself:
#  - remove the 'common_root*' entries and the 'tokens_x', 'tokens_y' entries
#    (each match is replaced by a newline);
#  - add 1e-8 to the len(row) denominator so an empty row cannot divide by zero.
sed -i "s|'common_root_fpos',|\n|g" features_processor.py
sed -i "s|'common_root_att',|\n|g" features_processor.py
sed -i "s|'common_root'|\n|g" features_processor.py
sed -i "s|/ len(row))|/ (len(row) + 1e-8))|g" features_processor.py
sed -i "s|'tokens_x', 'tokens_y',|\n|g" features_processor.py

# NOTE: the patch below is NOT applied by this cell (the command is commented
# out and names no target file). Change the function manually (!):
#sed -i "s|return [self.annotations['tokens'][i].text for i in range(begin, end)]|result = [self.annotations['tokens'][i].text for i in range(begin, end)]\n        if result:\n            return result\n        return ['_']|g"

### 2. Extract features 
Features are extracted the same way as in ``1_data_extraction.ipynb``, but through a different interface.

In [None]:
from rurst2019.features_processor import FeaturesProcessor

# Instantiate the downloaded (and patched) feature extractor once; the object
# is called on (table, annotation) pairs in the extraction loop.
features_processor = FeaturesProcessor(verbose=False)

In [None]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm
from utils.file_reading import read_gold, read_annotation


IN_PATH = 'data/'
json_paths = glob.glob("%s*.json" % IN_PATH)

for file in tqdm(json_paths):
    # Gold relations and annotations are addressed by the path without '.json'.
    doc_prefix = file.replace('.json', '')

    # Keep only gold pairs whose two text snippets are both non-empty.
    table = (
        read_gold(doc_prefix)
        .loc[lambda frame: frame.snippet_x.map(len) > 0]
        .loc[lambda frame: frame.snippet_y.map(len) > 0]
    )
    annot = read_annotation(doc_prefix)

    # Cache the extracted features next to the source document.
    features = features_processor(table, annot)
    features.to_pickle(file.replace('.json', '.gold.pkl.oldf'))

### 3. Classification model 

In [None]:
import os

from catboost import CatBoostClassifier
from catboost import Pool
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Seed reused for shuffling the samples and for the estimators, so runs are repeatable.
random_state = 45
# Name of the label column the classifiers are trained to predict.
TARGET = 'relation'

In [None]:
from utils.train_test_split import split_train_dev_test

# Partition the corpus under ./data into train/dev/test collections.
# Each entry is presumably a '.edus' file path (the loading cell maps '.edus'
# to the cached feature pickle) - verify against utils.train_test_split.
train, dev, test = split_train_dev_test('./data')

In [None]:
import pandas as pd
from utils.file_reading import read_gold


def _load_split(files):
    """Read the cached feature pickle of every file in `files` and return
    them as one shuffled frame with a fresh 0..n-1 index."""
    frames = [pd.read_pickle(file.replace('.edus', '.gold.pkl.oldf')) for file in files]
    shuffled = pd.concat(frames).sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


train_samples = _load_split(train)
dev_samples = _load_split(dev)
test_samples = _load_split(test)

In [None]:
def prepare_data(data, max_len=100):
    """Filter the samples and build the final ``relation`` label column.

    Steps:
      1. drop pairs where either unit has ``max_len`` tokens or more;
      2. rebuild ``snippet_x`` / ``snippet_y`` by joining the tokens with a
         space and drop pairs where either snippet is empty;
      3. normalise ``category_id`` (``0.0`` becomes ``'same-unit'``, the part
         after the first ``'_'`` is cut off, similar categories are merged via
         ``target_map``) and ``order`` (``0.0`` becomes ``'NN'``);
      4. set ``relation`` to ``"<category_id>_<order>"`` and merge some
         label/order combinations via ``relation_map``.

    Args:
        data: DataFrame with at least the columns ``tokens_x``, ``tokens_y``
            (sequences of strings), ``category_id`` and ``order``.
        max_len: exclusive upper bound on the token count of each unit.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    target_map = {
        'relation': 'joint',
        'antithesis': 'contrast',
        'cause': 'cause-effect',
        'effect': 'cause-effect',
        'conclusion': 'restatement',
        'interpretation': 'interpretation-evaluation',
        'evaluation': 'interpretation-evaluation',
        'motivation': 'condition',
    }

    relation_map = {
        'restatement_SN': 'restatement_NN',
        'restatement_NS': 'restatement_NN',
        'contrast_SN': 'contrast_NN',
        'contrast_NS': 'contrast_NN',
        'solutionhood_NS': 'elaboration_NS',
        'preparation_NS': 'elaboration_NS',
        'concession_SN': 'preparation_SN',
        'evaluation_SN': 'preparation_SN',
        'elaboration_SN': 'preparation_SN',
        'evidence_SN': 'preparation_SN',
        'background_SN': 'preparation_SN'
    }

    data = data[data.tokens_x.map(len) < max_len]
    # Explicit copy: the column assignments below must write to an independent
    # frame, not to a slice of the caller's data (SettingWithCopyWarning).
    data = data[data.tokens_y.map(len) < max_len].copy()

    data['snippet_x'] = data.tokens_x.map(' '.join)
    data['snippet_y'] = data.tokens_y.map(' '.join)

    data = data[data.snippet_x.map(len) > 0]
    data = data[data.snippet_y.map(len) > 0].copy()

    # Replace the 0.0 placeholder BEFORE splitting: calling .split() on a
    # float would raise AttributeError, making the replacement unreachable.
    data['category_id'] = data['category_id'].replace([0.0], 'same-unit')
    data['category_id'] = data['category_id'].map(lambda label: label.split('_')[0])
    data['order'] = data['order'].replace([0.0], 'NN')
    data['category_id'] = data['category_id'].replace(target_map, regex=False)

    data['relation'] = data['category_id'] + '_' + data['order']
    data['relation'] = data['relation'].replace(relation_map, regex=False)

    return data

In [None]:
# Normalise the labels of all three splits with the same routine.
train_samples, dev_samples, test_samples = (
    prepare_data(split) for split in (train_samples, dev_samples, test_samples)
)

# Absolute number of training samples per label; value_counts() returns them
# ordered from the most to the least frequent label.
label_frequencies = train_samples[TARGET].value_counts(normalize=False)
counts = label_frequencies.values
counts

In [None]:
# Non-feature columns (raw text, tokens and label parts) excluded from the model input.
drop_columns = ['snippet_x', 'snippet_y', 'category_id',
                'filename', 'order',
                'tokens_x', 'tokens_y']


def _split_xy(samples):
    """Return (single-column target frame, feature frame) for one split."""
    labels = samples[TARGET].to_frame()
    features = samples.drop(TARGET, axis=1).drop(columns=drop_columns + ['category_id'])
    return labels, features


y_train, X_train = _split_xy(train_samples)
y_dev, X_dev = _split_xy(dev_samples)
y_test, X_test = _split_xy(test_samples)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaling statistics on the training features only.
scaler = StandardScaler().fit(X_train)


def _standardize(frame):
    """Apply the fitted scaler; the result keeps the row index of `frame`
    but gets default positional column labels."""
    return pd.DataFrame(scaler.transform(frame), index=frame.index)


X_train = _standardize(X_train)
X_dev = _standardize(X_dev)
X_test = _standardize(X_test)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

lab_encoder = LabelEncoder()
# Encode straight from the 1-D label column of the training frame instead of
# the single-column DataFrame held in `y_train`: LabelEncoder expects a 1-D
# array (a column frame triggers a column-vector warning), and reading from
# `train_samples` keeps this cell re-runnable, because its input is no longer
# overwritten by its own output. Row order is the same as in `y_train`.
y_train = lab_encoder.fit_transform(train_samples[TARGET])

In [None]:
# NOTE(review): `eval_dataset` is created but not passed to any fit call in
# this notebook; it also carries the raw string labels from `y_dev`, while the
# ensemble is trained on integer-encoded labels. Kept for compatibility.
eval_dataset = Pool(data=X_dev,
                    label=y_dev)

catboost = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    custom_loss=['F1'],
    random_seed=random_state,
    verbose=0,
    loss_function='MultiClass',
    # NOTE(review): `counts` comes from value_counts(), i.e. it is ordered by
    # descending frequency, so these weights grow with class frequency and are
    # listed by frequency rank. Check the CatBoost docs for how a weight list
    # is matched to classes; it may not line up with the LabelEncoder order
    # of `y_train` - TODO confirm the intended weighting.
    class_weights=counts / counts[-1]
)

# Feature selection with an L1-penalised logistic regression in front of the
# boosted trees. The selector is seeded as well: the 'saga' solver shuffles
# the data, so without a seed the selected feature set can differ between runs.
fs_catboost = Pipeline([
    ('feature_selection', SelectFromModel(LogisticRegression(solver='saga',
                                                             penalty='l1',
                                                             C=1.,
                                                             n_jobs=-1,
                                                             random_state=random_state))),
    ('classification', catboost)
])

# Strongly regularised multinomial logistic regression (second ensemble member).
# The earlier duplicate definition (identical except n_jobs=8) was dead code,
# because it was overwritten before use, and has been removed.
logreg = LogisticRegression(random_state=random_state,
                            solver='lbfgs',
                            n_jobs=-1,
                            C=0.002,
                            multi_class='multinomial',
                            class_weight='balanced')

# Soft voting averages the predicted class probabilities of both members.
fs_catboost_plus_logreg = VotingClassifier([('fs_catboost', fs_catboost), ('logreg', logreg)], voting='soft')

In [None]:
# Train the soft-voting ensemble on the training features and labels.
fs_catboost_plus_logreg.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Predict encoded class ids on the dev split and map them back to label names.
dev_label_ids = fs_catboost_plus_logreg.predict(X_dev)
predicted = lab_encoder.inverse_transform(dev_label_ids)

dev_truth = y_dev.values
headline_scores = (
    ('weighted f1: ', metrics.f1_score(dev_truth, predicted, average='weighted')),
    ('macro f1: ', metrics.f1_score(dev_truth, predicted, average='macro')),
    ('accuracy: ', metrics.accuracy_score(dev_truth, predicted)),
)
for caption, score in headline_scores:
    print(caption, score)
print()
print(metrics.classification_report(y_dev, predicted, digits=4))

# Macro-averaged precision / recall, reported as percentages.
macro_precision = metrics.precision_score(y_dev, predicted, average='macro')
macro_recall = metrics.recall_score(y_dev, predicted, average='macro')
print('macro precision: %.2f'%(macro_precision*100.))
print('macro recall: %.2f'%(macro_recall*100.))

In [None]:
from sklearn import metrics

# Predict encoded class ids on the test split and map them back to label names.
test_label_ids = fs_catboost_plus_logreg.predict(X_test)
predicted = lab_encoder.inverse_transform(test_label_ids)

test_truth = y_test.values
headline_scores = (
    ('weighted f1: ', metrics.f1_score(test_truth, predicted, average='weighted')),
    ('macro f1: ', metrics.f1_score(test_truth, predicted, average='macro')),
    ('accuracy: ', metrics.accuracy_score(test_truth, predicted)),
)
for caption, score in headline_scores:
    print(caption, score)
print()
print(metrics.classification_report(y_test, predicted, digits=4))

# Macro-averaged precision / recall, reported as percentages.
macro_precision = metrics.precision_score(y_test, predicted, average='macro')
macro_recall = metrics.recall_score(y_test, predicted, average='macro')
print('macro precision: %.2f'%(macro_precision*100.))
print('macro recall: %.2f'%(macro_recall*100.))

In [None]:
import numpy as np

test_metrics = metrics.classification_report(y_test, predicted, digits=4, output_dict=True)

# Collect the F1 score (in percent) of every dict-valued report entry; scalar
# entries of the report are skipped.
f1_values = [entry.get('f1-score') for entry in test_metrics.values() if isinstance(entry, dict)]
test_f1 = np.array(f1_values) * 100

test_f1

In [None]:
import pickle

# Persist the trained ensemble. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('models/dialog_model.pkl', 'wb') as model_file:
    pickle.dump(fs_catboost_plus_logreg, model_file)