In [1]:
import os
import sys
import xml.etree.ElementTree as ET
from functools import partial

import pandas as pd

# Make the parent directory importable so the project-local `tagger` module resolves.
sys.path.append('..')

from tagger import ClarinTagger

### Load data

In [86]:
# Root directory of the dataset, relative to the notebook's working directory.
data_path = '../wiki_data'

# Directories passed to load_data() for the train and test splits.
processed_train_path = os.path.join(data_path, 'wiki_train_34_categories_data_processed')
processed_test_path = os.path.join(data_path, 'wiki_test_34_categories_data_processed')

In [87]:
def load_data(path):
    """Load every pickled DataFrame found directly in ``path`` into one frame.

    Hidden entries (names starting with ``.``) and anything that is not a
    regular file are skipped. Files are read in the order ``os.scandir``
    yields them, which is not guaranteed to be sorted.

    Args:
        path: Directory containing the pickle files.

    Returns:
        A single ``pd.DataFrame`` holding the rows of all files with a fresh
        0..n-1 index; an empty DataFrame when no file qualifies.
    """
    frames = []
    with os.scandir(path) as it:
        for entry in it:
            if not entry.name.startswith('.') and entry.is_file():
                # NOTE: unpickling can execute arbitrary code -- only point
                # this at directories whose contents are trusted.
                frames.append(pd.read_pickle(entry.path))

    if not frames:
        # pd.concat raises ValueError when given an empty list.
        return pd.DataFrame()

    # A single concat replaces DataFrame.append per file: append re-copied the
    # accumulated frame on every iteration and was removed in pandas 2.0.
    return pd.concat(frames, ignore_index=True)

In [88]:
# Test split: every pickle under processed_test_path combined into one DataFrame.
df_test = load_data(processed_test_path)

In [89]:
# Row count of the test split.
len(df_test)

2957

In [90]:
# Peek at the first rows to check the loaded columns.
df_test.head()

Unnamed: 0,text,cat,filename,tokens
0,Giora Feidman (ur. 26 marca 1936 w Buenos Aire...,Zydzi,Zydzi_1625774.txt,"[(Giora, ign), (Feidman, ign), ((, interp), (u..."
1,Cztery rodzaje samadhi – praktyka medytacyjna ...,Kultura-Chin,Kultura-Chin_3106092.txt,"[(cztery, num:pl:nom:m1:rec), (rodzaj, subst:p..."
2,Dimetylotryptamina (DMT) – organiczny związek ...,Narkomania,Narkomania_67602.txt,"[(Dimetylotryptamina, ign), ((, interp), (DMT,..."
3,John C. Wright (ur. 1961) – amerykański autor ...,Amerykanscy-prozaicy,Amerykanscy-prozaicy_1887767.txt,"[(John, subst:sg:nom:m1), (C, subst:sg:nom:m3)..."
4,Force India VJM06''' – bolid Formuły 1 zespołu...,Samochody,Samochody_2867788.txt,"[(Force, ign), (India, ign), (VJM06, ign), (',..."


In [92]:
# Training split: every pickle under processed_train_path combined into one DataFrame.
df_train = load_data(processed_train_path)

In [93]:
# Row count of the training split.
len(df_train)

6509

In [94]:
# Peek at the first rows to check the loaded columns.
df_train.head()

Unnamed: 0,text,cat,filename,tokens
0,"Pierwiastki biofilne, pierwiastki biogenne, nu...",Pierwiastki-chemiczne,Pierwiastki-chemiczne_1857348.txt,"[(pierwiastek, subst:pl:nom:m3), (biofilny, ad..."
1,"Robocizna, to praca włożona w wykonanie określ...",Rachunkowosc,Rachunkowosc_1691781.txt,"[(robocizna, subst:sg:nom:f), (,, interp), (to..."
2,Jonathan Cheecho (ur. 15 lipca 1980 w Moose Fa...,Pilka-nozna,Pilka-nozna_3022011.txt,"[(Jonathan, ign), (Cheecho, ign), ((, interp),..."
3,Florin Daniel Bratu (ur. 2 stycznia 1980 w Buk...,Pilka-nozna,Pilka-nozna_2293521.txt,"[(Florin, ign), (Daniel, subst:sg:nom:m1), (br..."
4,"Iterb (Yb, łac. ytterbium) – pierwiastek chemi...",Pierwiastki-chemiczne,Pierwiastki-chemiczne_30496.txt,"[(iterb, subst:sg:nom:m3), ((, interp), (Yb, i..."


### Filtering

In [163]:
def filter_tags(tokens, tags=None):
    """Keep only the tokens whose leading tag component is listed in ``tags``.

    The leading component is the part of ``token.tag`` before the first
    ``:``. A falsy ``tags`` (``None`` or empty) disables filtering and a
    new list with all of ``tokens`` is returned.
    """
    if not tags:
        return list(tokens)

    kept = []
    for token in tokens:
        head = token.tag.split(':')[0]
        if head in tags:
            kept.append(token)
    return kept

In [164]:
def text_from_tokens(tokens):
    """Return the ``text`` attribute of each token, in order, as a list."""
    texts = []
    for token in tokens:
        texts.append(token.text)
    return texts

In [165]:
def get_filtered_text(data, tags):
    """Add tag-filtered token columns to ``data`` and return it.

    ``data`` is modified in place, so callers that need the original frame
    untouched should pass a copy. Two columns are written:

    * ``filtered_tokens`` -- ``filter_tags(tokens, tags=tags)`` for each row.
    * ``filtered_tokens_text`` -- the ``text`` of those tokens joined with
      single spaces.

    Args:
        data: DataFrame with a ``tokens`` column of token sequences.
        tags: Leading tag components to keep; ``None`` keeps every token.

    Returns:
        The same ``data`` object with the two columns added.
    """
    # Bind the tag selection once so Series.apply can call it per row.
    keep_selected = partial(filter_tags, tags=tags)
    data['filtered_tokens'] = data['tokens'].apply(keep_selected)
    # Per row: list of tokens -> list of their texts -> one space-joined string.
    data['filtered_tokens_text'] = (
        data['filtered_tokens'].apply(text_from_tokens).str.join(' ')
    )
    return data

In [115]:
# Tag prefixes (the part of a token's tag before the first ':') grouped by
# part of speech; these lists are passed as `tags` to get_filtered_text.
# Presumably grammatical classes of the NKJP tagset -- TODO confirm against the tagger.
noun_forms = [
    'subst',
    'depr',
]
verb_forms = [
    'fin', 'bedzie', 'aglt', 'praet', 'impt', 'imps',
    'inf', 'pcon', 'pant', 'ger', 'pact', 'ppas',
]
# NOTE: the name is spelled `adjective_froms` (sic); it is kept because later
# cells reference it under this spelling.
adjective_froms = [
    'adj',
    'adja',
    'adjc',
    'adjp',
]

In [166]:
# Build one (train, test) pair per tag group. Every call receives a copy, so
# df_train / df_test themselves are not modified by get_filtered_text.

# No tag filter: all tokens are kept.
df_train_base, df_test_base = (
    get_filtered_text(frame.copy(), None) for frame in (df_train, df_test)
)

# Nouns only.
df_train_nouns, df_test_nouns = (
    get_filtered_text(frame.copy(), noun_forms) for frame in (df_train, df_test)
)

# Verbs only.
df_train_verbs, df_test_verbs = (
    get_filtered_text(frame.copy(), verb_forms) for frame in (df_train, df_test)
)

# Adjectives only.
df_train_adjs, df_test_adjs = (
    get_filtered_text(frame.copy(), adjective_froms) for frame in (df_train, df_test)
)

In [167]:
# Check that the filtered_tokens / filtered_tokens_text columns were added.
df_train_base.head()

Unnamed: 0,text,cat,filename,tokens,tokens_text,filtered_tokens,filtered_tokens_text
0,"Pierwiastki biofilne, pierwiastki biogenne, nu...",Pierwiastki-chemiczne,Pierwiastki-chemiczne_1857348.txt,"[(pierwiastek, subst:pl:nom:m3), (biofilny, ad...","pierwiastek biofilny , pierwiastek biogenny , ...","[(pierwiastek, subst:pl:nom:m3), (biofilny, ad...","pierwiastek biofilny , pierwiastek biogenny , ..."
1,"Robocizna, to praca włożona w wykonanie określ...",Rachunkowosc,Rachunkowosc_1691781.txt,"[(robocizna, subst:sg:nom:f), (,, interp), (to...","robocizna , to praca włożyć w wykonanie określ...","[(robocizna, subst:sg:nom:f), (,, interp), (to...","robocizna , to praca włożyć w wykonanie określ..."
2,Jonathan Cheecho (ur. 15 lipca 1980 w Moose Fa...,Pilka-nozna,Pilka-nozna_3022011.txt,"[(Jonathan, ign), (Cheecho, ign), ((, interp),...",Jonathan Cheecho ( ur . 15 lipiec 1980 w Moose...,"[(Jonathan, ign), (Cheecho, ign), ((, interp),...",Jonathan Cheecho ( ur . 15 lipiec 1980 w Moose...
3,Florin Daniel Bratu (ur. 2 stycznia 1980 w Buk...,Pilka-nozna,Pilka-nozna_2293521.txt,"[(Florin, ign), (Daniel, subst:sg:nom:m1), (br...",Florin Daniel brat ( ur . 2 styczeń 1980 w Buk...,"[(Florin, ign), (Daniel, subst:sg:nom:m1), (br...",Florin Daniel brat ( ur . 2 styczeń 1980 w Buk...
4,"Iterb (Yb, łac. ytterbium) – pierwiastek chemi...",Pierwiastki-chemiczne,Pierwiastki-chemiczne_30496.txt,"[(iterb, subst:sg:nom:m3), ((, interp), (Yb, i...","iterb ( Yb , łac . ytterbium ) – pierwiastek c...","[(iterb, subst:sg:nom:m3), ((, interp), (Yb, i...","iterb ( Yb , łac . ytterbium ) – pierwiastek c..."


### Classification

In [150]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [151]:
# Text classifier: token counts -> tf-idf weighting -> multinomial naive Bayes.
pipeline = Pipeline([
    # Tokenise each document string and count term occurrences.
    ('vect', CountVectorizer()),
    # Re-weight the counts with (smoothed) inverse document frequency.
    ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True)),
    # alpha=1 is the additive smoothing parameter of the naive Bayes model.
    ('clf', MultinomialNB(alpha=1))
])

In [161]:
def compare_results(pipeline, datasets, descriptions, tagger_name, text_column=None):
    """Fit and score ``pipeline`` on each (train, test) pair.

    Args:
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted from scratch for every dataset pair.
        datasets: Iterable of ``(train, test)`` DataFrame pairs. Each frame
            needs a ``cat`` label column and a text column.
        descriptions: One label per dataset pair, in the same order.
        tagger_name: Value stored in the ``tagger`` column of every row.
        text_column: Name of the column holding the input text. When
            ``None``, ``filtered_tokens_text`` is used if the training frame
            has it (so tag-filtered variants are actually compared on their
            filtered text), otherwise ``tokens_text``.

    Returns:
        DataFrame with one row per dataset pair and the columns ``desc``,
        ``f1``, ``acc``, ``prec``, ``rec`` and ``tagger``. F1, precision and
        recall are macro-averaged.

    Raises:
        ValueError: If ``datasets`` and ``descriptions`` differ in length.
    """
    datasets = list(datasets)
    descriptions = list(descriptions)
    if len(datasets) != len(descriptions):
        # zip() would silently drop the unmatched entries.
        raise ValueError(
            'datasets and descriptions must have the same length '
            '(got {} and {})'.format(len(datasets), len(descriptions))
        )

    records = []
    for (train, test), desc in zip(datasets, descriptions):
        column = text_column
        if column is None:
            if 'filtered_tokens_text' in train.columns:
                column = 'filtered_tokens_text'
            else:
                column = 'tokens_text'

        train_X = train[column].values
        train_y = train['cat']
        test_X = test[column].values
        test_y = test['cat']

        pipeline.fit(train_X, train_y)
        preds = pipeline.predict(test_X)

        # Collect plain dicts and build the frame once at the end;
        # DataFrame.append returned a new frame (the result was previously
        # discarded) and no longer exists in pandas 2.0.
        records.append({
            'tagger': tagger_name,
            'desc': desc,
            'f1': metrics.f1_score(y_true=test_y, y_pred=preds, average='macro'),
            'rec': metrics.recall_score(y_true=test_y, y_pred=preds, average='macro'),
            'prec': metrics.precision_score(y_true=test_y, y_pred=preds, average='macro'),
            'acc': metrics.accuracy_score(y_true=test_y, y_pred=preds),
        })

    return pd.DataFrame(
        records, columns=['desc', 'f1', 'acc', 'prec', 'rec', 'tagger']
    )

In [None]:
# Score the same pipeline on every tag-filtered variant of the corpus.
# NOTE(review): tagger_name assumes the pickles were produced with
# ClarinTagger (the only tagger this notebook imports) -- confirm.
df_results = compare_results(
    pipeline=pipeline,
    datasets=[
        (df_train_base, df_test_base),
        (df_train_nouns, df_test_nouns),
        (df_train_verbs, df_test_verbs),
        (df_train_adjs, df_test_adjs),
    ],
    descriptions=['base', 'nouns', 'verbs', 'adjectives'],
    tagger_name=ClarinTagger.__name__,
)

In [152]:
# Features are the space-joined token strings, labels the category names.
# NOTE(review): 'tokens_text' is not among the columns shown by the
# df_train.head() / df_test.head() outputs earlier in the notebook -- confirm
# the loaded frames actually carry it on a fresh kernel.
train_X, train_y = df_train['tokens_text'].values, df_train['cat']
test_X, test_y = df_test['tokens_text'].values, df_test['cat']

In [155]:
# Fit every pipeline step (vectorizer, tf-idf, classifier) on the training split.
pipeline.fit(train_X, train_y)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=1))])

In [156]:
# Predicted category for each test document.
preds = pipeline.predict(test_X)

In [159]:
# Per-category precision / recall / F1 on the test split.
print(metrics.classification_report(y_true=test_y, y_pred=preds))

                                   precision    recall  f1-score   support

                          Albania       0.90      0.61      0.73        92
             Amerykanscy-prozaicy       0.76      0.99      0.86       103
                         Arabowie       1.00      0.21      0.35        33
                     Astronautyka       0.90      0.85      0.87        93
                          Choroby       0.91      0.89      0.90        81
                            Egipt       0.88      0.71      0.78        92
                  Ekologia-roslin       0.85      0.95      0.90        86
                  Filmy-animowane       0.96      0.85      0.90        92
                    Galezie-prawa       0.95      0.41      0.57        90
                  Gry-komputerowe       0.94      0.90      0.92        98
                       Karkonosze       0.86      0.99      0.92        82
                       Katolicyzm       0.78      0.95      0.86        84
                        

In [138]:
# Standalone bag-of-words vectorizer producing raw term counts.
tf_vectorizer = CountVectorizer()  # raw counts, i.e. term frequency

# Learn the vocabulary from the training texts and build their count matrix.
X_train_tf = tf_vectorizer.fit_transform(df_train['tokens_text'])


