In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support, classification_report
from itertools import chain
import os
from xml.etree import ElementTree as ET
from tempfile import NamedTemporaryFile
import numpy as np
import csv
from collections import defaultdict,Counter,namedtuple
import fastText 

In [2]:
def fullname(header_elem):
    """Return ``(reference, name)`` for one person element of a header file.

    ``reference`` is the element's ``xml:id`` attribute prefixed with ``#``.
    ``name`` is the text of the element's direct ``nsa:persName`` child,
    resolved through the notebook-level ``ns`` prefix map.

    Raises ``KeyError`` when the element has no ``xml:id`` attribute and
    ``AttributeError`` when it has no ``persName`` child.
    """
    xml_id_key = '{http://www.w3.org/XML/1998/namespace}id'
    reference = "#" + header_elem.attrib[xml_id_key]
    name_node = header_elem.find('./nsa:persName', ns)
    return reference, name_node.text

In [3]:
# Load every session under `root` and build the speaker lookup tables.
root = "PSC/Posiedzenia/kadencja8"
# os.listdir returns entries in arbitrary order and may contain stray files.
# Keep only directories and sort them so every run visits the sessions in
# the same order.
dirs = sorted(d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d)))
ns = {'nsa': 'http://www.tei-c.org/ns/1.0'}

# Both iterators are lazy: a document is parsed only when it is consumed
# further down, and each iterator can be consumed exactly once.
texts = map(lambda d: ET.parse(f"{root}/{d}/text_structure.xml").getroot(), dirs)
headers = map(lambda d: ET.parse(f"{root}/{d}/header.xml").getroot(), dirs)

person_elems = chain.from_iterable(map(lambda x: x.findall(".//nsa:person", ns), headers))
persons = map(fullname, person_elems)
# (id, short name) pairs, where the short name is the last two
# whitespace-separated tokens of the full name. The set removes duplicates;
# sorting it removes the dependence on set iteration order, which can change
# between interpreter runs.
id_name = [(x[0], " ".join(x[1].split()[-2:])) for x in sorted(set(persons))]
# short name -> id. When several ids share a short name, the id that sorts
# last wins (previously the winner depended on set iteration order).
name_id_dict = {short_name: person_id for person_id, short_name in id_name}

# One (who, text) pair per <u> element, in document order.
texts_elems = chain.from_iterable(map(lambda e: e.findall('.//nsa:u', ns), texts))
sentences = [(e.get('who'), e.text) for e in texts_elems]
# Number of <u> elements per `who` value.
counted_sentences = Counter(who for who, _text in sentences)


In [4]:
# Fill name_partia from poslowie.csv (';'-separated): column 0 is matched
# against the short names in name_id_dict, column 1 is stored as the value.
# NOTE: despite its name, name_partia is keyed by speaker id, not by name.
name_partia = dict()
id_partia = dict()
# The csv module asks for newline='' when it is handed a file object.
with open('poslowie.csv', newline='') as csv_file:
    reader = csv.reader(csv_file, delimiter=';')
    for row in reader:
        # Blank lines are returned as [] and truncated rows lack the second
        # column; skip both instead of failing with IndexError.
        if len(row) < 2:
            continue
        if row[0] in name_id_dict:
            name_partia[name_id_dict[row[0]]] = row[1]
    

In [5]:
# Party of every speaker id listed in id_name. name_partia is keyed by
# speaker id, so the lookup must use the id (first tuple element); looking it
# up by short name never matched and mapped every id to itself. Speakers
# without a party entry fall back to their own id.
id_partia = {
    speaker_id: name_partia.get(speaker_id, speaker_id)
    for speaker_id, _short_name in id_name
}

# Utterance totals per party, accumulated from the per-speaker counts in
# descending order of count. A `who` value that is not in id_partia is
# counted under its own key instead of raising KeyError.
counting = defaultdict(int)
for who, n_utterances in counted_sentences.most_common():
    counting[id_partia.get(who, who)] += n_utterances
    

In [6]:
# Record type for one utterance: party label, utterance text, speaker id.
Sentence = namedtuple('Sentence', ['partia', 'text', 'person'])

# Keep only utterances whose speaker has an entry in name_partia and attach
# that entry as the label.
tagged_sentences = [
    Sentence(partia=name_partia[who], text=text, person=who)
    for who, text in sentences
    if who in name_partia
]
        

In [7]:
# Build one ("__label__<partia> <text>", partia) pair per tagged utterance.
# Each string later becomes exactly one line of the fastText input file, so
# line breaks inside an utterance are replaced by single spaces. Utterances
# without any text (text is None) are dropped; formatting them would yield
# the literal word "None".
data = [
    (f"__label__{s.partia} {' '.join(s.text.splitlines())}", s.partia)
    for s in tagged_sentences
    if s.text is not None
]
# zip(*data) cannot be unpacked into two names when data is empty; fail with
# a readable message instead.
assert data, "no utterance could be matched to a party"

# data_x: labelled lines, data_y: bare party names (used for stratification).
data_x, data_y = zip(*data)
x_data = np.array(data_x)

In [8]:
def predict(test_file):
    """Run the fastText command-line ``predict`` command on ``test_file``.

    Executes ``FASTTEXT_PATH predict <MODEL_NAME>.bin <test_file>`` and
    returns the decoded standard output as a list with one entry per line.
    ``FASTTEXT_PATH`` and ``MODEL_NAME`` are read from the notebook
    namespace; they are not defined in the visible cells and must be set
    before this function is called.

    Raises ``subprocess.CalledProcessError`` when the command exits with a
    non-zero status.
    """
    # Imported locally: the visible import cell does not provide check_output.
    from subprocess import check_output

    raw_output = check_output([FASTTEXT_PATH, 'predict', f'{MODEL_NAME}.bin', test_file])
    # splitlines() accepts both "\n" and "\r\n" line endings and does not
    # produce a trailing empty entry. Splitting on "\r\n" alone returned an
    # empty list whenever the output used plain "\n".
    return raw_output.decode().splitlines()

In [9]:
def print_results(y_true, predictions, average):
    """Print precision, recall and F1 for ``predictions`` against ``y_true``.

    ``average`` is forwarded unchanged to ``precision_recall_fscore_support``
    and printed first as a heading. The three scores follow, one per line.
    The fourth value returned by the scorer is ignored. Returns ``None``.
    """
    precision, recall, fscore, _unused = precision_recall_fscore_support(
        y_true, predictions, average=average
    )
    report = (('Precision', precision), ('Recall', recall), ('F1', fscore))
    print(average)
    for caption, score in report:
        print(caption, score)

In [16]:
# One stratified split with 25% of the examples held out for testing. The
# fixed random_state makes the split repeatable from run to run.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

for train_index, test_index in stratified_split.split(data_x, data_y):
    x_train, x_test = x_data[train_index], x_data[test_index]
    # fastText is given the training file by name, so the data has to be
    # completely on disk first: each with-block flushes and closes its file
    # before train_supervised runs. UTF-8 is requested explicitly instead of
    # relying on the platform default encoding.
    # delete=False leaves both files on disk after closing; remove them with
    # os.remove(train_file.name) / os.remove(test_file.name) once they are
    # no longer needed.
    with NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) as train_file:
        train_file.writelines(line + "\n" for line in x_train)
    with NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) as test_file:
        test_file.writelines(line + "\n" for line in x_test)
    model = fastText.train_supervised(
        input=train_file.name, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
    # Every line has the form "__label__<partia> <text>". The part before the
    # first space is the gold label; only the text after it is handed to the
    # model, so the gold label is never part of the prediction input.
    y_true = []
    predictions = []
    for line in x_test:
        gold_label, _separator, text = line.partition(' ')
        y_true.append(gold_label)
        predictions.append(model.predict(text)[0][0])
    print_results(y_true, predictions, average='micro')
    print(classification_report(y_true, predictions))
    
    
 


micro
Precision 0.551045093864912
Recall 0.551045093864912
F1 0.551045093864912
                     precision    recall  f1-score   support

   __label__Kukiz15       0.46      0.29      0.35      2768
__label__Nowoczesna       0.43      0.32      0.37      4108
       __label__PIS       0.57      0.57      0.57     12479
        __label__PO       0.57      0.73      0.64     16177
       __label__PSL       0.49      0.33      0.40      3157
       __label__WIS       0.67      0.41      0.51      1544
     __label__other       0.33      0.11      0.17      1103

        avg / total       0.54      0.55      0.54     41336



In [18]:
predictions

['__label__PIS',
 '__label__PO',
 '__label__Nowoczesna',
 '__label__PO',
 '__label__PIS',
 '__label__PIS',
 '__label__PIS',
 '__label__PIS',
 '__label__PIS',
 '__label__WIS',
 '__label__PO',
 '__label__PO',
 '__label__PIS',
 '__label__PIS',
 '__label__PO',
 '__label__PO',
 '__label__PIS',
 '__label__PSL',
 '__label__PO',
 '__label__PIS',
 '__label__PO',
 '__label__Kukiz15',
 '__label__PO',
 '__label__Nowoczesna',
 '__label__PIS',
 '__label__PO',
 '__label__PO',
 '__label__Nowoczesna',
 '__label__Kukiz15',
 '__label__PIS',
 '__label__PIS',
 '__label__PO',
 '__label__Nowoczesna',
 '__label__PIS',
 '__label__PO',
 '__label__PO',
 '__label__Kukiz15',
 '__label__PO',
 '__label__PO',
 '__label__PO',
 '__label__PO',
 '__label__PO',
 '__label__PSL',
 '__label__PIS',
 '__label__Nowoczesna',
 '__label__PO',
 '__label__PO',
 '__label__Nowoczesna',
 '__label__PO',
 '__label__PIS',
 '__label__PO',
 '__label__PO',
 '__label__PIS',
 '__label__PO',
 '__label__PO',
 '__label__PSL',
 '__label__PIS',
 '_