In [14]:
from nltk.classify import NaiveBayesClassifier
import sqlite3
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import utils.data_prep as data_prep

In [2]:
data_set = data_prep.get_dataset()
data_set.head()

Unnamed: 0,company,year,page,paragraph_nr,paragraph,section,class
0,BASF,2016,4,64,154 Consolidated Financial Statements BASF R...,auditors report,3
1,BASF,2016,4,65,Auditor’s report,auditors report,3
2,BASF,2016,4,66,Auditor’s report,auditors report,3
3,BASF,2016,4,67,We have audited the consolidated financial sta...,auditors report,3
4,BASF,2016,4,68,assurance. Knowledge of the business activitie...,auditors report,3


In [65]:
sections = (data_set.groupby(['company','year','section','class','page'])['paragraph']
            .apply(lambda x: ' '.join(x))
            .reset_index())
sections.head(20)

Unnamed: 0,company,year,section,class,page,paragraph
0,Adva,2016,auditors fee,2,61,Supervisory \nBoard Management \nBoard Welcome...
1,Adva,2016,auditors report,3,71,Supervisory \nBoard Management \nBoard Welcome...
2,Adva,2016,business combinations,5,25,The final purchase price allocation according ...
3,Adva,2016,business combinations,5,26,106 ADVA Optical Networking – Annua...
4,Adva,2016,business combinations,5,27,Supervisory \nBoard Management \nBoard Welcome...
5,Adva,2016,cash flow notes,6,50,130 ADVA Optical Networking – Annua...
6,Adva,2016,cash_equiv,7,28,108 ADVA Optical Networking – Annua...
7,Adva,2016,eps,11,50,130 ADVA Optical Networking – Annua...
8,Adva,2016,equity,12,41,Supervisory \nBoard Management \nBoard Welcome...
9,Adva,2016,equity,12,42,122 ADVA Optical Networking – Annua...


In [72]:
all_words = sections.groupby(lambda x: True)['paragraph'].apply(lambda x: ' '.join(x)).tolist()
all_words = nltk.word_tokenize(all_words[0])
words = nltk.FreqDist(w.lower() for w in all_words)
# if w.lower() not in stopwords.words('english')

In [73]:
word_features = list(words)[:3000]

def document_features(section):
    section_words = set([word.lower() for word in nltk.word_tokenize(section)])
    features = {}
    for word in word_features:
        features[word] = (word.lower() in section_words)
    return features


sections['features'] = sections['paragraph'].apply(document_features)
sections.head()

Unnamed: 0,company,year,section,class,page,paragraph,features
0,Adva,2016,auditors fee,2,61,Supervisory \nBoard Management \nBoard Welcome...,"{'supervisory': True, 'board': True, 'manageme..."
1,Adva,2016,auditors report,3,71,Supervisory \nBoard Management \nBoard Welcome...,"{'supervisory': True, 'board': True, 'manageme..."
2,Adva,2016,business combinations,5,25,The final purchase price allocation according ...,"{'supervisory': True, 'board': True, 'manageme..."
3,Adva,2016,business combinations,5,26,106 ADVA Optical Networking – Annua...,"{'supervisory': False, 'board': False, 'manage..."
4,Adva,2016,business combinations,5,27,Supervisory \nBoard Management \nBoard Welcome...,"{'supervisory': True, 'board': True, 'manageme..."


In [74]:
feature_set = list(zip(sections['features'].values,sections['class'].values))
len(feature_set)
train_set, test_set = feature_set[:2000], feature_set[2000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.627731092437


In [71]:
classifier.show_most_informative_features(30)

Most Informative Features
                     fee = True               19 : 17     =    182.0 : 1.0
                  berlin = True               19 : 17     =    182.0 : 1.0
                      61 = True               19 : 17     =    182.0 : 1.0
                     607 = True               19 : 17     =    182.0 : 1.0
               workforce = True               19 : 17     =    182.0 : 1.0
                  behalf = True               19 : 17     =    182.0 : 1.0
                     317 = True                3 : 17     =    134.0 : 1.0
                    fees = True                2 : 15     =    121.0 : 1.0
                       , = False              37 : 17     =    118.0 : 1.0
                    days = True               47 : 17     =    110.0 : 1.0
                 network = True               19 : 17     =    109.2 : 1.0
               subsidies = True               19 : 15     =    103.0 : 1.0
              government = True               19 : 15     =    103.0 : 1.0