In [376]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

from sklearn.metrics import classification_report, confusion_matrix
import itertools

class Document:
    """A text file split into sentences on the literal delimiter ``" . "``.

    Attributes are set in ``__init__``:
      * ``text``           - the raw text as given.
      * ``id``             - the identifier as given (callers pass the file name).
      * ``sentence_delim`` - the delimiter used for splitting.
      * ``splitted_text``  - list of sentence strings produced by the split.
    """

    def __init__(self, id, text):
        self.text = text
        self.id = id
        self.sentence_delim = " . "
        self.splitted_text = self.__split_text()

    def __split_text(self):
        """Return ``text`` split on ``sentence_delim``."""
        return self.text.split(self.sentence_delim)

    def get_sentence(self, id):
        """Return the sentence with the given 1-based ``id``.

        Raises
        ------
        IndexError
            If ``id`` is outside ``1..len(splitted_text)``. Without this check
            an id of 0 (or a negative id) would be turned into a negative list
            index and silently return a sentence counted from the end.
        """
        if not 1 <= id <= len(self.splitted_text):
            raise IndexError(
                "sentence id {} out of range 1..{}".format(id, len(self.splitted_text))
            )
        return self.splitted_text[id - 1]


In [377]:
def load_documents(texts_path='../data/texts'):
    """Read every file in ``texts_path`` into a ``Document``.

    Parameters
    ----------
    texts_path : str
        Directory holding one text file per document. Defaults to the
        location the notebook has always used.

    Returns
    -------
    list of Document
        One document per directory entry, with the file name as its id.
        Entries are processed in sorted file-name order because the order
        returned by ``os.listdir`` is arbitrary, which would make every
        downstream split depend on the file system.
    """
    documents = []
    for filename in sorted(os.listdir(texts_path)):
        # The context manager closes the handle even if read() fails.
        with open(os.path.join(texts_path, filename), 'r') as handle:
            text = handle.read()
        documents.append(Document(filename, text))
    return documents

In [522]:
# Build the sentence-level data set: X holds sentence strings, y holds the
# list of annotation types attached to each sentence.
documents = load_documents()
annotations = pd.read_csv('../data/gsml.csv')
dataset = list()  # not used in the visible cells; kept in case another cell reads it

X = list()
y = list()

for document in documents:
    document_annotations = annotations[annotations['TEXT_ID'] == document.id]
    # SENTENCE_ID is compared against 1-based positions, matching Document.get_sentence.
    for line_id in range(1, len(document.splitted_text) + 1):
        sentence_annotations = document_annotations[document_annotations['SENTENCE_ID'] == line_id]
        # The temporary is deliberately NOT called `classes`: that name is the
        # notebook-level label list read by evaluate_baseline, and reusing it here
        # would overwrite it whenever this cell is re-run.
        # 'OTHER' is dropped. (An earlier experiment also dropped 'CONTEXT' and
        # 'NOT_CHECKABLE'.)
        sentence_labels = set(sentence_annotations['TYPE'].tolist()) - {'OTHER'}
        # Sentences with no remaining label are skipped. (An earlier experiment
        # kept unannotated sentences under a synthetic 'NONE' label.)
        if sentence_labels:
            X.append(document.get_sentence(line_id))
            # sorted() gives a deterministic label order; a bare set does not.
            y.append(sorted(sentence_labels))

In [523]:
# Alphabetically sorted vocabulary of every label that occurs in y.
classes = sorted({label for labels in y for label in labels})

In [524]:
# Display the label vocabulary computed in the previous cell.
classes

['CONTEXT', 'NAME', 'NOT_CHECKABLE', 'NUMBER', 'WORD']

In [525]:
# Encode the per-sentence label lists as a binary indicator matrix.
# NOTE(review): `y` is rebound to the encoded matrix, so this cell is not
# idempotent — re-running it feeds the already-encoded matrix back into the
# binarizer, and the cell that derives `classes` from `y` must run before it.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

In [526]:
# Word-character tokenizer shared by the TF-IDF step.
tokenizer = RegexpTokenizer(r'\w+')

# TF-IDF features followed by one balanced logistic regression per label.
text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            # Pass the bound method directly: a lambda wrapper adds nothing,
            # looks `tokenizer` up again at call time, and cannot be pickled
            # (which matters for model persistence and parallel evaluation).
            tokenizer=tokenizer.tokenize,
            lowercase=True,
            # ngram_range=(1, 3) was tried here and is currently disabled.
            stop_words=stopwords.words('english')
        )),
        ('clf', OneVsRestClassifier(LogisticRegression(C=0.5, class_weight='balanced'))),
    ])

In [527]:
# 10-fold cross-validation of the whole pipeline on all sentences.
# NOTE(review): no `scoring` is passed, so the pipeline's default `score` is
# used — presumably exact-match (subset) accuracy for multilabel targets, which
# would explain the low values; confirm against the scikit-learn docs.
cross_val_score(text_clf, X, y, cv=10)

array([0.27586207, 0.31034483, 0.32758621, 0.27586207, 0.25862069,
       0.31034483, 0.27586207, 0.27586207, 0.40350877, 0.1754386 ])

In [528]:
# Hold out 33% of the sentences for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [529]:
# Fit the TF-IDF + one-vs-rest logistic regression pipeline on the training split.
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function <lambda> at 0x7fb3729a2d30>)),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=0.5,
                                                                  class_weight='balanced')))])

In [530]:
# Inspect the binarized training labels (one column per label).
y_train

array([[1, 0, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 0, 1, 1],
       ...,
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 0, 0, 0, 1]])

In [555]:
def evaluate_baseline(model, X_test, y_test, class_names=None, top_k=10):
    """Print a classification report and the most influential words per label.

    Parameters
    ----------
    model : fitted two-step pipeline
        Step 0 must expose ``idf_`` and its feature names; step 1 must expose
        ``classes_`` and either ``estimators_`` (one-vs-rest wrapper) or
        ``coef_``.
    X_test : inputs passed unchanged to ``model.predict``.
    y_test : ground-truth labels in a format ``classification_report`` accepts.
    class_names : sequence of str, optional
        Display names indexed by the values in step 1's ``classes_``.
        Defaults to the notebook-level ``classes`` list.
    top_k : int, optional
        Number of words listed on each side for every label (default 10).

    Returns
    -------
    None
        Everything is printed.
    """
    if class_names is None:
        # Fall back to the notebook-level label vocabulary.
        class_names = classes

    predicted = model.predict(X_test)
    print(classification_report(y_test, predicted, target_names=class_names))

    # Analyzing results here
    tf_idf_vectorizer = model.steps[0][1]
    clf = model.steps[1][1]

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(tf_idf_vectorizer, "get_feature_names_out"):
        feature_names = tf_idf_vectorizer.get_feature_names_out()
    else:
        feature_names = tf_idf_vectorizer.get_feature_names()

    # OneVsRestClassifier.coef_ was deprecated and later removed; read the
    # coefficients from the per-label estimators when they are available.
    if hasattr(clf, "estimators_"):
        coef_rows = [np.ravel(estimator.coef_) for estimator in clf.estimators_]
    else:
        coef_rows = clf.coef_

    for cls, coefs in zip(clf.classes_, coef_rows):
        print("="*20)
        print(class_names[cls])
        # Coefficients are scaled by each term's IDF before ranking.
        weighted_coefs = tf_idf_vectorizer.idf_ * coefs
        ranking = weighted_coefs.argsort()

        # Lists (not sets) so the printed order reflects the ranking:
        # strongest positive first, then strongest negative first.
        good_words = [str(feature_names[i]) for i in ranking[::-1][:top_k]]
        print("Top good words: {}".format(good_words))

        bad_words = [str(feature_names[i]) for i in ranking[:top_k]]
        print("Top bad words: {}".format(bad_words))
        print("=" * 20)

In [556]:
# Print the per-label report and the highest/lowest weighted words on the held-out split.
evaluate_baseline(text_clf, X_test, y_test)

               precision    recall  f1-score   support

      CONTEXT       0.17      0.55      0.26        11
         NAME       0.82      0.74      0.78        69
NOT_CHECKABLE       0.50      0.60      0.55        10
       NUMBER       0.69      0.85      0.76        95
         WORD       0.61      0.65      0.63        99

    micro avg       0.63      0.73      0.68       284
    macro avg       0.56      0.68      0.60       284
 weighted avg       0.67      0.73      0.69       284
  samples avg       0.65      0.75      0.66       284

CONTEXT
Top good words: {'lopez', 'johnson', 'chipped', 'followed', '10', 'minutes', '2', 'unit', 'brook', 'simply'}
Top bad words: {'games', 'supplemented', 'friday', '11', 'last', 'percent', 'team', 'home', 'high', 'averaging'}
NAME
Top good words: {'host', 'kings', 'look', 'rockets', 'wednesday', 'friday', 'monday', 'center', 'home', 'defeated'}
Top bad words: {'games', 'points', 'half', 'last', 'three', 'second', 'two', 'double', 'four', '

  _warn_prf(average, modifier, msg_start, len(result))


In [533]:
# Predictions on the training split itself, for comparison against y_train.
text_clf.predict(X_train)

array([[1, 0, 0, 1, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 0, 1, 1],
       ...,
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 0, 0, 0, 1]])

In [456]:
# Inspect the binarized test labels.
# NOTE(review): the output saved with this cell has three columns and an older
# execution count (456) than the cell that builds the labels (522), while
# y_train is shown with five columns — the saved output looks stale; re-run.
y_test

array([[1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0,

In [540]:
# IDF weights held by the fitted TF-IDF step (first pipeline step).
text_clf.steps[0][1].idf_

array([4.127792  , 5.57471098, 3.62880083, 3.06918504, 5.86239305,
       6.26785816, 6.26785816, 6.26785816, 5.57471098, 5.57471098,
       6.26785816, 5.57471098, 5.86239305, 5.86239305, 3.32341918,
       6.26785816, 5.57471098, 6.26785816, 5.86239305, 5.57471098,
       5.86239305, 3.43464482, 6.26785816, 5.86239305, 5.86239305,
       5.57471098, 6.26785816, 3.40565728, 6.26785816, 3.55980796,
       6.26785816, 6.26785816, 3.96527307, 3.74212951, 3.62880083,
       3.9164829 , 4.01656636, 3.52701814, 4.32194801, 4.32194801,
       4.07063358, 4.07063358, 4.32194801, 5.01509519, 5.57471098,
       4.56311007, 4.76378076, 4.07063358, 3.96527307, 4.65842025,
       4.32194801, 4.56311007, 5.16924587, 4.56311007, 4.76378076,
       5.35156743, 5.16924587, 4.47609869, 4.65842025, 3.43464482,
       3.62880083, 4.8815638 , 5.35156743, 4.56311007, 5.16924587,
       5.57471098, 4.8815638 , 5.57471098, 5.57471098, 5.57471098,
       3.9164829 , 5.35156743, 6.26785816, 5.86239305, 6.26785

In [541]:
# Coefficient matrix of the fitted classifier step (one row per label).
text_clf.steps[1][1].coef_

array([[ 0.90683954, -0.06557132,  0.06806231, ..., -0.06548445,
         0.15646182,  0.24146739],
       [-0.29855802,  0.03809915, -0.3960506 , ...,  0.20346196,
        -0.16129374, -0.03164938],
       [-0.24179801,  1.00901827, -0.38692467, ..., -0.07314323,
        -0.08111813, -0.02167191],
       [-0.35060858, -0.18657543,  0.05381775, ...,  0.06789263,
        -0.15446902, -0.06211635],
       [ 0.12793554, -0.09466666,  0.4922605 , ..., -0.09109563,
         0.26933729,  0.06484329]])

In [546]:
# Dot product of every label's coefficient row with the IDF vector: one scalar per label.
text_clf.steps[1][1].coef_.dot(text_clf.steps[0][1].idf_)

array([16.51800645, 21.11435334, 26.13184551, 10.39371625, 20.50593536])