In [22]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

from sklearn.metrics import classification_report, confusion_matrix
import itertools

class Document:
    def __init__(self, id, text):
        self.text = text
        self.id = id
        self.sentence_delim = " . "
        self.splitted_text = self.__split_text()

    def __split_text(self):
        return self.text.split(self.sentence_delim)

    def get_sentence(self, id):
        return self.splitted_text[id-1]


In [23]:
def load_documents():
    texts_path = '../data/texts'
    files = os.listdir(texts_path)
    documents = list()
    for filename in files:
        text = open(os.path.join(texts_path, filename), 'r').read()
        document = Document(filename, text)
        documents.append(document)
    return documents

In [89]:
def filter_name_numbers(text):
    
    word_numbers = {
        'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'nineth', 'tenth',
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
        '1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th',
        'pair'
    }
    
    tokens = text.split()
    filtered_tokens = list()
    
#     filtered_tokens.append(tokens[0])  # Append the first token
    
    for i, token in enumerate(tokens):
        if i == 0 and token.lower() not in stopwords.words('english'):  # Most likely a name
            filtered_tokens.append("##name##")
        if token[0].isupper():  # It's a name
            filtered_tokens.append("##name##")
        elif token.isdigit() or token in word_numbers:  # It's a number
            filtered_tokens.append("##number##")
        else:
            filtered_tokens.append(token)
    return " ".join(filtered_tokens)

In [90]:
documents = load_documents()
annotations = pd.read_csv('../data/gsml.csv')
dataset = list()

X = list()
y = list()

for document in documents:
    document_annotations = annotations[annotations['TEXT_ID'] == document.id]
    for line_id in range(1, len(document.splitted_text)+1):
        sentence = document.get_sentence(line_id)
        sentence_annotations = document_annotations[document_annotations['SENTENCE_ID'] == line_id]
        if len(sentence_annotations):
            classes = set(sentence_annotations['TYPE'].tolist())
            classes = classes - {'NAME', 'OTHER', 'NUMBER'}
#             classes = classes - {'CONTEXT', 'NOT_CHECKABLE', 'OTHER'}
            if len(classes):
                classes = list(classes)
                X.append(filter_name_numbers(sentence))
                y.append(classes)
#         else:
#             classes = ['NONE']
#             X.append(sentence)
#             y.append(classes)

In [91]:
classes = sorted(list(set([c for cs in y for c in cs])))

In [92]:
classes

['CONTEXT', 'NOT_CHECKABLE', 'WORD']

In [93]:
X

['##name## ##name## ##name## defeated the host ##name## ##name## , ##number## - ##number## , at ##name## ##name## ##name## on ##name##',
 '##name## ##name## , the ##name## battled back in the ##number## , outscoring the ##name## by a ##number## - ##number## margin over the final ##number## minutes',
 "##name## ##name## were led by ##name## ##name## 's ##number## points , which he supplemented with ##number## rebounds , ##number## assists and a ##number## of blocks",
 '##name## ##name## ##name## was right behind him with ##number## points , ##number## assists , ##number## rebounds and a steal',
 '##name## ##name## ##name## led the bench with ##number## points , ##number## rebounds , an assist and a steal',
 "##name## ##name## was led by ##name## ##name## 's ##number## points , which he supplemented with ##number## rebounds , ##number## assists , a steal and a block",
 '##name## ##name## ##name## was productive in a reserve role as well with ##number## points , ##number## rebounds , ##nu

In [94]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

In [119]:
tokenizer = RegexpTokenizer(r'\w+')
text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            tokenizer=tokenizer.tokenize,
            lowercase=True,
#             ngram_range=(1, 3),
            stop_words=stopwords.words('english')
        )),
        ('clf', OneVsRestClassifier(LogisticRegression(C=0.5, class_weight='balanced'))),
    ])

In [120]:
cross_val_score(text_clf, X, y, cv=10)

array([0.61764706, 0.67647059, 0.61764706, 0.58823529, 0.55882353,
       0.61764706, 0.51515152, 0.6969697 , 0.42424242, 0.51515152])

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [122]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='\\w+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)>)),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=0.5,
                                                               

In [123]:
y_train

array([[0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0,

In [124]:
def evaluate_baseline(model, X_test, y_test):
#     logging.info("Predicting on test...")
    predicted = model.predict(X_test)
    acc = np.mean(predicted == y_test)
#     logging.info("Accuracy on test: {}".format(acc))
    print(classification_report(y_test, predicted, target_names=classes))
#     plot_confusion_matrix(confusion_matrix(y_test, predicted), sorted(set(y_test)))


    # Analyzing results here
    clf = model.steps[1][1]
    tf_idf_vectorizer = model.steps[0][1]
    feature_names = tf_idf_vectorizer.get_feature_names()
    for cls, coefs in zip(clf.classes_, clf.coef_):
        print("="*20)
        print(classes[cls])
        weighted_coefs = tf_idf_vectorizer.idf_ * coefs
        sorted_coefs = weighted_coefs.argsort()
#         sorted_coefs = coefs.argsort()

        topk_good_words = sorted_coefs[-10:][::-1]
        good_words = {feature_names[i] for i in topk_good_words}
        print("Top good words: {}".format(good_words))

        topk_bad_words = sorted_coefs[:10][::-1]
        bad_words = {feature_names[i] for i in topk_bad_words}
        print("Top bad words: {}".format(bad_words))
        print("=" * 20)

In [125]:
evaluate_baseline(text_clf, X_test, y_test)

               precision    recall  f1-score   support

      CONTEXT       0.30      0.62      0.41        16
NOT_CHECKABLE       0.65      0.92      0.76        12
         WORD       0.89      0.83      0.86        89

    micro avg       0.71      0.81      0.76       117
    macro avg       0.61      0.79      0.68       117
 weighted avg       0.79      0.81      0.79       117
  samples avg       0.74      0.82      0.76       117

CONTEXT
Top good words: {'n', 'added', 'shooting', 'without', 'simply', 'followed', 'enough', 'unit', 'chipped', 'number'}
Top bad words: {'last', 'double', 'team', 'back', 'half', 'defeated', 'games', 'averaging', 'home', 'high'}
NOT_CHECKABLE
Top good words: {'last', 'year', 'seed', 'time', 'win', 'remain', 'games', 'averaging', 'place', '000s'}
Top bad words: {'double', 'figures', 'steal', 'bench', 'back', 'struggle', 'next', 'led', 'assists', 'home'}
WORD
Top good words: {'starter', 'double', 'blocks', 'figures', 'team', 'back', 'half', 'defeated'

  _warn_prf(average, modifier, msg_start, len(result))


In [126]:
text_clf.predict(X_train)

array([[0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [1, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0,

In [127]:
y_test

array([[1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0,

In [128]:
text_clf.steps[0][1].idf_

array([4.81109709, 3.42480273, 5.03424064, 3.47609602, 4.81109709,
       5.32192271, 5.72738782, 5.72738782, 4.62877553, 5.72738782,
       5.32192271, 5.72738782, 5.32192271, 4.34109346, 3.58732166,
       4.81109709, 5.72738782, 5.72738782, 5.03424064, 3.58732166,
       2.17203976, 5.72738782, 4.62877553, 5.32192271, 4.34109346,
       3.47609602, 5.72738782, 5.03424064, 4.81109709, 5.72738782,
       5.72738782, 4.81109709, 3.08833049, 5.72738782, 5.72738782,
       3.64794628, 5.72738782, 3.7124848 , 5.32192271, 5.72738782,
       5.72738782, 5.72738782, 5.72738782, 5.72738782, 3.93562835,
       5.72738782, 5.72738782, 5.32192271, 4.47462485, 5.72738782,
       5.32192271, 5.72738782, 5.32192271, 5.72738782, 5.72738782,
       5.72738782, 5.72738782, 5.32192271, 5.03424064, 4.47462485,
       5.32192271, 5.72738782, 5.72738782, 5.72738782, 5.03424064,
       5.32192271, 5.72738782, 5.32192271, 4.81109709, 5.72738782,
       4.22331042, 5.72738782, 5.72738782, 5.72738782, 5.72738

In [129]:
text_clf.steps[1][1].coef_

array([[-0.10529897,  0.43721532, -0.11865629, -0.18012573, -0.21671495,
        -0.13052843, -0.031392  , -0.02611262,  0.47998985, -0.0485983 ,
        -0.05414666, -0.02428087, -0.06951219,  0.22730077, -0.13053578,
         0.20726975, -0.01971013, -0.02451283, -0.06835176,  0.31789151,
         0.23690675, -0.02986193, -0.25969381, -0.03097394, -0.18991776,
        -0.34295572, -0.02787007, -0.07684   , -0.13808456, -0.02986193,
        -0.02053417,  0.24250026, -0.07966806, -0.031392  , -0.02804636,
         0.30131039, -0.04793233, -0.18606396, -0.04701649, -0.04469599,
        -0.0300361 , -0.02655533,  0.33785611, -0.01996466, -0.16157814,
        -0.01996466, -0.02951538, -0.07516957,  0.6435621 , -0.03641557,
        -0.04721984, -0.0742293 , -0.04937287, -0.02951538, -0.03209369,
        -0.02871391, -0.02969189, -0.0504429 , -0.1296845 , -0.20639206,
        -0.12959711, -0.02611262, -0.02751997, -0.02451283, -0.06694758,
        -0.0504429 , -0.01878504, -0.05985994, -0.1

In [130]:
text_clf.steps[1][1].coef_.dot(text_clf.steps[0][1].idf_)

array([-10.53367851,  29.3479231 ,   4.44039555])

In [131]:
tf_idf = text_clf.steps[0][1]
reg = text_clf.steps[1][1]
X[0]
X[0], text_clf.predict_proba([X[0]])

('##name## ##name## ##name## defeated the host ##name## ##name## , ##number## - ##number## , at ##name## ##name## ##name## on ##name##',
 array([[0.26796216, 0.26320074, 0.64754493]]))

In [132]:
tf_idf.vocabulary_.get('on')

In [133]:
tf_idf.idf_[127], reg.coef_[0][127]

(4.34109345759245, -0.1903564942678355)

In [134]:
example = 5
instance = X[example]
print(instance)
print(y[example])
probas = text_clf.predict_proba([instance])

scores = list()
for i, p in enumerate(probas[0]):
    print(classes[i])
    print(p)
    class_scores = list()
    for w in instance.split():
        idx = tf_idf.vocabulary_.get(w)
        if idx:
            idf = tf_idf.idf_[idx]
            coef = reg.coef_[i][idx]
            class_scores.append(idf * coef)
        else:
            class_scores.append(0)
    print(class_scores)
    scores.append(class_scores)

# for i, class_scores in enumerate(scores):
#     print(classes[i])
#     print(class_scores)
#     print(X[0])

##name## ##name## was led by ##name## ##name## 's ##number## points , which he supplemented with ##number## rebounds , ##number## assists , a steal and a block
[0 0 1]
CONTEXT
0.4862691086297272
[0, 0, 0, 0.30441943765438606, 0, 0, 0, 0, 0, 2.0139107914378087, 0, 0, 0, -0.7682749655520532, 0, 0, 0.8182756493615077, 0, 0, 0.5145708731680021, 0, 0, 1.0448437076129473, 0, 0, 1.0991641070134657]
NOT_CHECKABLE
0.19721862222649178
[0, 0, 0, -1.6069081980192832, 0, 0, 0, 0, 0, -0.9632059030865238, 0, 0, 0, -0.29843789683835914, 0, 0, -0.8865456790495801, 0, 0, -1.7874152000202066, 0, 0, -1.2946191699556442, 0, 0, -1.0181164583964157]
WORD
0.5476658701879848
[0, 0, 0, 1.1542372816209336, 0, 0, 0, 0, 0, -1.0183458018930225, 0, 0, 0, 0.680458908182392, 0, 0, -0.6426821051637145, 0, 0, -0.30763924056532926, 0, 0, -0.4950268238076635, 0, 0, 0.14972142683017545]


In [135]:
import joblib
joblib.dump(text_clf, 'text_clf.joblib')

['text_clf.joblib']

In [136]:
new_clf = joblib.load('text_clf.joblib')

In [137]:
new_clf.predict_proba([instance])

array([[0.48626911, 0.19721862, 0.54766587]])