In [3]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

from sklearn.metrics import classification_report, confusion_matrix
import itertools

class Document:
    def __init__(self, id, text):
        self.text = text
        self.id = id
        self.sentence_delim = " . "
        self.splitted_text = self.__split_text()

    def __split_text(self):
        return self.text.split(self.sentence_delim)

    def get_sentence(self, id):
        return self.splitted_text[id-1]


In [4]:
def load_documents():
    texts_path = '../data/texts'
    files = os.listdir(texts_path)
    documents = list()
    for filename in files:
        text = open(os.path.join(texts_path, filename), 'r').read()
        document = Document(filename, text)
        documents.append(document)
    return documents

In [13]:
documents = load_documents()
annotations = pd.read_csv('../data/gsml.csv')
dataset = list()

X = list()
y = list()

for document in documents:
    document_annotations = annotations[annotations['TEXT_ID'] == document.id]
    for line_id in range(1, len(document.splitted_text)+1):
        sentence = document.get_sentence(line_id)
        sentence_annotations = document_annotations[document_annotations['SENTENCE_ID'] == line_id]
        if len(sentence_annotations):
            classes = set(sentence_annotations['TYPE'].tolist())
            intersect = classes & {'CONTEXT', 'WORD', 'NOT_CHECKABLE'}
#             classes = classes - {'CONTEXT', 'NOT_CHECKABLE', 'OTHER'}
            if len(intersect):
#                 classes = list(classes)
                X.append(sentence)
                y.append(1)
            else:
                X.append(sentence)
                y.append(0)
        else:
            X.append(sentence)
            y.append(0)

In [14]:
classes = sorted(list(set(y)))

In [15]:
classes

[0, 1]

In [30]:
tokenizer = RegexpTokenizer(r'\w+')
text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            tokenizer=tokenizer.tokenize,
            lowercase=True,
#             ngram_range=(1, 3),
            stop_words=stopwords.words('english')
        )),
        ('clf', LogisticRegression(C=0.5, class_weight='balanced')),
    ])

In [31]:
cross_val_score(text_clf, X, y, cv=10)

array([0.57894737, 0.53947368, 0.62666667, 0.58666667, 0.70666667,
       0.66666667, 0.6       , 0.68      , 0.66666667, 0.45333333])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='\\w+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)>)),
                ('clf', LogisticRegression(C=0.5, class_weight='balanced'))])

In [34]:
y_train

[0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,


In [35]:
def evaluate_baseline(model, X_test, y_test):
#     logging.info("Predicting on test...")
    predicted = model.predict(X_test)
    acc = np.mean(predicted == y_test)
#     logging.info("Accuracy on test: {}".format(acc))
    print(classification_report(y_test, predicted))
#     plot_confusion_matrix(confusion_matrix(y_test, predicted), sorted(set(y_test)))


    # Analyzing results here
    clf = model.steps[1][1]
    tf_idf_vectorizer = model.steps[0][1]
    feature_names = tf_idf_vectorizer.get_feature_names()
    for cls, coefs in zip(clf.classes_, clf.coef_):
        print("="*20)
        print(classes[cls])
        weighted_coefs = tf_idf_vectorizer.idf_ * coefs
        sorted_coefs = weighted_coefs.argsort()
#         sorted_coefs = coefs.argsort()

        topk_good_words = sorted_coefs[-10:][::-1]
        good_words = {feature_names[i] for i in topk_good_words}
        print("Top good words: {}".format(good_words))

        topk_bad_words = sorted_coefs[:10][::-1]
        bad_words = {feature_names[i] for i in topk_bad_words}
        print("Top bad words: {}".format(bad_words))
        print("=" * 20)

In [36]:
evaluate_baseline(text_clf, X_test, y_test)

              precision    recall  f1-score   support

           0       0.63      0.69      0.66       130
           1       0.63      0.56      0.59       119

    accuracy                           0.63       249
   macro avg       0.63      0.63      0.63       249
weighted avg       0.63      0.63      0.63       249

0
Top good words: {'starter', 'unit', 'next', 'half', 'travel', 'lost', 'figures', 'conference', 'matchup', 'back'}
Top bad words: {'including', 'players', '34', 'shot', 'contributions', 'long', 'defeated', 'percent', 'combined', 'minnesota'}


In [37]:
text_clf.predict(X_train)

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,

In [38]:
y_test

[0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1]

In [39]:
import joblib
joblib.dump(text_clf, 'text_binary_clf.joblib')

['text_binary_clf.joblib']