# Recognize KCs of interest in book text

In [7]:
import pandas as pd
import numpy as np
import pickle
# Load the parsed book text. The context manager closes the file handle as
# soon as the text has been read. The explicit encoding keeps the decoding
# (and therefore the character offsets used later) independent of the
# platform default. NOTE(review): assumes the parsed file is UTF-8 - confirm.
with open("../dat/parsed_books/mml-book.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

We are going to use the Wikifier (https://wikifier.org) to recognize and link entities to their corresponding Wikipedia page. Then we might need to filter for KCs that are of relevance.

In [2]:
import requests
from dotenv import dotenv_values
# Read the Wikifier user key from the project's .env file.
config = dotenv_values("../.env")
userKey = config['WIKIFIER_USER_KEY']

# First experiment: annotate only the first 1000 characters of the book.
# NOTE(review): the user key is sent over plain HTTP here; switch the URL to
# https if the service supports it - confirm.
http_response = requests.post("http://www.wikifier.org/annotate-article",
    data={"userKey": userKey,
          "lang": "en",
          "text": raw_text[:1000],
          "support": "true",
          "ranges": "true"},
    timeout=120,  # do not hang the kernel forever if the service stalls
)
# Fail loudly on an HTTP error instead of trying to parse an error page.
http_response.raise_for_status()
# `response` is the decoded JSON reply; later cells read response['annotations'].
response = http_response.json()

In [4]:
# Show the exact excerpt that was sent to the Wikifier above.
print(raw_text[:1000])

MATHEMATICS  FOR 
MACHINE LEARNING
Marc Peter DeisenrothA. Aldo FaisalCheng Soon Ong
MATHEMATICS FOR MACHINE LEARNING DEISENROTH ET AL.
The fundamental mathematical tools needed to understand machine learning include linear algebra, analytic geometry, matrix decompositions, vector calculus, optimization, probability and statistics. These topics are traditionally taught in disparate courses, making it hard for data science or computer science students, or professionals, to efﬁ  ciently learn the mathematics. This self-contained textbook bridges the gap between mathematical and machine learning texts, introducing the mathematical concepts with a minimum of prerequisites. It uses these concepts to derive four central machine learning methods: linear regression, principal component analysis, Gaussian mixture models and support vector machines. For students and others with a mathematical background, these derivations provide a starting point to machine learning texts. For those learning the

In [18]:
import pandas as pd
def _strong_occurrences(annotation):
    """Return the support entries of one annotation whose pageRank exceeds 0.001.

    Each kept entry becomes a dict with the character interval ("intvl") and
    the pageRank ("pr") of that occurrence.
    """
    kept = []
    for hit in annotation['support']:
        if hit['pageRank'] > 0.001:
            kept.append({"intvl": pd.Interval(hit['chFrom'], hit['chTo']), "pr": hit['pageRank']})
    return kept


# Map every annotated title to its URL and its sufficiently ranked occurrences.
supports = {}
for annotation in response['annotations']:
    supports[annotation['title']] = {
        "url": annotation['url'],
        "occurences": _strong_occurrences(annotation),
    }

# Drop titles that have no occurrence left after the pageRank filter.
supports = {title: entry for title, entry in supports.items() if entry['occurences']}

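Now we have to clean up overlapping occurrences, such as those found for "Mixture model": when two occurrences of the same KC overlap, we keep only the one with the highest PageRank.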

In [20]:
import numpy as np
import pickle
# For every KC, drop occurrences that overlap a better-ranked occurrence of
# the same KC. An occurrence is kept only if its pageRank is strictly higher
# than that of every occurrence it overlaps (so exact ties drop both), or if
# it overlaps nothing. Occurrences with an identical interval are not treated
# as overlapping each other.
KCs_cleaned = []
for title, value in supports.items():
    occurrences = value['occurences']
    occs_cleaned = []
    for occ in occurrences:
        overlaps = [
            o for o in occurrences
            if o['intvl'].overlaps(occ['intvl']) and occ['intvl'] != o['intvl']
        ]
        # all() over an empty list is True, which covers the "no overlap" case.
        if all(occ['pr'] > o['pr'] for o in overlaps):
            occs_cleaned.append(occ)

    KCs_cleaned.append({"title": title, "url": value['url'], "occurences": occs_cleaned})

# Use a context manager so the pickle file is flushed and closed deterministically.
with open("../dat/KCs.pkl", 'wb') as kc_file:
    pickle.dump(KCs_cleaned, kc_file, pickle.HIGHEST_PROTOCOL)

Now let's scale up beyond the first 1,000 characters. First, some performance tests:

In [21]:
# Total number of characters in the parsed book text.
print(len(raw_text))

859742


In [16]:
def get_annotations(text, timeout=None):
    """Annotate ``text`` with the Wikifier and return its list of annotations.

    Parameters
    ----------
    text : str
        Text sent to the Wikifier ``annotate-article`` endpoint.
    timeout : float or None, optional
        Seconds to wait for the service, passed on to ``requests.post``.
        ``None`` (the default) waits indefinitely, as before.

    Returns
    -------
    list
        The ``annotations`` field of the JSON reply. If anything goes wrong
        (network error, HTTP error status, malformed reply), the error is
        printed and an empty list is returned, so callers never see an
        exception from this function.
    """
    try:
        http_response = requests.post("http://www.wikifier.org/annotate-article",
            data={"userKey": userKey,
                "lang": "en",
                "text": text,
                "support": "true",
                "ranges": "true"},
            timeout=timeout,
        )
        # Surface HTTP errors explicitly instead of as a JSON decoding failure.
        http_response.raise_for_status()
        return http_response.json()['annotations']
    except Exception as exc:
        # Best effort by design: report the actual error (not the Exception
        # class) and let the caller continue with an empty result.
        print(f"Wikifier request failed: {exc!r}")
        return []

In [26]:
import random
import time

# Text lengths (in characters) to benchmark and the elapsed seconds measured
# for each of them, in the same order.
lengths = [1000, 2000, 4000, 8000]
execution_times = []

for length in lengths:
    # select random text of the given length
    start_pos = int(random.random() * (len(raw_text) - length))
    print(start_pos)
    # call Wikifier; perf_counter() is a monotonic, high-resolution clock, so
    # the measurement is not distorted by system clock adjustments.
    # Note: get_annotations returns [] on failure, so a failed request still
    # records a (meaningless) duration here.
    start_time = time.perf_counter()
    get_annotations(raw_text[start_pos:(start_pos + length)])
    execution_times.append(time.perf_counter() - start_time)

468075
670469
354203
165199


In [27]:
# Elapsed seconds per Wikifier request, in the same order as `lengths`.
execution_times

[4.279824256896973, 8.296481132507324, 10.221187829971313, 43.09164214134216]

It looks like a length of around 5000 characters gives the best time-to-length ratio. Let's try to get KCs for the first 100,000 characters of the book (about one eighth).

In [17]:
# Annotate the book in consecutive slices of `batch_size` characters and
# collect, per KC title, all occurrences that survive the overlap cleaning.
batch_size = 5000
batch_num = 20
start = 0

KCs = {}
for i in range(batch_num):
    print('Batch ' + str(i))
    offset = start + i * batch_size
    annotations = get_annotations(raw_text[offset:(offset+batch_size)])

    # Wikifier positions are relative to the slice that was sent, so shift
    # them by `offset` to obtain positions in the whole book text.
    supports = {}
    for annotation in annotations:
        ranked_hits = [
            {"intvl": pd.Interval(hit['chFrom'] + offset, hit['chTo'] + offset), "pr": hit['pageRank']}
            for hit in annotation['support']
            if hit['pageRank'] > 0.001
        ]
        supports[annotation['title']] = {"url": annotation['url'], "occurences": ranked_hits}

    # Merge the occurrences of this batch into the KCs collected so far.
    for title, value in supports.items():
        candidates = value['occurences']
        if not candidates:
            # nothing passed the pageRank filter for this title
            continue

        entry = KCs.setdefault(title, {'url': value['url'], 'occurences': []})

        kept = []
        for occ in candidates:
            rivals = [
                other for other in candidates
                if other['intvl'].overlaps(occ['intvl']) and occ['intvl'] != other['intvl']
            ]
            # Keep an occurrence only if it strictly outranks everything it
            # overlaps; with no overlaps this is trivially true.
            if all(occ['pr'] > rival['pr'] for rival in rivals):
                kept.append(occ)

        entry['occurences'] = entry['occurences'] + kept

Batch 0
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10
Batch 11
Batch 12
Batch 13
Batch 14
Batch 15
Batch 16
Batch 17
Batch 18
Batch 19


In [22]:
# Save KCs. The context manager makes sure the pickle file is flushed and
# closed instead of leaving the handle open until garbage collection.
with open("../dat/KCs_mml.pkl", 'wb') as kc_file:
    pickle.dump(KCs, kc_file, pickle.HIGHEST_PROTOCOL)

In [23]:
# Occurrence count per KC, restricted to KCs found at least three times.
{name: len(kc['occurences']) for name, kc in KCs.items() if len(kc['occurences']) >= 3}

{'Machine learning': 91,
 'Linear algebra': 14,
 'Linear regression': 7,
 'Analytic geometry': 3,
 'Matrix decomposition': 3,
 'Vector calculus': 5,
 'Computer science': 4,
 'Principal component analysis': 5,
 'Mixture model': 3,
 'Support-vector machine': 5,
 'Linear independence': 28,
 'Mathematics': 11,
 'Euclidean vector': 13,
 'Real number': 3,
 'Complex number': 4,
 'Vector space': 40,
 'Dot product': 4,
 'Identity matrix': 5,
 'Kernel (linear algebra)': 3,
 'Subscript and superscript': 13,
 'Random variable': 3,
 'Dimensionality reduction': 7,
 'Tor (rock formation)': 3,
 'System of linear equations': 44,
 'Free variables and bound variables': 4,
 'Matrix (mathematics)': 5,
 'Row and column vectors': 3,
 'Fraction': 48,
 'Matrix multiplication': 11,
 'Gaussian elimination': 11,
 'Augmented matrix': 7,
 'AJB': 3,
 'Row echelon form': 13,
 'Pivot element': 15,
 'Iterative method': 3,
 'Inverse element': 10,
 'Identity element': 10,
 'Abelian group': 4,
 'General linear group': 3,


## Try with TFIDF output

In [25]:
# Top terms per chapter, kept as one literal string: each chapter title is
# followed by an indented, comma-separated term list.
# NOTE(review): presumably pasted from a TF-IDF analysis done elsewhere - confirm.
output = """
Introduction and Motivation: 
  learning, data, machine, machine learning, model, book, pillar, vector, concept, chapter, part, mathematical concept, foundation, mathematical, unseen, predictor, label, read, training, two, regression, way, learning system, machine learning system, read book, part ii, way read book, way read, pillar machine, two way read, four pillar machine, four pillar, pillar machine learning, data vector, unseen data, parameter, nd, well, input, density, system, algorithm, mean, estimation, two way, motivation, ing, classi cation, machine learning algorithm, goal
Linear Algebra: 
  vector, linear, matrix, column, basis, space, equation, mapping, subspace, system linear, pivot, system linear equation, linear equation, system, vector space, de, row, linearly, set, coordinate, solution, pivot column, linear mapping, ne, element, rn, linear algebra, transformation, respect, echelon form, echelon, algebra, form, row echelon form, row echelon, af, af ne, transformation matrix, group, inverse, following, example, two, consider, rm, linearly independent, base, equation system, multiplication, combination
Analytic Geometry: 
  projection, vector, product, inner, inner product, orthogonal, basis, subspace, rotation, de, angle, space, hx, norm, matrix, dimensional, onto, orthogonal projection, distance, figure, co, nite, basis vector, projection onto, positive, length, projection matrix, hx yi, yi, de nite, vector space, dot, positive de, sin, two, dot product, product inner product, positive de nite, dimensional subspace, linear, span, orthonormal, de nition, nition, rotate, symmetric, product inner, chapter, dot product inner, analytic geometry
Matrix Decompositions: 
  matrix, svd, eigenvalue, eigenvectors, singular, decomposition, vector, singular value, movie, determinant, det, basis, rank, value, rn, singular vector, eigendecomposition, diagonal, theorem, right singular, mapping, de, spectral, square, rank approximation, right, matrix decomposition, column, cholesky, right singular vector, approximation, linear, trace, det det, orthogonal, section, eigenvector, characteristic, square matrix, tr, left, linear mapping, figure, diagonal matrix, rating, eigenvalue eigenvectors, change, positive, matrix rn, ba
Vector Calculus: 
  derivative, partial, function, taylor, gradient, partial derivative, rule, chain rule, chain, taylor series, vector, series, fk, df, jacobian, differentiation, compute, matrix, de, fi, respect, tensor, xn, dx, fk fk, series expansion, order, polynomial, taylor series expansion, exp, rn, fk fk fk, expansion, yf, taylor polynomial, compute gradient, automatic differentiation, xf, sin, vector calculus, figure, automatic, obtain, variable, section, fm, compute derivative, linear, calculus, df dx
Probability and Distributions: 
  random, random variable, distribution, variable, probability, covariance, variance, gaussian, prior, statistic, mean, discrete, function, de, exponential, family, exponential family, section, two, cov, beta, example, event, rule, space, density, probability distribution, marginal, sample, value, conditional, bernoulli, continuous random, consider, gaussian distribution, posterior, continuous, theorem, continuous random variable, univariate, cdf, xjy, outcome, multivariate, de nition, nition, state, two random, sum, covariance matrix
Continuous Optimization: 
  convex, function, gradient, optimization, descent, gradient descent, convex function, min, optimization problem, dual, problem, minimum, convex set, convex optimization, set, constraint, function convex, legendre, convex conjugate, continuous optimization, multiplier, lagrange, lagrangian, duality, subject, de, differentiable, max, step size, point, value, two, conjugate, consider, objective function, lagrange multiplier, step, constrained, primal, negative, inequality, size, objective, linear program, example, program, line, figure, legendre fenchel, momentum
"""

# Send the whole term list to the Wikifier as a single text.
annos = get_annotations(output)

In [31]:
# Build the supports for the TF-IDF term list. The Wikifier positions are
# relative to `output`, which was sent as one text, so no batch offset is
# added here. Adding the `offset` variable left over from the batching cell
# would silently shift every interval by a stale value.
# A lower pageRank threshold (0.0005) is used for this short term list.
supports = {
    a['title']:
        {"url": a['url'],
        "occurences": [
            {"intvl": pd.Interval(s['chFrom'], s['chTo']), "pr": s['pageRank']}
        for s in a['support'] if s['pageRank'] > 0.0005]}
    for a in annos
}

KCs_from_tfidf = {}
for title, value in supports.items():
    occurrences = value['occurences']
    if not occurrences:
        # nothing passed the pageRank filter for this title
        continue

    occs = []
    for occ in occurrences:
        # occurrences of the same KC that overlap this one (identical
        # intervals are not counted as overlapping)
        overlaps = [
            o for o in occurrences
            if o['intvl'].overlaps(occ['intvl']) and occ['intvl'] != o['intvl']
        ]
        # keep an occurrence only if it strictly outranks everything it
        # overlaps; all() over an empty list is True (no overlap -> keep)
        if all(occ['pr'] > o['pr'] for o in overlaps):
            occs.append(occ)

    # Titles are unique keys of `supports`, so each KC is created exactly once.
    KCs_from_tfidf[title] = {'url': value['url'], 'occurences': occs}

In [32]:
# Occurrence count for every KC recognized in the TF-IDF term list.
{name: len(kc['occurences']) for name, kc in KCs_from_tfidf.items()}

{'Machine learning': 4,
 'Vector space': 2,
 'Vector calculus': 1,
 'Ion': 1,
 'Linear algebra': 1,
 'Linear equation': 2,
 'Linear map': 2,
 'Linear programming': 1,
 'Matrix (mathematics)': 1,
 'Eigenvalues and eigenvectors': 5,
 'Analytic geometry': 1,
 'Eigendecomposition of a matrix': 1,
 'Row echelon form': 1,
 'Transformation matrix': 1,
 'Linear independence': 1,
 'Projection (linear algebra)': 2,
 'Dot product': 4,
 'Orthonormality': 1,
 'Matrix decomposition': 1,
 'Singular value decomposition': 1,
 'Diagonal matrix': 1,
 'Partial derivative': 1,
 'Gradient descent': 1,
 'Chain rule': 1,
 'Automatic differentiation': 1,
 'Random variable': 2,
 'Probability distribution': 2,
 'Normal distribution': 1,
 'Covariance matrix': 1,
 'Exponential family': 1,
 'Continuous optimization': 1,
 'Mathematical optimization': 2,
 'Convex set': 1,
 'Convex function': 1,
 'Convex optimization': 1,
 'Convex conjugate': 1}