In [1]:
import os
import pickle
import pandas as pd
import spacy
import spotlight
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

input_dir = 'textbooks/'
data_dir = 'data/'
metadata_file = 'data/metadata.csv'
toc_file = 'toc.pkl'
text_file = 'text.pkl'
spotlight_server = 'http://192.168.99.101:2222/rest/annotate'

## Load data and spacy model

In [2]:
isbns = os.listdir(input_dir)

with open(os.path.join(data_dir, toc_file), 'rb') as fp:
    all_toc = pickle.load(fp) 

with open(os.path.join(data_dir, text_file), 'rb') as fp:
    all_text = pickle.load(fp) 
    
nlp = spacy.load('en')

## Load metadata and calculate number of pages

In [3]:
metadata = pd.read_csv(metadata_file, dtype = {'ISBN': 'str'})
metadata['num_pages'] = [len(all_text[isbn]) for isbn in metadata['ISBN']]
metadata

Unnamed: 0,ISBN,title,author,imprint,sold_by,start_page,end_page,num_pages
0,9781429219617,BIOLOGY OF PLANTS,PETER H RAVEN,FREEMAN/WORTH,Macmillan Higher Education,21,747,863
1,9781429242301,INTRODUCING PSYCHOLOGY,DANIEL L SCHACTER,FREEMAN/WORTH,Macmillan Higher Education,38,526,616
2,9781429298643,LIFE: THE SCIENCE OF BIOLOGY,DAVID E SADAVA,FREEMAN/WORTH,Macmillan Higher Education,51,1297,1447
3,9781429298902,PSYCHOLOGY: A CONCISE INTRODUCTION,RICHARD A GRIGGS,WORTH PUBLISHERS,Macmillan Higher Education,22,464,545
4,9781464126147,MOLECULAR BIOLOGY: PRINCIPLES AND PRACTICE,MICHAEL M COX,W. H. FREEMAN,Macmillan Higher Education,30,828,934
5,9781464135958,WHAT IS LIFE? A GUIDE TO BIOLOGY,JAY PHELAN,FREEMAN/WORTH,Macmillan Higher Education,34,718,773
6,9781464140815,PSYCHOLOGY,DAVID G MYERS,FREEMAN/WORTH,Macmillan Higher Education,59,751,985
7,9781464154072,EXPLORING PSYCHOLOGY,DAVID G MYERS,WORTH PUBLISHERS,Macmillan Higher Education,59,662,892
8,9781464171703,ABNORMAL PSYCHOLOGY,RONALD J COMER,WORTH PUBLISHERS,Macmillan Higher Education,33,699,852


## Example: *Introducing Psychology*

In [4]:
isbn = '9781429242301'

start_page = metadata.loc[metadata['ISBN'] == isbn, 'start_page'].values[0]
end_page = metadata.loc[metadata['ISBN'] == isbn, 'end_page'].values[0]

# Extract the content portion of the textbook, and combine the pages
text = all_text[isbn][(start_page-1):(end_page)]
text = ' '.join(text)

# Parse the textbook with spacy
doc = nlp(text)

# Some spacy examples
token = doc[2]
print(token)
sentence = next(doc.sents)
print(sentence)
print([word.lemma_ for word in sentence])

Psychology
1 䉱 Psychology’s Roots:
['1', '䉱', 'psychology', '’s', 'root', ':']


### Use the DBpedia Spotlight server to annotate the text

In [5]:
# Confidence = confidence score for disambiguation / linking
# Support = number of inlinks to the wikipedia entry

# Low support with high confidence

annotations = spotlight.annotate(spotlight_server,
                                 doc.string,
                                 confidence=0.9, support=2)


In [6]:
annotations[1]

{'URI': 'http://dbpedia.org/resource/Brain',
 'offset': 793,
 'percentageOfSecondRank': 4.2737093380401676e-18,
 'similarityScore': 1.0,
 'support': 10026,
 'surfaceForm': 'The Brain',
 'types': ''}

In [7]:
annotation_names = [ann['surfaceForm'] for ann in annotations
                   if ann['surfaceForm'] != '/12']
# TODO:
# Exclude '/12' or certain 'types': 'DBpedia:TimePeriod,DBpedia:Year'

annotation_names

['The French Connection',
 'The Brain',
 'ABRAHAM LINCOLN',
 'Pony Express',
 'Missouri',
 'California',
 'Harvard Medical School',
 'sci',
 'Harvard University',
 'Abraham Lincoln',
 'Pony Express',
 'Kellogg Company',
 'rela',
 'science fiction',
 'functional magnetic resonance imaging',
 'risky business',
 'emo',
 'empiricism',
 'phrenology',
 'tics',
 'happiness',
 'Greek',
 'Plato',
 'Aristotle',
 'Greek',
 'Plato',
 'Aristotle',
 'Aristotle',
 'tabula rasa',
 'tabula rasa',
 'empiricism',
 'empiricism',
 'Plato',
 'Aristotle',
 'Aristotle',
 'The French Connection',
 'René Descartes',
 'pineal gland',
 'Thomas Hobbes',
 'Joseph Gall',
 'phrenology',
 'happiness',
 'hippocampus',
 'amygdala',
 'phrenology',
 'Paul Broca',
 'Joseph Gall',
 'phrenology',
 'Hermann von Helmholtz',
 'Wilhelm Wundt',
 'Berlin',
 'physics',
 'Wilhelm Wundt',
 'structuralism',
 'Paul Broca',
 'structuralism',
 'Wilhelm Wundt',
 'Leipzig',
 'brightness',
 'DNA',
 'structuralism',
 'Charles Darwin',
 'natu

### Basic NLP

In [8]:
words = [token.text for token in doc if 
         token.is_stop != True and 
         token.is_punct != True and 
         token.is_digit != True and 
         token.pos_ == "NOUN" and
         len(token) > 2]

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(30)

for word, count in common_words:
    print('{} - {}'.format(word, count))

people - 1179
brain - 725
person - 497
behavior - 453
memory - 440
example - 429
time - 393
information - 383
children - 373
disorder - 275
way - 267
study - 263
mind - 255
personality - 253
self - 250
system - 234
disorders - 228
body - 225
life - 224
psychology - 223
experience - 217
group - 209
theory - 208
cortex - 203
intelligence - 200
response - 200
activity - 200
world - 199
fact - 190
participants - 188


## LDA for Topic Modelling (all textbooks)

In [9]:
textbook_text = []

for isbn in isbns:
    start_page = metadata.loc[metadata['ISBN'] == isbn, 'start_page'].values[0]
    end_page = metadata.loc[metadata['ISBN'] == isbn, 'end_page'].values[0]

    # Extract the content portion of the textbook, and combine the pages
    text = all_text[isbn][(start_page-1):(end_page)]
    text = ' '.join(text)

    # Parse the textbook with spacy
    doc = nlp(text)

    text_clean = [token.text for token in doc if 
                  token.is_stop != True and 
                  token.is_punct != True and 
                  token.is_digit != True and 
                  token.pos_ == "NOUN" and
                  len(token) > 2]
    
    textbook_text.append(' '.join(text_clean))

In [10]:
n_features = 1000
n_topics = 10

In [11]:
# Use tf (raw term count) features for LDA
print("Extracting tf features for LDA...")
tf_vectorizer = TfidfVectorizer(min_df=1,
                                max_features=n_features,
                                stop_words='english')


tf = tf_vectorizer.fit_transform(textbook_text)

Extracting tf features for LDA...


In [12]:
print("Fitting LDA models with tf features")
lda = LatentDirichletAllocation(n_topics=n_topics, 
                                max_iter=10,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

Fitting LDA models with tf features


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [13]:
print("\nTopics in LDA model:")
print()

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
    
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)


Topics in LDA model:

Topic #0:
water cells cell plants cid species 132 time plant blood sporophyte environments changes heart rate ions tasks genomes segment energy

Topic #1:
rem reaction communication separation rule repair time definition proteins sample potentials bird issues ratio carbohydrates thing job populations failure individuals

Topic #2:
people brain disorder memory person life behavior personality children study disorders arousal age scores family time environment performance activity information

Topic #3:
polypeptide shape complex glands dna male concentrations evidence humans question reactions answers mating death hemisphere chemicals height lack pro core

Topic #4:
atoms branch fear mechanisms resources layers domain skin trees orientation algae schizophrenia past face steps salt methods transport survival pre

Topic #5:
zygote words walls cues environment disorders females ments view matrix source theory mass polypeptide flowering turn energy therapists humans va