In [1]:
import os
import pickle
import pandas as pd
import spacy
import spotlight
from collections import Counter
import time

# Directory layout, relative to the notebook's working directory
input_dir = 'textbooks/'   # its entries are listed to obtain the ISBNs
output_dir = 'output/'
data_dir = 'data/'

# Input files; both live under data_dir, so derive the metadata path from it
# instead of repeating the directory name (the resulting value is unchanged).
metadata_file = os.path.join(data_dir, 'metadata.csv')
text_file = 'text.pkl'

# DBpedia Spotlight annotation endpoint
spotlight_server = 'http://192.168.99.101:2222/rest/annotate'

## Load data and spacy model

In [2]:
# The entries of input_dir are used as ISBNs throughout the notebook.
# os.listdir returns them in arbitrary order, so sort the listing to make
# every later loop (and the row order of the results) reproducible.
isbns = sorted(os.listdir(input_dir))

# all_text is indexed by ISBN; each value is later sliced by page number.
# NOTE: pickle.load can execute arbitrary code -- only load a trusted file.
with open(os.path.join(data_dir, text_file), 'rb') as fp:
    all_text = pickle.load(fp)

nlp = spacy.load('en')

## Load metadata and calculate number of pages

In [3]:
# Read ISBN as a string so it can be used directly as a key into all_text,
# then add the length of each book's entry in all_text as its page count.
metadata = (
    pd.read_csv(metadata_file, dtype={'ISBN': 'str'})
      .assign(num_pages=lambda frame: [len(all_text[code])
                                       for code in frame['ISBN']])
)
metadata

Unnamed: 0,ISBN,title,author,subject,imprint,sold_by,start_page,end_page,num_pages
0,9781429219617,BIOLOGY OF PLANTS,PETER H RAVEN,BIOLOGY,FREEMAN/WORTH,Macmillan Higher Education,21,747,863
1,9781429242301,INTRODUCING PSYCHOLOGY,DANIEL L SCHACTER,PSYCHOLOGY,FREEMAN/WORTH,Macmillan Higher Education,38,526,616
2,9781429298643,LIFE: THE SCIENCE OF BIOLOGY,DAVID E SADAVA,BIOLOGY,FREEMAN/WORTH,Macmillan Higher Education,51,1297,1447
3,9781429298902,PSYCHOLOGY: A CONCISE INTRODUCTION,RICHARD A GRIGGS,PSYCHOLOGY,WORTH PUBLISHERS,Macmillan Higher Education,22,464,545
4,9781464126147,MOLECULAR BIOLOGY: PRINCIPLES AND PRACTICE,MICHAEL M COX,BIOLOGY,W. H. FREEMAN,Macmillan Higher Education,30,828,934
5,9781464135958,WHAT IS LIFE? A GUIDE TO BIOLOGY,JAY PHELAN,BIOLOGY,FREEMAN/WORTH,Macmillan Higher Education,34,718,773
6,9781464140815,PSYCHOLOGY,DAVID G MYERS,PSYCHOLOGY,FREEMAN/WORTH,Macmillan Higher Education,59,751,985
7,9781464154072,EXPLORING PSYCHOLOGY,DAVID G MYERS,PSYCHOLOGY,WORTH PUBLISHERS,Macmillan Higher Education,59,662,892
8,9781464171703,ABNORMAL PSYCHOLOGY,RONALD J COMER,PSYCHOLOGY,WORTH PUBLISHERS,Macmillan Higher Education,33,699,852


## Parse the text of each textbook

In [4]:
# Parsed spaCy document for each textbook, keyed by ISBN
all_docs = {}

for isbn in isbns:
    print('Parsing text for ISBN: {}'.format(isbn))

    # Select this book's metadata row(s) once and read the content page range
    is_this_book = metadata['ISBN'] == isbn
    start_page = metadata.loc[is_this_book, 'start_page'].values[0]
    end_page = metadata.loc[is_this_book, 'end_page'].values[0]

    # The slice treats start_page as 1-based and keeps end_page itself
    content_pages = all_text[isbn][(start_page - 1):end_page]

    # Combine the content pages into one string and parse it with spacy
    text = ' '.join(content_pages)
    all_docs[isbn] = nlp(text)

Parsing text for ISBN: 9781429219617
Parsing text for ISBN: 9781429242301
Parsing text for ISBN: 9781429298643
Parsing text for ISBN: 9781429298902
Parsing text for ISBN: 9781464126147
Parsing text for ISBN: 9781464135958
Parsing text for ISBN: 9781464140815
Parsing text for ISBN: 9781464154072
Parsing text for ISBN: 9781464171703


## Use the DBpedia Spotlight server to annotate the text

In [5]:
def extract_spotlight_concept(annotation):
    """Return the part of the annotation's 'URI' after the last '/resource/'.

    If the URI contains no '/resource/' segment, the whole URI is returned.
    """
    segments = annotation['URI'].split('/resource/')
    # The final segment is the concept name
    return segments.pop()

def spotlight_annotate_text(doc, confidence=0.5, support=1):
    """Annotate a parsed document with DBpedia Spotlight and drop year mentions.

    Parameters
    ----------
    doc : spaCy Doc
        Parsed document; its full text is sent to the Spotlight server.
    confidence : float
        Confidence score for disambiguation / linking.
    support : int
        Number of inlinks to the Wikipedia entry.

    Returns
    -------
    list
        The annotations returned by the server, without those whose 'types'
        value is exactly 'DBpedia:TimePeriod,DBpedia:Year'.
    """
    # Doc.text is the documented accessor for the document text; Doc.string is
    # a deprecated alias that newer spaCy releases no longer provide.
    annotations = spotlight.annotate(spotlight_server,
                                     doc.text,
                                     confidence=confidence,
                                     support=support)

    # 'types' is compared as a whole string, so only annotations typed as
    # exactly TimePeriod + Year are filtered out.
    exclude_types = ['DBpedia:TimePeriod,DBpedia:Year']

    return [ann for ann in annotations if ann['types'] not in exclude_types]

#### Annotate the text at different confidence levels

In [6]:
confidence_levels = [0.4, 0.6, 0.8]
n_top = 100

# Collect one frame per (confidence, ISBN) pair and concatenate once at the
# end. DataFrame.append re-copied the accumulated frame on every call and was
# removed in pandas 2.0.
concept_frames = []

for confidence in confidence_levels:
    print('Annotating for confidence: {}'.format(confidence))
    for isbn in isbns:
        print('Annotating text for ISBN: {}'.format(isbn))
        annotations = spotlight_annotate_text(all_docs[isbn],
                                              confidence)

        concepts = [extract_spotlight_concept(ann) for ann in annotations]
        freq = Counter(concepts)

        # Build the frame straight from the (concept, count) pairs; unlike
        # zip(*pairs) this also works when no concept was found.
        top_concepts = pd.DataFrame(freq.most_common(n_top),
                                    columns=['concept', 'count'])
        top_concepts['confidence'] = confidence
        top_concepts['ISBN'] = isbn
        concept_frames.append(top_concepts)

        # Pause between requests (presumably to go easy on the Spotlight server)
        time.sleep(10)

# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when nothing was annotated. Rows get a fresh 0..n-1 index.
if concept_frames:
    annotations_df = pd.concat(concept_frames, ignore_index=True)
else:
    annotations_df = pd.DataFrame(
        columns=['concept', 'count', 'confidence', 'ISBN'])

Annotating for confidence: 0.4
Annotating text for ISBN: 9781429219617
Annotating text for ISBN: 9781429242301
Annotating text for ISBN: 9781429298643
Annotating text for ISBN: 9781429298902
Annotating text for ISBN: 9781464126147
Annotating text for ISBN: 9781464135958
Annotating text for ISBN: 9781464140815
Annotating text for ISBN: 9781464154072
Annotating text for ISBN: 9781464171703
Annotating for confidence: 0.6
Annotating text for ISBN: 9781429219617
Annotating text for ISBN: 9781429242301
Annotating text for ISBN: 9781429298643
Annotating text for ISBN: 9781429298902
Annotating text for ISBN: 9781464126147
Annotating text for ISBN: 9781464135958
Annotating text for ISBN: 9781464140815
Annotating text for ISBN: 9781464154072
Annotating text for ISBN: 9781464171703
Annotating for confidence: 0.8
Annotating text for ISBN: 9781429219617
Annotating text for ISBN: 9781429242301
Annotating text for ISBN: 9781429298643
Annotating text for ISBN: 9781429298902
Annotating text for ISBN: 9

#### Output common concepts to file

In [7]:
# Attach each book's metadata to its concept counts (inner join on ISBN)
concepts_df = pd.merge(annotations_df, metadata, on='ISBN')

# to_csv does not create missing directories, so make sure the target exists
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

concepts_df.to_csv(os.path.join(output_dir, 'common_concepts.csv'),
                   index=False)