In [1]:
import glob
import pandas as pd
import json
import sys  
sys.path.insert(0, '../')

files = glob.glob('../data/json/*.json')
json_files = []

for file in files:
    with open(file) as json_file:
        json_files.append(json.load(json_file))


In [2]:
import re 

def extract_entity(text, entities):
    spans = []
    for label, matches in entities:
        for match in matches:
            for m in re.finditer(match, text, flags=re.IGNORECASE):
                spans += [{ 
                    'start': m.start(), 
                    'end': m.end(), 
                    'label': label, 
                    'text': text[m.start():m.end()] 
                }]
    return { 'text': text, 'spans': spans }


text = str(json_files[1]['abstract'])   
sample_size = ('SAMPLE_SIZE', ['929'])
print(json.dumps(extract_entity(text, [sample_size]), indent=2)+',')

{
  "text": "The objectives of this study were to determine the influence of various motivation variables on task-specific mathematics performance and to explore whether these variables change during the first year of middle school (N \u03ed 273). Students' taskspecific self-efficacy was the only motivation variable to predict performance and did so both at start and end of year. There were no differences in anxiety, selfconcept, or self-efficacy for self-regulation between start and end of year, but, by end of year, students described mathematics as less valuable and reported lower effort and persistence. Gifted students had stronger mathematics self-concept beliefs, and they had more accurate and less overconfident self-efficacy beliefs than did regular education students. There were no gender differences in any of the motivation constructs.",
  "spans": []
},


In [6]:
import spacy
from spacy import displacy

text = str(json_files[40]['abstract']) 

nlp=spacy.load("en_core_web_sm") 
#nlp.from_disk("../data/ner_sample_size")
doc = nlp(text)
ner=nlp.get_pipe('ner')
print(doc.ents, ner.labels)
print(doc.ents.push('test'))
options = {"ents": [], "colors": { 'CARDINAL': '#fab1a0' }}
displacy.render(doc, style="ent")

(two, 1702, 53 %, age 12-14, 1636, 64 %, age 16-18, Finnish) ('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


AttributeError: 'tuple' object has no attribute 'push'

In [16]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"
pipe = pipeline('question-answering', model=model_name, tokenizer=model_name)

def get_entity(text, question):
    return pipe({ 'context': text, 'question': question })
    

In [45]:
from spacy.tokens import Span
nlp=spacy.load("en_core_web_sm") 

options = {"ents": ['SAMPLE_SIZE'], "colors": { 'SAMPLE_SIZE': '#fab1a0' }}

for i in range(len(json_files)):
    text = str(json_files[i]['abstract']) 
    if (len(text) > 0):
        res = get_entity(text, 'What is the sample size?')
        doc = nlp(text)
        span = doc.char_span(res['start'], res['end'], "SAMPLE_SIZE")
        doc.set_ents([Span(doc, span.start, span.end, "SAMPLE_SIZE")])
        displacy.render(doc, style="ent", options=options)

KeyboardInterrupt: 

In [44]:
questions = [
    'What is the number N?', 
    'What is the sample size?', 
    'What is the number of students?', 
    'What is the number of teachers?',
    'How many people?'
]

for i in range(len(json_files)):
    context = str(json_files[i]['abstract']) 
    if context is None or context == '': continue 
    results = []
    result_max = 0
    result_score = 0
    for j, question in enumerate(questions):
        res = get_entity(context, question)
        results.append(res)
        if res['score'] > result_score:
            result_max = j 
    out = {
        'id': i,
        'file': files[i],
        'title': json_files[i]['title'],
        'sample_size': results[result_max]['answer']
    }
    print(json.dumps(out, indent=2)+',')
    

{
  "id": 0,
  "file": "../data/json/lazarides 2017.json",
  "title": "Longitudinal Effects of Student-Perceived Classroom Support on Motivation -A Latent Change Model",
  "sample_size": "1088"
},
{
  "id": 1,
  "file": "../data/json/Pajares _ Graham, 1999.json",
  "title": "Self-Efficacy, Motivation Constructs, and Mathematics Performance of Entering Middle School Students",
  "sample_size": "273"
},
{
  "id": 2,
  "file": "../data/json/Heo 2015.json",
  "title": "Autoregressive Cross-Lagged Modeling of the Reciprocal Longitudinal Relationship Between Self-Esteem and Career Maturity",
  "sample_size": "4"
},
{
  "id": 3,
  "file": "../data/json/mailtais_2017.json",
  "title": "Learning climate, academic competence, and anxiety during the transition to middle school: Parental attachment as a protective factor",
  "sample_size": "627"
},
{
  "id": 4,
  "file": "../data/json/Lazarides Rubach Ittel2017-1.json",
  "title": "PARENTS AND MATHEMATICS CAREER PLANS 1 NOTE: This is the author's 

{
  "id": 37,
  "file": "../data/json/Wang_et_al-2009-Child_Development.json",
  "title": "The Motivational Landscape of Early Adolescence in the United States and China: A Longitudinal Investigation",
  "sample_size": "825"
},
{
  "id": 38,
  "file": "../data/json/Simpkins06.json",
  "title": "Math and Science Motivation: A Longitudinal Examination of the Links Between Choices and Beliefs",
  "sample_size": "227"
},
{
  "id": 40,
  "file": "../data/json/Salmela-Arp, 2016.json",
  "title": "The Dark Side of Internet Use: Two Longitudinal Studies of Excessive Internet Use, Depressive Symptoms, School Burnout and Engagement Among Finnish Early and Late Adolescents",
  "sample_size": "1702"
},
{
  "id": 41,
  "file": "../data/json/Birkeland 2012 Trajectories of global self-esteem development during adolescence.json",
  "title": "Trajectories of global self-esteem development during adolescence",
  "sample_size": "1083"
},
{
  "id": 42,
  "file": "../data/json/song 2015.json",
  "title": "

KeyboardInterrupt: 