## This script prepares positive and negative examples for training a classifier that distinguishes sentences that mention a dataset from sentences that do not.

### Positive Examples: All raw sentences that mention datasets. Mentions are extracted from the citations file, and the complete sentences containing them are extracted from the raw text corpus. Complete sentences are used to give the model context and help it generalize over the linguistic qualities of a dataset mention.

### Negative Examples: All raw sentences that contain a named entity (spaCy label ORG) but do not mention a dataset.

#### spaCy was used for sentence segmentation and named entity recognition.


In [1]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import string
import json
import numpy as np
import pandas as pd

In [None]:
## Directories: raw publication text files, and the folder holding the citations JSON
txtdirectory = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/files/text/'
citations_directory = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/'

## File names: list of text files labelled as containing datasets, and the citations JSON
LABELLED_TEXT_FILES = 'labelledTextFiles.txt'
CITATTIONS_FILE = 'data_set_citations.json'

In [2]:
import spacy
# Load the large English NLP model ('en_core_web_lg') once; the cells below
# call it on each document and read doc.sents (sentences) and doc.ents
# (named entities) from the result.
nlp = spacy.load('en_core_web_lg')

In [3]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read (opened in text mode, read-only).

    Returns:
        The complete text of the file.
    """
    # The context manager closes the handle even if read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r') as handle:
        return handle.read()

In [1]:
# save list to file
def save_list(lines, filename):
    """Write a sequence of strings to a file, one entry per line.

    Entries are joined with a newline; no trailing newline is written.
    Any existing file at `filename` is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the output file.
    """
    # The context manager closes (and flushes) the handle even if write() raises.
    with open(filename, 'w') as handle:
        handle.write('\n'.join(lines))

In [5]:
#txtFiles = load_doc('sampleTextFiles.txt').split('\n')
# One text file name per line. Blank lines (e.g. produced by a trailing newline)
# are dropped so they neither break the int() conversion below nor end up being
# passed to load_doc as an empty file name later on.
txtFiles = [name for name in load_doc(LABELLED_TEXT_FILES).split('\n') if name.strip()]
# extract publication IDs from text file name (the part before the first '.')
pub_ids = [int(t.split('.')[0]) for t in txtFiles]
pub_ids[:5]

[2108, 2272, 1609, 589, 206]

In [6]:
## Read the citations JSON file as text, then parse it into Python objects
citations_path = citations_directory + CITATTIONS_FILE
text = load_doc(citations_path)
cit = json.loads(text)

### The cell below divides the citation data into train and test sets by dataset ID, in order to test whether the model generalizes to dataset mentions and classifies them correctly when they refer to unseen datasets.

In [24]:
# Number of dataset ids held out for the test split
N_TEST_DATASETS = 150

# extracting unique dataset Ids; sorted so the split does not depend on
# set iteration order and is the same on every run
datasetIds = sorted({d['data_set_id'] for d in cit})

# splitting into test train: the first N_TEST_DATASETS ids are held out for
# testing, all remaining ids are used for training
dataIdTest = datasetIds[:N_TEST_DATASETS]
dataIdTrain = datasetIds[N_TEST_DATASETS:]

# Split the citation information into test and train.
# Membership is checked against sets (O(1)) instead of scanning the id lists
# once per citation.
_trainIdSet = set(dataIdTrain)
_testIdSet = set(dataIdTest)
citTrain = [c for c in cit if c['data_set_id'] in _trainIdSet]
citTest = [c for c in cit if c['data_set_id'] in _testIdSet]

In [None]:
# extract dataset mention from json
def getMentionsAll(file):
    """Return the unique dataset mentions recorded for one publication.

    Looks through every entry of the module-level `cit` list.

    Args:
        file: Text file name whose part before the first '.' is the integer
            publication id (e.g. '2108.txt').

    Returns:
        List of distinct mention strings taken from the 'mention_list' of
        every citation whose 'publication_id' equals that id, in first-seen
        order. Empty list when the publication has no citations.

    Raises:
        ValueError: If the file name does not start with an integer id.
    """
    pid = int(file.split('.')[0])
    mentions = [
        mention
        for c in cit
        if c['publication_id'] == pid
        for mention in c['mention_list']
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every run (the order of list(set(...)) over strings is not).
    return list(dict.fromkeys(mentions))

In [26]:
# extract dataset mention from json (test split only)
def getMentionsTest(file):
    """Return the unique test-split dataset mentions for one publication.

    Looks only at the module-level `citTest` list.

    Args:
        file: Text file name whose part before the first '.' is the integer
            publication id (e.g. '2108.txt').

    Returns:
        List of distinct mention strings taken from the 'mention_list' of
        every `citTest` entry whose 'publication_id' equals that id, in
        first-seen order. Empty list when there are none.

    Raises:
        ValueError: If the file name does not start with an integer id.
    """
    pid = int(file.split('.')[0])
    mentions = [
        mention
        for c in citTest
        if c['publication_id'] == pid
        for mention in c['mention_list']
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every run (the order of list(set(...)) over strings is not).
    return list(dict.fromkeys(mentions))

In [27]:
# extract dataset mention from json (train split only)
def getMentionsTrain(file):
    """Return the unique train-split dataset mentions for one publication.

    Looks only at the module-level `citTrain` list.

    Args:
        file: Text file name whose part before the first '.' is the integer
            publication id (e.g. '2108.txt').

    Returns:
        List of distinct mention strings taken from the 'mention_list' of
        every `citTrain` entry whose 'publication_id' equals that id, in
        first-seen order. Empty list when there are none.

    Raises:
        ValueError: If the file name does not start with an integer id.
    """
    pid = int(file.split('.')[0])
    mentions = [
        mention
        for c in citTrain
        if c['publication_id'] == pid
        for mention in c['mention_list']
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is the same on every run (the order of list(set(...)) over strings is not).
    return list(dict.fromkeys(mentions))

In [28]:
def getMentionSentences(mentions,sentences):
    """Return the sentences whose text contains at least one of the mentions.

    Newlines inside a sentence are treated as spaces before matching, and
    matching is a plain case-sensitive substring test.

    Args:
        mentions: Iterable of mention strings to look for.
        sentences: Iterable of hashable sentence objects exposing a `.text`
            string attribute.

    Returns:
        List of the matching sentence objects without duplicates, in the
        order they appear in `sentences`.
    """
    mentions = list(mentions)
    matched = []
    for s in sentences:
        # Flatten once per sentence rather than once per (mention, sentence) pair.
        flat_text = s.text.replace('\n',' ')
        if any(m in flat_text for m in mentions):
            matched.append(s)
    # De-duplicate like the former set() did, but keep document order so the
    # result is reproducible.
    return list(dict.fromkeys(matched))

In [29]:
def containsEntity(entities, sentence):
    """Tell whether any entity lies completely inside the sentence.

    An entity counts as inside when its `start` is not before the sentence's
    `start` and its `end` is not past the sentence's `end`.

    Args:
        entities: Iterable of objects with `start` and `end` attributes.
        sentence: Object with `start` and `end` attributes.

    Returns:
        True if at least one entity is inside the sentence, otherwise False.
    """
    return any(
        ent.start >= sentence.start and ent.end <= sentence.end
        for ent in entities
    )

### This cell will prepare positive and negative examples and save them in files.

In [None]:
# Collect positive examples (sentences containing a dataset mention) and
# negative examples (remaining sentences containing an ORG entity) over all files.
all_mention_sentences = []
all_entity_sentences = []
for i, file in enumerate(txtFiles, start=1):
    # progress indicator: print the running count every 100 files
    if i % 100 == 0:
        print(i)
    txt = load_doc(txtdirectory+file)
    doc = nlp(txt)
    sentences = list(doc.sents)
    # only organisation entities are considered for negative examples
    entities = [e for e in doc.ents if e.label_ == 'ORG']
    mentions = getMentionsAll(file)

    mentionSentences = getMentionSentences(mentions,sentences)
    # set lookup instead of scanning the list once per sentence
    mentionSentenceSet = set(mentionSentences)
    otherSentences = [s for s in sentences if s not in mentionSentenceSet]

    entitySentences = [s for s in otherSentences if containsEntity(entities, s)]

    all_mention_sentences += [s.text for s in mentionSentences]
    all_entity_sentences += [s.text for s in entitySentences]


# save_list writes one entry per line, so newlines embedded in a sentence must
# be flattened for BOTH classes. Previously only the positives were flattened,
# which split negative sentences across several lines of the output file and
# made the two classes differ in formatting.
all_mention_sentences = [s.replace('\n',' ') for s in all_mention_sentences]
all_entity_sentences = [s.replace('\n',' ') for s in all_entity_sentences]

save_list(all_mention_sentences, 'all_positive_sentences.txt')
save_list(all_entity_sentences, 'all_negative_sentences.txt')

### This cell will prepare positive test and train examples and save them in files.

In [None]:
# Collect positive examples separately for the train and test dataset splits.
# Only positives are produced here. The former otherSentences/entitySentences
# computation was removed: it read `mentionSentences` and `entities` left over
# from another cell (NameError on a fresh kernel) and its result was never used.
all_mention_sentences_train = []
all_mention_sentences_test = []
for i, file in enumerate(txtFiles, start=1):
    # progress indicator: print the running count every 100 files
    if i % 100 == 0:
        print(i)
    txt = load_doc(txtdirectory+file)
    doc = nlp(txt)
    sentences = list(doc.sents)
    mentions_train = getMentionsTrain(file)
    mentions_test = getMentionsTest(file)

    # NOTE(review): the two splits are matched independently, so a sentence
    # containing both a train mention and a test mention is written to both
    # files — confirm whether that overlap is acceptable.
    mentionSentences_train = getMentionSentences(mentions_train,sentences)
    mentionSentences_test = getMentionSentences(mentions_test,sentences)

    all_mention_sentences_train += [s.text for s in mentionSentences_train]
    all_mention_sentences_test += [s.text for s in mentionSentences_test]


# save_list writes one entry per line, so flatten newlines inside sentences
all_mention_sentences_train = [s.replace('\n',' ') for s in all_mention_sentences_train]
all_mention_sentences_test = [s.replace('\n',' ') for s in all_mention_sentences_test]

save_list(all_mention_sentences_train, 'all_train_positive_sentences2.txt')
save_list(all_mention_sentences_test, 'all_test_positive_sentences2.txt')