In [151]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import string
import json
import numpy as np
import pandas as pd
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

In [152]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout
from keras import regularizers
from keras.models import model_from_json
import pickle

## Helper Functions

In [153]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [154]:
def save_list(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

### load spacy model

In [155]:
import spacy
# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

## NER based sentence Filter Methods

In [156]:
def containsEntity(entities, sentence):
    for e in entities:
        if e.start >= sentence.start and e.end <= sentence.end:
            return True
    return False

In [157]:
def excludeReference(text):
    tokens = text.split()
    l = len(tokens)
    if 'REFERENCES' in tokens:
        ind = l-1-tokens[::-1].index('REFERENCES')
    elif 'References' in tokens:
        ind = l-1-tokens[::-1].index('References')
    else:
        ind = l
    return ' '.join(tokens[:ind])

### Data Preprocessing Methods

In [158]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
ps = PorterStemmer()

In [159]:
## Abbreviation Disambiguation Module

## load abbreviations
file = 'abbreviations.json'
abbtext = load_doc(file)
abbreviations = json.loads(abbtext)

def findAbbreviation(sentence):
    regex = r"\b[A-Z][A-Z]+\b"
    abbreviations = re.findall(regex, sentence)
    return abbreviations

def expandAbbreviation(sentence, abbdict):
    abbs = findAbbreviation(sentence)
    for a in abbs:
        if a in abbdict:
            sentence = sentence.replace(a,abbdict[a][0])
    return sentence

In [160]:
def specialMapping(word):
    if word == 'studi':
        return 'survey'
    else:
        return word

In [161]:
# turn a doc into clean tokens
def clean_doc(doc):
    # abbreviation disambiguation
    doc = expandAbbreviation(doc, abbreviations)
    # split into tokens by white space
    tokens = doc.split()
    # Exclude text below references
    #tokens = excludeReference(tokens)
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # stemming
    tokens = [ps.stem(word) for word in tokens]
    #specialMapping
    tokens = [specialMapping(word) for word in tokens]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [162]:
# load doc, clean and return line of tokens
def doc_to_line(sentence, vocab):
	# clean doc
	tokens = clean_doc(sentence)
	# filter by vocab
	tokens = [w for w in tokens if w in vocab]
	return ' '.join(tokens)

In [163]:
# load all docs in a directory
def process_docs(sentences, vocab):
    lines = list()
    # walk through all files in the folder
    for sentence in sentences:
        # load and clean the doc
        line = doc_to_line(sentence, vocab)
        # add to list
        lines.append(line)
    return lines

### Load Keras Tokenizer

In [164]:
def load_Keras_Tokenizer_CNN(pickleFilePath):
    with open(pickleFilePath, 'rb') as handle:
       tokenizer = pickle.load(handle)
    return tokenizer

### Load CNN and weights

In [165]:
def load_CNN_Sentece_Classifier(modelFile, weightsFile):
    # load json and create model
    json_file = open(modelFile, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)   
    # load weights into new model
    model.load_weights("CNNmodel.h5")
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print("Loaded model from disk")
    return model

### Load Dataset Vocabulary for Ngram

In [166]:
def loadVocab(vocabFile):
    vocab = load_doc(vocabFile)
    vocab = vocab.split()
    vocab = set(vocab)
    return vocab

### Load Dataset Titles and IDs

In [167]:
def loadDataSetTitlesAndIds(datasetJsonFile):
    text = load_doc(datasetJsonFile)
    loaded_json_all = json.loads(text)
              
    dataSetIds = [ dataset['data_set_id'] for dataset in loaded_json_all ]
    dataSetTitles = [ dataset['title'] for dataset in loaded_json_all ]
    dataSetDate = [ dataset['date'] for dataset in loaded_json_all ]
    datasetyears = [int(d[:4]) if d != 'None' else 0 for d in dataSetDate]
    return datasetyears,dataSetIds,dataSetTitles

### Labelling helper function

In [168]:
def getLabelLength(labels):
    return [len(l) for l in labels]

### Group Search Helper Methods

In [169]:
def treatDates(tokens):
    rawDates = []
    for token in tokens:
        if len(token) == 6: #200108
            if re.match(r'([1-2][09][0-9]{4})', token):
                start = int(token[0:4])
                end = int(token[:2]+token[4:6])
                if (end>start):
                    years = list( range(start,end+1) )
                    years = [str(y) for y in years]
                    rawDates.append(token)
                    tokens += years
        if len(token) == 8: #20012008
            if re.match(r'([0-2][09][0-9]{2}[0-2][09][0-9]{2})', token):
                start = int(token[0:4])
                end = int(token[4:8])
                if (end>start):
                    years = list( range(start,end+1) )
                    years = [str(y) for y in years]
                    rawDates.append(token)
                    tokens += years
    tokens = [t for t in tokens if t not in rawDates]
    return tokens

In [219]:
#Fix2
def expandDatesWithTo(text):
    matches = re.findall(r'([1-2][09][0-9]{2} to [1-2][09][0-9]{2})', text)
    for m in matches:
        start = int(m[0:4])
        end = int(m[8:12])
        if (end>start):
            years = list( range(start,end+1) )
            years = [str(y) for y in years]
            text += " " + ' '.join(years)
    return text

In [220]:
def clean_mention(doc):
    # abbreviation disambiguation
    doc = expandAbbreviation(doc, abbreviations)
    
    #expand dates like 2000 to 2002 into 2002 2001 2002 #Fix2
    doc = expandDatesWithTo(doc)
    
    # split into tokens by white space
    tokens = doc.split()

    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    # make lower case
    tokens = [word.lower() for word in tokens]
    
    # remove remaining tokens that are not alphabetic
    #tokens = [word for word in tokens if not word.isalpha()]
    tokenTemp = tokens.copy()
    # treat dates
    tokens = treatDates(tokens)
    
    # filter out stop words
    #stop_words = set(stopwords.words('english'))
    #tokens = [w for w in tokens if not w in stop_words]
    
    # stemming
    tokens = [ps.stem(word) for word in tokens]
    
    #specialMapping
    tokens = [specialMapping(word) for word in tokens]
    

    return " ".join(tokens)

In [221]:
def min_max_scale(X):
    mini = np.min(X)
    #mini = 0
    maxi = np.max(X)
    if maxi == 0:
        return X
    return (X - mini) / (maxi - mini)

In [222]:
def min_max_2D(X):
    rowList = []
    for i in range(X.shape[0]):
        rowList.append(min_max_scale(X[i]))
    return np.array(rowList)

In [223]:
def zscore_scale(X):
    #mini = np.min(X)
    mean = np.mean(X)
    std = np.std(X)
    if std == 0:
        return np.zeros(len(X))
    return (X - mean) / (std)

In [224]:
def zscore_scale_2D(X):
    rowList = []
    for i in range(X.shape[0]):
        rowList.append(min_max_scale(X[i]))
    return np.array(rowList)

In [225]:
def getMaxyear(dateEntities):
    maxdate = 1900
    for de in dateEntities:
        dates = re.findall(r'([1-2][09][0-9]{2})', de.text)
        if len(dates) == 0:
            continue
        dates = [int(d) for d in dates]
        date = np.max(dates)
        if date > maxdate:
            maxdate = date
    return maxdate

## Debuf Functions

In [226]:
def print_Hits(sentence,score):
    s,d = zip(*sorted(zip(score, sentence)))
    for i in range(len(s)):
        print(s[i])
        print(d[i])
        print()

In [369]:
def getTrueLabels(samplePublications):
    sampleTextFiles = [p['text_file_name'] for p in samplePublications]
    file = 'publications.json'
    directory = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/'
    pub_json = pd.read_json(directory+file)

    pub_json.head()

    pub_json = pub_json[['text_file_name','publication_id']]
    pub_json = pub_json[pub_json.text_file_name.apply(lambda x: x in sampleTextFiles)]
    pub_json.head()

    file = 'data_set_citations.json'
    directory = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/'
    cit = pd.read_json(directory+file)

    cit = cit[['data_set_id', 'publication_id']]
    cit.head()

    pub_json = pub_json.merge(cit, left_on=['publication_id'], right_on=['publication_id'], how='left')
    pub_json.head()

    len(pub_json)

    pub_json = pub_json[['text_file_name','data_set_id']]
    pub_json.columns =  ['file','data_set_id']
    pub_json['file'] = pub_json.file.apply(lambda x : x.split('.')[0])
    pub_json['data_set_id'] = pub_json['data_set_id'].astype(int)
    pub_json['file'] = pub_json['file'].astype(int)
    return pub_json

In [354]:
def evaluationMetrics(trueLabels,resultsDF):
    truePair = set(zip(trueLabels.file.values, trueLabels.data_set_id.values))
    predPair = set(zip(resultsDF.publication_id.values, resultsDF.data_set_id.values))
    print(len(set(trueLabels.data_set_id.values).intersection(set(resultsDF.data_set_id.values))))
    print('Intersection : ',len(truePair.intersection(predPair)))
    print(truePair)
    print(predPair)
    recall = len(truePair.intersection(predPair)) / len(truePair)
    precision = len(truePair.intersection(predPair)) / len(predPair)
    if precision == 0 or recall == 0:
        return 0,0,0
    fscore = 2 * (precision * recall) / (precision + recall)
    return recall,precision,fscore

### Directories and Files Contstants

In [229]:
TEXT_DIRECTORY = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/files/text/'
CNN_TOKENIZER_File = 'CNNtokenizer.pickle'
CNN_MODEL_FILE = 'CNNmodel.json'
CNN_MODEL_WEIGHTS_FILE = "CNNmodel.h5"
DATASET_VOCAB_FILE = 'dataset_vocab_production.txt'
CNN_VOCAB_FILE = 'bmvocab.txt'
DATSETS_JSON_FILE = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/data_sets.json'
PROCESSED_DATASET_LINES = 'datasets_lines_production.txt'
PUBLICATIONS_JSON_FILE = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/publications.json'
OUTPUT_DIRECTORY = '/home/urwa/Documents/Coleridge/notebooks/SubsetAnalysis/Notes/'

In [230]:
# MAX_CNN_SEQ_LENGTH = 66
# HIT_THRESHHOLD = 0.9
# SIM_THRESHHOLD = 0.3
# GROUP_SIM_THRESHHOLD = 0.5

In [231]:
def save_citations(DF):
    data = DF.to_dict('records')
    with open(OUTPUT_DIRECTORY+'data_set_citations.json', 'w') as fp:
        json.dump(data, fp)

In [232]:
def save_mentions(DF):
    data = DF.to_dict('records')
    with open(OUTPUT_DIRECTORY+'data_set_mentions.json', 'w') as fp:
        json.dump(data, fp)

## Get the filenames

In [233]:
def getFileNames():
    txtFiles = os.listdir(TEXT_DIRECTORY)
    txtFiles = [t for t in txtFiles if not (t.startswith(".") and t.endswith('.txt')) ]
    labelledFiles = load_doc('labelledTextFiles.txt').split('\n')
    unlabbellledtxtFiles = [t for t in txtFiles if t not in labelledFiles]
    print("FileCount:",len(txtFiles))
    return labelledFiles
#Fix

In [234]:
## Returns ORG entities and sentences
def parseSpacy(text):
    text = excludeReference(text)
    spacydoc = nlp(text)
    sentences = list(spacydoc.sents)
    entities = [e for e in spacydoc.ents if e.label_ == 'ORG']
    dates = [e for e in spacydoc.ents if e.label_ == 'DATE']
    print("Sentences: ",len(sentences))
    return dates,sentences,entities

In [235]:
def filterSentencesByNer(sentences,entities):
    filteredSentences = [s.text for s in sentences if containsEntity(entities, s)]
    filteredSentences = list(set(filteredSentences))
    print("Filtered Sentences: ",len(filteredSentences))
    return filteredSentences

In [236]:
def removeSpecialCharacters(sentences):
    sentences = [s.replace('\n',' ') for s in sentences]
    sentences = [s.replace('\xad', '') for s in sentences]
    return sentences

In [237]:
def getMentionSentencesDF(filteredSentences, y_hat, y_prob, threshHold):
    DF = pd.DataFrame({'sentence':filteredSentences,'Pscore': y_prob})
    DF = DF[DF.Pscore > threshHold]
    print('Dataset Mentions: ',len(DF))
    return DF

In [238]:
def getDataSetProcessedLines():
    # each field as a sentence
    docs = load_doc(PROCESSED_DATASET_LINES).split('\n')
    return docs

In [239]:
def getDatasetNgramVectorizer(docs):
    # create the tokenizer
    vectorizer = TfidfVectorizer(ngram_range=(2, 4))
    # fit the tokenizer on the documents
    tfidVec = vectorizer.fit(docs)
    return tfidVec

In [240]:
def getSimilarityMatrix_sent_datasets(hitSents,vectorizerDataset,dataset_Ngram):   
    # prepare negative reviews
    sents = process_docs(hitSents, dataset_vocab)    
    # encode training data set
    sents_Ngram = vectorizerDataset.transform(sents)
    #print('Sentence TFID shape: ',sents_Ngram.shape)
    Cos_Sim = cosine_similarity(sents_Ngram, dataset_Ngram, dense_output=True)
    print('Cos Sim shape: ',Cos_Sim.shape)
    return Cos_Sim

In [241]:
def getDatasetCandidateMatchesDF(df,Cos_Sim,dataSetIds,dataSetTitles,datasetYears, sim_threshHold,pubId,pubYear):  
    #Cos_Sim = min_max_2D(Cos_Sim)
    DataLabel = []
    DataTitle = []
    sim_score = []
    for i in range(Cos_Sim.shape[0]):
        did = []
        dtit = []
        sscr = []
        for j in range(len(Cos_Sim[0])):
            if(Cos_Sim[i][j] > sim_threshHold and datasetYears[j] <= pubYear ):
                did.append(dataSetIds[j])
                dtit.append(dataSetTitles[j])
                sscr.append(Cos_Sim[i][j])
        DataLabel.append(did)
        DataTitle.append(dtit)
        sim_score.append(sscr)
    
    df['matches'] = getLabelLength(DataLabel)
    df['datasetIds'] = DataLabel
    df['data_sim_scores'] = sim_score
    df['datasetTitles'] = DataTitle
    df['pubID'] = pubId
    return df

In [242]:
def mergeSimilarDatasets(DF):
    datasetgroups = []
    for i in range(len(DF.datasetTitles)):
        tit = DF.datasetTitles.values[i]
        if not tit:
            continue
        found = False
        for j in range(len(datasetgroups)):
            if len(set(datasetgroups[j]).intersection(set(tit))) > 0:
                datasetgroups[j] = list(set(datasetgroups[j]).union(set(tit)))
                found = True
                break
        if not found:
            datasetgroups.append(tit)
    print('Numer of Groups: ',len(datasetgroups))
    print('Group Sizes',[len(d) for d in datasetgroups])
    return datasetgroups

In [243]:
def getGroupHitsSimMatrix(currentgroup,groupHits):
    # each field as a sentence
    docs = [ clean_mention(c) for c in currentgroup ]
    # create the tokenizer
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    # fit the tokenizer on the documents
    bow = vectorizer.fit(docs)
    # encode training data set
    sentence_bow = bow.transform(docs)
    #print(sentence_bow.shape)
    #each field as a sentence
    docs = [" ".join([ clean_mention(c) for c in groupHits])] 

    # encode training data set
    doc_bow = bow.transform(docs)
    #print(doc_bow.shape)
    
    data_sim = cosine_similarity(doc_bow, sentence_bow, dense_output=True)
    
    group_sim_scores = min_max_scale(data_sim.reshape(data_sim.shape[1]))
    return group_sim_scores

In [244]:
def citationScoring(mentions,score):
    scrs = []
    for i in range(len(score)):
        if len(mentions[i]) == 1:
            scrs.append(0.6 * score[i])
        if len(mentions[i]) == 2:
            scrs.append(0.7 * score[i])
        if len(mentions[i]) == 3:
            scrs.append(0.8 * score[i])
        if len(mentions[i]) >= 4:
            scrs.append(1 * score[i])       
    return scrs

In [396]:
def generateResults(datasetgroups,dataset_name_to_Id,DF,pubId,GROUP_SIM_THRESHHOLD):
    
    finalLabelList = []
    for i in range(len(datasetgroups)):
        currentgroup = datasetgroups[i]
        
        groupHits = [s for s,t in zip(DF.sentence,DF.datasetTitles) if len(set(t).intersection(set(currentgroup))) > 0 ]  #  
        
        if(len(currentgroup)<2):
            row = {}
            row['publication_id'] = pubId
            row['data_set_id'] = dataset_name_to_Id[currentgroup[0]]
            row['score'] = 1
            row['mention_list'] = groupHits
            finalLabelList.append(row)
            continue
        
        group_sim_scores = getGroupHitsSimMatrix(currentgroup,groupHits)
        print('Group ',i,' : ',group_sim_scores.shape)
        
        #print_Hits(currentgroup, group_sim_scores)
        hit_tit_scr = [ (tit,scr) for tit,scr in zip(currentgroup,group_sim_scores) if scr > GROUP_SIM_THRESHHOLD ]
        
        filterLen = min(len(groupHits) * 2,len(hit_tit_scr))
        hit_tit_scr.sort(key=lambda x: -x[1])
        hit_tit_scr = hit_tit_scr[:filterLen]
    
        for hid in hit_tit_scr:
            row = {}
            row['publication_id'] = pubId
            row['data_set_id'] = dataset_name_to_Id[hid[0]]
            row['score'] = hid[1]
            row['mention_list'] = groupHits
            finalLabelList.append(row)
            
    finalLabelDF = pd.DataFrame(finalLabelList)
    if len(finalLabelDF) > 0:
        finalLabelDF['score'] = citationScoring(finalLabelDF.mention_list.values, finalLabelDF.score.values)
    return finalLabelDF

In [395]:
a = [('one',1),('seven',7),('three',3),('nine',9)]
a.sort(key=lambda x: -x[1])
a

[('nine', 9), ('seven', 7), ('three', 3), ('one', 1)]

In [246]:
def mentionScoring(prob,matches,avgSimScore):
    if matches > 4:
        return 1
    if matches == 0:
        return 0.7 * prob
    if avgSimScore >= 0.5:
        return 0.9 * prob
    if avgSimScore < 0.5:
        return 0.8 * prob

In [247]:
def getMentionsResults(DF):
    DF['AvgSimScore'] = DF.data_sim_scores.apply(lambda x: np.mean(x) if len(x)>0 else 0)
    calScore = np.vectorize(mentionScoring)
    score = calScore(DF.Pscore.values, DF.matches.values, DF.AvgSimScore.values)
    DF['score'] = score
    DF = DF[['pubID','sentence','score']]
    DF.columns = ['publication_id', 'mention', 'score']
    return DF

In [248]:
def runPipeLine(publications, max_seq_len, hit_th , sim_th, group_sim_th):
    citationsDF = None
    mentionsDF = None
    for pub in publications:
        file = pub['text_file_name']
        pubId = pub['publication_id']
        txt = load_doc(TEXT_DIRECTORY+file)
        dates,sentences,entities = parseSpacy(txt)
        filteredSentences = filterSentencesByNer(sentences,entities)
        filteredSentences = removeSpecialCharacters(filteredSentences)
        
        pubYear = getMaxyear(dates)
        
        processed_lines = process_docs(filteredSentences, cnnVocab)
        encoded_docs = cnnTokenizer.texts_to_sequences(processed_lines)
        processed_sequences = pad_sequences(encoded_docs, maxlen=max_seq_len, padding='post')
        y_prob = model.predict(processed_sequences).reshape(len(processed_lines))
        y_hat = model.predict_classes(processed_sequences).reshape(len(processed_lines))

        classifierResultDF = getMentionSentencesDF(filteredSentences, y_hat, y_prob, hit_th)

        if len(classifierResultDF) < 1:
            print("No mentions for file : ",file)
            continue
    
        cosineSim_sent_dataset = getSimilarityMatrix_sent_datasets(classifierResultDF.sentence.values, \
                                                          vectorizerDataset,dataset_Ngram)

        candidateMatchesDF = getDatasetCandidateMatchesDF(classifierResultDF,cosineSim_sent_dataset, \
                                                          dataSetIds,dataSetTitles,dataSetYears, \
                                                          sim_th,pubId,pubYear)
        
        datasetGroupsTitles = mergeSimilarDatasets(candidateMatchesDF)

        resDf = generateResults(datasetGroupsTitles,dataset_name_to_Id,candidateMatchesDF,pubId, \
                                group_sim_th)
        
        
        mentionsdf = getMentionsResults(candidateMatchesDF)
        
        if mentionsDF is None:
            mentionsDF = mentionsdf
        else:
            mentionsDF = mentionsDF.append(mentionsdf)
    
        if citationsDF is None:
            citationsDF = resDf
        else:
            if len(resDf) < 1:
                print("No dataset matched mentions for file : ",file)
            else:
                citationsDF = citationsDF.append(resDf)
        print()   
    return citationsDF,mentionsDF

In [270]:
def evaluatePipeline(samplePublications,Th,Ts,Tgs):
    resultsDF,_ = runPipeLine(samplePublications,max_seq_len = 66, hit_th = Th, \
                            sim_th = Ts,group_sim_th = Tgs)
    print('ResultDf Length : ',len(resultsDF))
    if resultsDF is None:
        return {'Th':Th, 'Ts':Ts ,'Tgs':Tgs, 'recall': 0, 'precision': 0, 'fscore': 0}
    trueLabels = getTrueLabels(samplePublications)
    print('TrueLabel Length : ',len(trueLabels))
    recall,precision,fscore = evaluationMetrics(trueLabels,resultsDF)
    return {'Th':Th, 'Ts':Ts ,'Tgs':Tgs, 'recall': recall, 'precision': precision, 'fscore': fscore}

In [250]:
def get_publications():
    pubText = load_doc(PUBLICATIONS_JSON_FILE)
    pubJson = json.loads(pubText)
    return pubJson

In [251]:
#textFiles = getFileNames()
publications = get_publications()
publications = [p for p in publications if os.path.isfile(TEXT_DIRECTORY+p['text_file_name'])]
dataSetYears,dataSetIds,dataSetTitles = loadDataSetTitlesAndIds(DATSETS_JSON_FILE)
dataset_name_to_Id = dict(zip(dataSetTitles, dataSetIds))

cnnVocab = loadVocab(CNN_VOCAB_FILE)
cnnTokenizer = load_Keras_Tokenizer_CNN(CNN_TOKENIZER_File)
model = load_CNN_Sentece_Classifier(CNN_MODEL_FILE, CNN_MODEL_WEIGHTS_FILE)

dataset_vocab = loadVocab(DATASET_VOCAB_FILE)
datasetlines = getDataSetProcessedLines()
vectorizerDataset = getDatasetNgramVectorizer(datasetlines)
dataset_Ngram = vectorizerDataset.transform(datasetlines)
print('DataSet TFID shape: ',dataset_Ngram.shape)   


Loaded model from disk
DataSet TFID shape:  (10348, 66233)


In [403]:
os.path.isfile(TEXT_DIRECTORY+publications[0]['text_file_name'])

False

In [378]:
labelledFiles = load_doc('labelledTextFiles.txt').split('\n')
labelledPublications = [p for p in publications if p['text_file_name'] in labelledFiles]
sampleSize = 500
sampleIndex = np.random.randint(0,len(labelledPublications),sampleSize)
samplePublications = [t for i,t in enumerate(labelledPublications) if i in sampleIndex]
len(samplePublications)

459

In [398]:
resultsDF,matchDF = runPipeLine(samplePublications[:3],max_seq_len = 66, hit_th = 0.8, \
                            sim_th = 0.2,group_sim_th = 0.8)

Sentences:  392
Filtered Sentences:  56
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  2
Group Sizes [1, 2]
Group  1  :  (2,)



  import sys


Sentences:  770
Filtered Sentences:  118
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  3
Group Sizes [4, 3, 1]
Group  0  :  (4,)
Group  1  :  (3,)

Sentences:  792
Filtered Sentences:  195
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  2
Group Sizes [1, 18]
Group  1  :  (18,)



In [288]:
save_citations(resultsDF)
save_mentions(matchDF)

## Research Fields

In [95]:
SAGE_FIELDS_FILE = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/sage_research_fields.json'
SAGE_VOCAB_FILE =  'sage_fields_wiki_vocab.txt'  #'sage_fields_vocab.txt'
SAGE_FIELDS_LINES = 'wiki_fields_lines.txt' #'fields_lines.txt'
SAGE_WIKI_FIELDS = 'sage_fields_wiki.txt'

In [77]:
# turn a doc into clean tokens
def fields_clean_doc(doc):
    #
    doc = doc.replace('(general)','')
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    
    # make lower case
    tokens = [word.lower() for word in tokens]
    
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    
    # filter special words
    special_words = ['fieldaltlabel','fieldid','fieldlabel']
    tokens = [w for w in tokens if not w in special_words]
    
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [78]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	# load the doc
	doc = load_doc(filename)
	# clean doc
	tokens = fields_clean_doc(doc)
	# filter by vocab
	tokens = [w for w in tokens if w in vocab]
	return ' '.join(tokens)

In [79]:
# load all docs in a directory
def fields_process_docs(publications, vocab):
    lines = list()
    for pub in publications:
        path = TEXT_DIRECTORY+pub['text_file_name']
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines

In [80]:
def getFieldsSimMatrix(field_lines,pub_lines):
    # create the tokenizer
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    # fit the tokenizer on the documents
    fieldTokenizer = vectorizer.fit(field_lines)
    # encode training data set
    fieldsNgram = fieldTokenizer.transform(field_lines)
    #print(fieldsNgram.shape)
    # encode training data set
    pubNgram = fieldTokenizer.transform(pub_lines)
    #print(doc_bow.shape)
    data_sim = cosine_similarity(pubNgram,fieldsNgram,dense_output=True)
    #print(data_sim.shape)
    #group_sim_scores = min_max_scale(data_sim.reshape(data_sim.shape[1]))
    return data_sim

In [99]:
def getSageFieldsWiki(publications):
    fields = load_doc(SAGE_WIKI_FIELDS).split('\n')

    sageVocab = load_doc(SAGE_VOCAB_FILE).split()
    processed_lines = fields_process_docs(publications, sageVocab)
    field_lines = load_doc(SAGE_FIELDS_LINES).split('\n')
    sim =getFieldsSimMatrix(field_lines, processed_lines)
    fieldLabels = [fields[np.argmax(c_s)] for c_s in sim]

    #print(fieldLabels)
    pubIds = [p['publication_id'] for p in publications]
    score = np.ones(len(publications))
    return pd.DataFrame({'publication_id': pubIds, 'research_field': fieldLabels, 'score': score}).to_dict('records')

In [91]:
def getSageFields(publications):
    text = load_doc(SAGE_FIELDS_FILE)
    sage_fields_json = json.loads(text)
    fields = list(sage_fields_json.keys())

    sageVocab = load_doc(SAGE_VOCAB_FILE).split()
    processed_lines = fields_process_docs(publications, sageVocab)
    field_lines = load_doc(SAGE_FIELDS_LINES).split('\n')
    sim =getFieldsSimMatrix(field_lines, processed_lines)
    fieldLabels = [fields[np.argmax(c_s)] for c_s in sim]

    SubFieldLabels = []
    for i in range(len(processed_lines)):
        subFieldJson = sage_fields_json[fieldLabels[i]]
        subFields = list(subFieldJson.keys())
        subLines = []
        for sf in subFields:
            subLines.append(' '.join(clean_doc(str(subFieldJson[sf]))))
        subSim =getFieldsSimMatrix(subLines, [processed_lines[i]])
        label = subFields[np.argmax(subSim[0])]
        SubFieldLabels.append(label)

    finalLabels = [f+" : "+s for f,s in zip(fieldLabels,SubFieldLabels)]
    #print(finalLabels)
    pubIds = [p['publication_id'] for p in publications]
    score = np.ones(len(publications))
    return pd.DataFrame({'publication_id': pubIds, 'research_field': finalLabels, 'score': score}).to_dict('records')

In [100]:
sageFieldsJson = getSageFieldsWiki(samplePublications)
with open(OUTPUT_DIRECTORY+'research_fields.json', 'w') as fp:
        json.dump(sageFieldsJson, fp)

## Research Methods

In [105]:
SAGE_METHODS_VOCAB = 'sage_methods_vocab.txt'

In [127]:
def getSageMethods(publications):
    sageMethodVocab = load_doc(SAGE_METHODS_VOCAB).split('\n')
    methodsList = []
    for pub in publications:
        txt = load_doc(TEXT_DIRECTORY + pub['text_file_name'])
        tok = txt.split()
        # remove punctuation from each token
        table = str.maketrans('', '', string.punctuation)
        tok = [w.translate(table) for w in tok]
        # remove remaining tokens that are not alphabetic
        tok = [word for word in tok if word.isalpha()]
        # remove the tokens not in vocab
        tok = [word for word in tok if word in sageMethodVocab]
        # count occurences
        counter = Counter()
        counter.update(tok)
        # dictionary to score by count
        scoreDict = {4:0.2, 5:0.4, 6:0.6, 7:0.8, 8:1}
        # methods and their counts
        meth = [k for k,c in counter.items() if c > 3]
        cnt = [c if c<=8 else 8 for k,c in counter.items() if c > 3]
        # computing score
        scr = [scoreDict[k] for k in cnt]
        # populating list of dictionary
        mList = []
        for i in range(len(meth)):
            row = {}
            row['publication_id'] = pub['publication_id']
            row['method'] = meth[i]
            row['score'] = scr[i]
            mList.append(row)
        methodsList += mList
    return methodsList

In [128]:
sageMethodsJson = getSageMethods(samplePublications)
with open(OUTPUT_DIRECTORY+'methods.json', 'w') as fp:
        json.dump(sageFieldsJson, fp)

## Parameter Tuning

In [399]:
%%time
import itertools

parameterResults = []
ThList = [0.8]
TsList = [0.1]
TgsList = [0.6]

parameterSpace = [ThList, TsList, TgsList]

for Th,Ts,Tgs in list(itertools.product(*parameterSpace)):
    par_res = evaluatePipeline(samplePublications,Th,Ts,Tgs)
    parameterResults.append(par_res)

Sentences:  392
Filtered Sentences:  56
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  2
Group Sizes [7, 2]
Group  0  :  (7,)
Group  1  :  (2,)

Sentences:  770
Filtered Sentences:  118
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  3
Group Sizes [8, 3, 2]
Group  0  :  (8,)
Group  1  :  (3,)
Group  2  :  (2,)

Sentences:  792
Filtered Sentences:  195
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  2
Group Sizes [4, 1]
Group  0  :  (4,)

Sentences:  580
Filtered Sentences:  55
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  2
Group Sizes [1, 6]
Group  1  :  (6,)

Sentences:  194
Filtered Sentences:  85
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  1
Group Sizes [33]
Group  0  :  (33,)

Sentences:  345
Filtered Sentences:  41
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  2
Group Sizes [405, 3]
Group  0  :  (405,)
Group  1  :  (3,)

Sentences:  200
Filtered Sentences:  26
Data

  import sys


Sentences:  278
Filtered Sentences:  83
Dataset Mentions:  15
Cos Sim shape:  (15, 10348)
Numer of Groups:  1
Group Sizes [23]
Group  0  :  (23,)

Sentences:  123
Filtered Sentences:  22
Dataset Mentions:  4
Cos Sim shape:  (4, 10348)
Numer of Groups:  2
Group Sizes [21, 50]
Group  0  :  (21,)
Group  1  :  (50,)

Sentences:  230
Filtered Sentences:  38
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  3
Group Sizes [5, 47, 40]
Group  0  :  (5,)
Group  1  :  (47,)
Group  2  :  (40,)

Sentences:  245
Filtered Sentences:  22
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  3
Group Sizes [3, 1, 5]
Group  0  :  (3,)
Group  2  :  (5,)

Sentences:  570
Filtered Sentences:  46
Dataset Mentions:  8
Cos Sim shape:  (8, 10348)
Numer of Groups:  3
Group Sizes [4, 3, 1]
Group  0  :  (4,)
Group  1  :  (3,)

Sentences:  801
Filtered Sentences:  88
Dataset Mentions:  12
Cos Sim shape:  (12, 10348)
Numer of Groups:  3
Group Sizes [7, 1, 7]
Group  0  :  (7,)
Group  2  : 

Group  0  :  (98,)

Sentences:  422
Filtered Sentences:  102
Dataset Mentions:  21
Cos Sim shape:  (21, 10348)
Numer of Groups:  7
Group Sizes [1, 1, 17, 5, 72, 4, 5]
Group  2  :  (17,)
Group  3  :  (5,)
Group  4  :  (72,)
Group  5  :  (4,)
Group  6  :  (5,)

Sentences:  547
Filtered Sentences:  127
Dataset Mentions:  30
Cos Sim shape:  (30, 10348)
Numer of Groups:  1
Group Sizes [136]
Group  0  :  (136,)

Sentences:  338
Filtered Sentences:  83
Dataset Mentions:  2
Cos Sim shape:  (2, 10348)
Numer of Groups:  2
Group Sizes [43, 45]
Group  0  :  (43,)
Group  1  :  (45,)

Sentences:  523
Filtered Sentences:  68
Dataset Mentions:  20
Cos Sim shape:  (20, 10348)
Numer of Groups:  6
Group Sizes [12, 15, 19, 12, 7, 2]
Group  0  :  (12,)
Group  1  :  (15,)
Group  2  :  (19,)
Group  3  :  (12,)
Group  4  :  (7,)
Group  5  :  (2,)

Sentences:  312
Filtered Sentences:  127
Dataset Mentions:  14
Cos Sim shape:  (14, 10348)
Numer of Groups:  1
Group Sizes [86]
Group  0  :  (86,)

Sentences:  323


Dataset Mentions:  1
Cos Sim shape:  (1, 10348)
Numer of Groups:  0
Group Sizes []
No dataset matched mentions for file :  726.txt

Sentences:  217
Filtered Sentences:  60
Dataset Mentions:  20
Cos Sim shape:  (20, 10348)
Numer of Groups:  1
Group Sizes [258]
Group  0  :  (258,)

Sentences:  445
Filtered Sentences:  68
Dataset Mentions:  13
Cos Sim shape:  (13, 10348)
Numer of Groups:  1
Group Sizes [4]
Group  0  :  (4,)

Sentences:  229
Filtered Sentences:  65
Dataset Mentions:  14
Cos Sim shape:  (14, 10348)
Numer of Groups:  1
Group Sizes [42]
Group  0  :  (42,)

Sentences:  263
Filtered Sentences:  38
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  2
Group Sizes [32, 4]
Group  0  :  (32,)
Group  1  :  (4,)

Sentences:  391
Filtered Sentences:  58
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  1
Group Sizes [38]
Group  0  :  (38,)

Sentences:  209
Filtered Sentences:  88
Dataset Mentions:  12
Cos Sim shape:  (12, 10348)
Numer of Groups:  3
Group 

Group  0  :  (51,)
Group  1  :  (2,)

Sentences:  254
Filtered Sentences:  71
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  2
Group Sizes [6, 1]
Group  0  :  (6,)

Sentences:  399
Filtered Sentences:  160
Dataset Mentions:  14
Cos Sim shape:  (14, 10348)
Numer of Groups:  3
Group Sizes [11, 1, 4]
Group  0  :  (11,)
Group  2  :  (4,)

Sentences:  616
Filtered Sentences:  85
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  2
Group Sizes [80, 11]
Group  0  :  (80,)
Group  1  :  (11,)

Sentences:  955
Filtered Sentences:  138
Dataset Mentions:  7
Cos Sim shape:  (7, 10348)
Numer of Groups:  1
Group Sizes [3]
Group  0  :  (3,)

Sentences:  339
Filtered Sentences:  85
Dataset Mentions:  17
Cos Sim shape:  (17, 10348)
Numer of Groups:  1
Group Sizes [53]
Group  0  :  (53,)

Sentences:  299
Filtered Sentences:  60
Dataset Mentions:  4
Cos Sim shape:  (4, 10348)
Numer of Groups:  1
Group Sizes [1]

Sentences:  485
Filtered Sentences:  127
Dataset Mentions:  

Dataset Mentions:  8
Cos Sim shape:  (8, 10348)
Numer of Groups:  1
Group Sizes [46]
Group  0  :  (46,)

Sentences:  312
Filtered Sentences:  88
Dataset Mentions:  38
Cos Sim shape:  (38, 10348)
Numer of Groups:  4
Group Sizes [177, 4, 1, 4]
Group  0  :  (177,)
Group  1  :  (4,)
Group  3  :  (4,)

Sentences:  216
Filtered Sentences:  65
Dataset Mentions:  7
Cos Sim shape:  (7, 10348)
Numer of Groups:  1
Group Sizes [51]
Group  0  :  (51,)

Sentences:  246
Filtered Sentences:  70
Dataset Mentions:  24
Cos Sim shape:  (24, 10348)
Numer of Groups:  1
Group Sizes [92]
Group  0  :  (92,)

Sentences:  262
Filtered Sentences:  42
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  1
Group Sizes [42]
Group  0  :  (42,)

Sentences:  246
Filtered Sentences:  74
Dataset Mentions:  11
Cos Sim shape:  (11, 10348)
Numer of Groups:  1
Group Sizes [54]
Group  0  :  (54,)

Sentences:  198
Filtered Sentences:  46
Dataset Mentions:  2
Cos Sim shape:  (2, 10348)
Numer of Groups:  1
Group Siz

Sentences:  301
Filtered Sentences:  37
Dataset Mentions:  7
Cos Sim shape:  (7, 10348)
Numer of Groups:  4
Group Sizes [6, 5, 1, 8]
Group  0  :  (6,)
Group  1  :  (5,)
Group  3  :  (8,)

Sentences:  196
Filtered Sentences:  52
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  1
Group Sizes [62]
Group  0  :  (62,)

Sentences:  315
Filtered Sentences:  93
Dataset Mentions:  26
Cos Sim shape:  (26, 10348)
Numer of Groups:  1
Group Sizes [96]
Group  0  :  (96,)

Sentences:  449
Filtered Sentences:  128
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  2
Group Sizes [1, 1]

Sentences:  373
Filtered Sentences:  89
Dataset Mentions:  10
Cos Sim shape:  (10, 10348)
Numer of Groups:  3
Group Sizes [2, 11, 35]
Group  0  :  (2,)
Group  1  :  (11,)
Group  2  :  (35,)

Sentences:  244
Filtered Sentences:  52
Dataset Mentions:  13
Cos Sim shape:  (13, 10348)
Numer of Groups:  1
Group Sizes [82]
Group  0  :  (82,)

Sentences:  160
Filtered Sentences:  26
Dataset Menti

Group  0  :  (197,)

Sentences:  165
Filtered Sentences:  41
Dataset Mentions:  12
Cos Sim shape:  (12, 10348)
Numer of Groups:  1
Group Sizes [9]
Group  0  :  (9,)

Sentences:  354
Filtered Sentences:  67
Dataset Mentions:  11
Cos Sim shape:  (11, 10348)
Numer of Groups:  1
Group Sizes [16]
Group  0  :  (16,)

Sentences:  238
Filtered Sentences:  83
Dataset Mentions:  26
Cos Sim shape:  (26, 10348)
Numer of Groups:  3
Group Sizes [75, 4, 1]
Group  0  :  (75,)
Group  1  :  (4,)

Sentences:  533
Filtered Sentences:  43
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  6
Group Sizes [2, 10, 4, 2, 2, 5]
Group  0  :  (2,)
Group  1  :  (10,)
Group  2  :  (4,)
Group  3  :  (2,)
Group  4  :  (2,)
Group  5  :  (5,)

Sentences:  170
Filtered Sentences:  83
Dataset Mentions:  12
Cos Sim shape:  (12, 10348)
Numer of Groups:  3
Group Sizes [30, 2, 65]
Group  0  :  (30,)
Group  1  :  (2,)
Group  2  :  (65,)

Sentences:  452
Filtered Sentences:  46
Dataset Mentions:  0
No mentions fo

Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  3
Group Sizes [2, 51, 3]
Group  0  :  (2,)
Group  1  :  (51,)
Group  2  :  (3,)

Sentences:  136
Filtered Sentences:  26
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  1
Group Sizes [6]
Group  0  :  (6,)

Sentences:  514
Filtered Sentences:  109
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  2
Group Sizes [321, 4]
Group  0  :  (321,)
Group  1  :  (4,)

Sentences:  188
Filtered Sentences:  63
Dataset Mentions:  7
Cos Sim shape:  (7, 10348)
Numer of Groups:  5
Group Sizes [2, 3, 3, 2, 76]
Group  0  :  (2,)
Group  1  :  (3,)
Group  2  :  (3,)
Group  3  :  (2,)
Group  4  :  (76,)

Sentences:  500
Filtered Sentences:  146
Dataset Mentions:  11
Cos Sim shape:  (11, 10348)
Numer of Groups:  2
Group Sizes [24, 3]
Group  0  :  (24,)
Group  1  :  (3,)

Sentences:  526
Filtered Sentences:  58
Dataset Mentions:  4
Cos Sim shape:  (4, 10348)
Numer of Groups:  1
Group Sizes [182]
Group  0  :  (182

Dataset Mentions:  20
Cos Sim shape:  (20, 10348)
Numer of Groups:  6
Group Sizes [52, 9, 7, 2, 22, 1]
Group  0  :  (52,)
Group  1  :  (9,)
Group  2  :  (7,)
Group  3  :  (2,)
Group  4  :  (22,)

Sentences:  424
Filtered Sentences:  99
Dataset Mentions:  27
Cos Sim shape:  (27, 10348)
Numer of Groups:  7
Group Sizes [1, 116, 17, 2, 83, 14, 11]
Group  1  :  (116,)
Group  2  :  (17,)
Group  3  :  (2,)
Group  4  :  (83,)
Group  5  :  (14,)
Group  6  :  (11,)

Sentences:  289
Filtered Sentences:  89
Dataset Mentions:  13
Cos Sim shape:  (13, 10348)
Numer of Groups:  1
Group Sizes [1]

Sentences:  493
Filtered Sentences:  125
Dataset Mentions:  5
Cos Sim shape:  (5, 10348)
Numer of Groups:  2
Group Sizes [1, 3]
Group  1  :  (3,)

Sentences:  349
Filtered Sentences:  131
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  2
Group Sizes [110, 11]
Group  0  :  (110,)
Group  1  :  (11,)

Sentences:  434
Filtered Sentences:  51
Dataset Mentions:  10
Cos Sim shape:  (10, 10348)
Nume

Sentences:  330
Filtered Sentences:  87
Dataset Mentions:  7
Cos Sim shape:  (7, 10348)
Numer of Groups:  1
Group Sizes [3]
Group  0  :  (3,)

Sentences:  262
Filtered Sentences:  53
Dataset Mentions:  4
Cos Sim shape:  (4, 10348)
Numer of Groups:  0
Group Sizes []
No dataset matched mentions for file :  2982.txt

Sentences:  334
Filtered Sentences:  53
Dataset Mentions:  19
Cos Sim shape:  (19, 10348)
Numer of Groups:  2
Group Sizes [130, 7]
Group  0  :  (130,)
Group  1  :  (7,)

Sentences:  447
Filtered Sentences:  135
Dataset Mentions:  23
Cos Sim shape:  (23, 10348)
Numer of Groups:  3
Group Sizes [464, 292, 3]
Group  0  :  (464,)
Group  1  :  (292,)
Group  2  :  (3,)

Sentences:  344
Filtered Sentences:  115
Dataset Mentions:  9
Cos Sim shape:  (9, 10348)
Numer of Groups:  1
Group Sizes [27]
Group  0  :  (27,)

Sentences:  200
Filtered Sentences:  58
Dataset Mentions:  6
Cos Sim shape:  (6, 10348)
Numer of Groups:  1
Group Sizes [82]
Group  0  :  (82,)

Sentences:  291
Filtered Se

In [None]:
evalDf = pd.DataFrame(parameterResults)
evalDf.sort_values(by='fscore', axis=0, ascending=False)