In [9]:
"""Demo of CoreNLP mention recognition accessed through a python wrapper."""

from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
text = 'Ryan Amaral is a student at Dalhousie University in Halifax, Nova Scotia. It is a cool school.'
output = nlp.annotate(text, properties={
    'annotators': 'entitymentions',
    'outputFormat': 'json'
})
for mention in output['sentences'][0]['entitymentions']:
    print mention['text']
    
print output['sentences'][0]['entitymentions']

Ryan Amaral
Dalhousie University
Halifax
Nova Scotia
[{u'tokenBegin': 0, u'tokenEnd': 2, u'docTokenBegin': 0, u'characterOffsetEnd': 11, u'docTokenEnd': 2, u'text': u'Ryan Amaral', u'characterOffsetBegin': 0, u'ner': u'PERSON'}, {u'tokenBegin': 6, u'tokenEnd': 8, u'docTokenBegin': 6, u'characterOffsetEnd': 48, u'docTokenEnd': 8, u'text': u'Dalhousie University', u'characterOffsetBegin': 28, u'ner': u'ORGANIZATION'}, {u'tokenBegin': 9, u'tokenEnd': 10, u'docTokenBegin': 9, u'characterOffsetEnd': 59, u'docTokenEnd': 10, u'text': u'Halifax', u'characterOffsetBegin': 52, u'ner': u'LOCATION'}, {u'tokenBegin': 11, u'tokenEnd': 13, u'docTokenBegin': 11, u'characterOffsetEnd': 72, u'docTokenEnd': 13, u'text': u'Nova Scotia', u'characterOffsetBegin': 61, u'ner': u'LOCATION'}]


In [15]:
%%writefile gen-mention-data.py
from __future__ import division
"""
Generate data to be used for entity recognition.
"""

"""
Mention Data of form [pos before, pos on, pos after, mentions prob].
[0:3] are to be One Hot Encoded.
"""


import requests
import json
import os
from wikification import *
from wikipedia import title2id
import copy
import sys
import copy
import nltk
from pycorenlp import StanfordCoreNLP
scnlp = StanfordCoreNLP('http://localhost:9000')

# need this for novelty detection
#from sklearn.svm import OneClassSVM

# for pos on pos parts
#ohe = OneHotEncoder(n_values=[46,46,46])

# Lookup table from POS tag string to an integer id (47 entries, ids 0-46).
# NOTE(review): nothing in the visible code reads posDict; the feature loop
# uses posBefDict / posCurDict / posAftDict instead.
# The tags look like the Penn Treebank tag set -- TODO confirm.
# "RB" was appended last (id 46) instead of in alphabetical position, and both
# "None" and "NONE" have their own ids.
posDict = {
    "$":0,
    "''":1,
    "(":2,
    ")":3,
    ",":4,
    "--":5,
    ".":6,
    ":":7,
    "CC":8,
    "CD":9,
    "DT":10,
    "EX":11,
    "FW":12,
    "IN":13,
    "JJ":14,
    "JJR":15,
    "JJS":16,
    "LS":17,
    "MD":18,
    "NN":19,
    "NNP":20,
    "NNPS":21,
    "NNS":22,
    "PDT":23,
    "POS":24,
    "PRP":25,
    "PRP$":26,
    "RBR":27,
    "RBS":28,
    "RP":29,
    "SYM":30,
    "TO":31,
    "UH":32,
    "VB":33,
    "VBD":34,
    "VBG":35,
    "VBN":36,
    "VBP":37,
    "VBZ":38,
    "WDT":39,
    "WP":40,
    "WP$":41,
    "WRB":42,
    "``":43,
    "None":44,
    "NONE":45,
    "RB":46
}

# Category id for the POS tag of the token *before* the current one.
# The feature loop maps every tag that is not listed here (including the
# 'NONE' placeholder it uses for the first token) to the 'FAIL' id.
# 12 categories in total.
posBefDict = {
    'IN':0,
    'DT':1,
    'NNP':2,
    'JJ':3,
    ',':4,
    'CC':5,
    'NN':6,
    'VBD':7,
    'CD':8,
    '(':9,
    'TO':10,
    'FAIL':11
}

# Category id for the POS tag of the current token itself.
# Tags that are not listed are mapped to the 'FAIL' id by the feature loop.
# 7 categories in total.
posCurDict = {
    'NNP':0,
    'NN':1,
    'JJ':2,
    'NNS':3,
    'CD':4,
    'NNPS':5,
    'FAIL':6
}

# Category id for the POS tag of the token *after* the current one.
# The feature loop maps every tag that is not listed here (including the
# 'NONE' placeholder it uses for the last token) to the 'FAIL' id.
# 13 categories in total.
posAftDict = {
    ',':0,
    '.':1,
    'IN':2,
    'NNP':3,
    'CC':4,
    'NN':5,
    'VBD':6,
    ':':7,
    'VBZ':8,
    'POS':9,
    'NNS':10,
    'TO':11,
    'FAIL':12
}

def normalize(nums):
    """
    Description:
        Scales every value of nums by one shared divisor, sum(nums) + 1.
    Args:
        nums: List of numbers.
    Return:
        A new list holding each value divided by (sum(nums) + 1). The input
        list is left untouched.
    """

    divisor = 1 + sum(nums)
    return [value/divisor for value in nums]

pathStrt = '/users/cs/amaral/wsd-datasets'
dsPath = os.path.join(pathStrt, 'wiki-mentions.30000.json')

# generated feature rows, one list per token
newData = []

# False: keep only mention tokens (novelty detection data)
# True: keep every token and append a class label (classification data)
nonMentions = True

# window of the dataset that is read: lines [skip, skip + amount)
skip = 0
amount = 30000
dataLines = []

with open(dsPath, 'r') as dataFile:
    for lineIdx, rawLine in enumerate(dataFile):
        if lineIdx >= skip:
            dataLines.append(json.loads(rawLine.decode('utf-8').strip()))
        # stop once the last line of the window has been consumed
        if lineIdx + 1 >= skip + amount:
            break
            
errors = 0 # lines skipped because the CoreNLP request (or its response) failed

lnum = 0
for line in dataLines:

    oMentions = copy.deepcopy(line['mentions']) # mentions in original form
    # Token indices at which a true mention starts. Built once per line as a
    # set so the per-token membership tests below do not rebuild a list.
    mentionStarts = set(item[0] for item in oMentions)
    oText = " ".join(copy.deepcopy(line['text']))
    line['mentions'] = mentionStartsAndEnds(line, True)

    # Get POS tags of all text
    postrs = nltk.pos_tag(copy.deepcopy(line['text']))

    # Get stanford core mentions. The response is unpacked inside the try as
    # well, so a reply without the expected 'sentences' structure is counted
    # as an error for this line instead of aborting the whole run.
    try:
        stnfrdMentions0 = scnlp.annotate(oText.encode('utf-8'), properties={
                'annotators': 'entitymentions',
                'outputFormat': 'json'})
        stnfrdMentions = []
        for sentence in stnfrdMentions0['sentences']:
            for mention in sentence['entitymentions']:
                stnfrdMentions.append(mention['text'])
    except Exception: # not a bare except: Ctrl-C must still stop the script
        errors += 1
        print 'Error #' + str(errors) + ' on line #' + str(lnum)
        lnum += 1
        continue

    tokens = line['text']
    for i in range(len(tokens)):

        isMentionStart = i in mentionStarts
        if nonMentions == False and not isMentionStart:
            continue

        token = tokens[i]

        # POS tag categories of the token before, on, and after position i.
        # A missing neighbour uses 'NONE', which (like any tag that is not in
        # the lookup table) ends up in the 'FAIL' category.
        if i == 0:
            befTag = 'NONE'
        else:
            befTag = postrs[i-1][1]
        onTag = postrs[i][1]
        if i == len(tokens) - 1:
            aftTag = 'NONE'
        else:
            aftTag = postrs[i+1][1]

        row = [
            posBefDict.get(befTag, posBefDict['FAIL']),
            posCurDict.get(onTag, posCurDict['FAIL']),
            posAftDict.get(aftTag, posAftDict['FAIL'])
        ]

        # Mention probability.
        row.append(mentionProb(token))

        # Whether Stanford NER reported exactly this token text as a mention.
        # Every reported mention is consumed at most once.
        if token in stnfrdMentions:
            stnfrdMentions.remove(token)
            row.append(1)
        else:
            row.append(0)

        # Whether starts with capital. Slicing instead of indexing so an
        # empty token yields 0 rather than raising IndexError.
        row.append(1 if token[:1].isupper() else 0)

        # Whether there is an exact match in Wikipedia.
        row.append(1 if title2id(token) is not None else 0)

        # Whether word contains a space.
        row.append(1 if ' ' in token else 0)

        # Whether the word contains only ascii characters. UnicodeError
        # covers both the encode and the decode failure of this call.
        try:
            token.decode('ascii')
            row.append(1)
        except UnicodeError:
            row.append(0)

        # put in whether is mention or not only if including nonMentions
        if nonMentions == True:
            row.append(1 if isMentionStart else 0)

        newData.append(row)

    lnum += 1
    print 'Line: ' + str(lnum)

# nov for novelty, cls for classification
#with open('/users/cs/amaral/wikisim/wikification/learning-data/er-10000-nov.txt', 'w') as f:
with open('/users/cs/amaral/wikisim/wikification/learning-data/er-30000-cls-v2.txt', 'w') as f:
    for data in newData:
        # str(list) without the surrounding brackets: a comma separated row
        f.write(str(data)[1:-1] + '\n')

Overwriting gen-mention-data.py


In [2]:
#%%writefile er-model-create.py
from __future__ import division

"""
Create the machine learned models for detection of mentions
"""

import os
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
# One-hot encodes columns 0, 1 and 2 with 12, 7 and 13 possible values.
# These sizes equal the entry counts of posBefDict, posCurDict and posAftDict
# in the data generation script.
# NOTE(review): presumably those are the three POS columns of each row -- confirm
# against the files actually read by createModels.
enc = OneHotEncoder(n_values = [12,7,13], categorical_features = [0,1,2])

def createModels(isNovel, posNegRatio = 1, posToUse = 48428, scores = None):
    """
    Description:
        Tests different models on er.
    Args:
        isNovel: Whether to use novelty detection, or classification algorithms.
        posNegRatio: How many negative examples to have per positive example.
        posToUse: Amount of positive examples to use (48428 from 10000 dataset).
    Return:
        Nothing, but creates the models and saves the best.
    """

    poss = [] # positive instances
    negs = [] # negative instances

    # with 10000 file
        # poss: 48428
        # negs: 454139
        # about 1:10 pos to neg
    
    negToUse = posToUse * posNegRatio

    if isNovel == True:
        
        # train it on 80% of positive then test on 20% of positive with the 
        # same amount of negatives
        
        # get the nov datasets
        # positive examples
        with open('/users/cs/amaral/wikisim/wikification/learning-data/er-10000-nov.txt', 'r') as f:
            for line in f:
                data = line.split(',')
                for i in range(len(data)):
                    if i == 3: # float data
                        data[i] = float(data[i])
                    else: # int data
                        data[i] = int(data[i])
                poss.append(data)
        
        # some negatives
        with open('/users/cs/amaral/wikisim/wikification/learning-data/er-10000-cls-neg.txt', 'r') as f:
            for line in f:
                data = line.split(',')
                for i in range(len(data)):
                    if i == 3: # float data
                        data[i] = float(data[i])
                    else: # int data
                        data[i] = int(data[i])
                negs.append(data)
                
        poss = shuffle(poss)
        
        trainAmount = long(0.8 * len(poss))
        testAmount = long(0.2 * len(poss))

        XTrain = poss[0:trainAmount]
        
        XTestPos = poss[trainAmount:trainAmount + testAmount]
        XTestNeg = [data[:-1] for data in shuffle(negs)[:testAmount]]
        
        from sklearn.svm import OneClassSVM
        
        svm = OneClassSVM()
        svm.fit(XTrain)
        
        posPred = svm.predict(XTestPos)
        negPred = svm.predict(XTestNeg)
        
        wrong = 0
        for pred in posPred:
            if pred == -1:
                wrong += 1
        print 'Positive Accuracy:', 1 - wrong/testAmount
        
        wrong = 0
        for pred in negPred:
            if pred == 1:
                wrong += 1
        print 'Negative Accuracy:', 1 - wrong/testAmount
        
    else:
        # get the cls datasets
        # positive examples
        with open('/users/cs/amaral/wikisim/wikification/learning-data/er-30000-cls-pos-v2.txt', 'r') as f:
            for line in f:
                data = line.split(',')
                for i in range(len(data)):
                    if i == 3: # float data
                        data[i] = float(data[i])
                    else: # int data
                        data[i] = int(data[i])
                poss.append(data)

        with open('/users/cs/amaral/wikisim/wikification/learning-data/er-30000-cls-neg-v2.txt', 'r') as f:
            for line in f:
                data = line.split(',')
                for i in range(len(data)):
                    if i == 3: # float data
                        data[i] = float(data[i])
                    else: # int data
                        data[i] = int(data[i])
                
                negs.append(data)

        poss = shuffle(poss)[:posToUse]
        negs = shuffle(negs)[:negToUse]
        poss.extend(negs)
        allData = shuffle(poss)
        
        enc.fit(allData)
        allData = enc.transform(allData).toarray()
        
        print allData[0]

        trainAmount = long(0.8 * len(allData))
        valiAmount = long(0.2 * len(allData))
        testAmount = long(0.0 * len(allData))

        XTrain = [data[:-1] for data in allData[0:trainAmount]]
        yTrain = [data[-1] for data in allData[0:trainAmount]]

        XVali = [data[:-1] for data in allData[trainAmount:trainAmount + valiAmount]]
        yVali = [data[-1] for data in allData[trainAmount:trainAmount + valiAmount]]

        XTest = [data[:-1] for data in allData[trainAmount + valiAmount:trainAmount + valiAmount + testAmount]]
        yTest= [data[-1] for data in allData[trainAmount + valiAmount:trainAmount + valiAmount + testAmount]]

        # try these classifiers
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.ensemble import BaggingClassifier
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.svm import LinearSVC
        from sklearn.svm import SVC
        
        from sklearn.metrics import classification_report
        
        #print 'adaboost:'
        #abc = AdaBoostClassifier(n_estimators=300)
        #abc.fit(XTrain, yTrain)
        #print classification_report(yVali, abc.predict(XVali))
        #scores['abc'] = abc.score(XVali, yVali)
        #print 'adaboost:', scores['abc']

        print 'bagging:'
        bgc = BaggingClassifier(verbose=0, n_estimators=300)
        bgc.fit(XTrain, yTrain)
        #print classification_report(yVali, bgc.predict(XVali))
        #scores['bgc'] = bgc.score(XVali, yVali)
        #print 'bagging:', scores['bgc']

        #print 'extra trees:'
        #etc = ExtraTreesClassifier(verbose=0, n_estimators=300, min_samples_split=5)
        #etc.fit(XTrain, yTrain)
        #print classification_report(yVali, etc.predict(XVali))
        #scores['etc'] = etc.score(XVali, yVali)
        #print 'extra trees:', scores['etc']

        #print 'gradient boosting:'
        #gbc = GradientBoostingClassifier(verbose=0, n_estimators=300, min_samples_split=5)
        #gbc.fit(XTrain, yTrain)
        #print classification_report(yVali, gbc.predict(XVali))
        #scores['gbc'] = gbc.score(XVali, yVali)
        #print 'gradient boosting:', scores['gbc']
        
        # save gradient boosting cause is tied for best and small size
        import pickle
        pickle.dump(bgc, open('/users/cs/amaral/wikisim/wikification/ml-models/er/er-model-bgc-30000.pkl', 'wb'))
        #k = 1/0

        #print 'random forest:'
        #rfc = RandomForestClassifier(verbose=0, n_estimators=300, min_samples_split=5)
        #rfc.fit(XTrain, yTrain)
        #print classification_report(yVali, rfc.predict(XVali))
        #scores['rfc'] = rfc.score(XVali, yVali)
        #print 'random forest:', scores['rfc']

        #print 'linear svc:'
        #lsvc = LinearSVC(verbose=0)
        #lsvc.fit(XTrain, yTrain)
        #print classification_report(yVali, lsvc.predict(XVali))
        #scores['lsvc'] = lsvc.score(XVali, yVali)
        #print 'linear svc:', scores['lsvc']

        #print 'svc:'
        #svc = SVC(verbose=False)
        #svc.fit(XTrain, yTrain)
        #print classification_report(yVali, svc.predict(XVali))
        #scores['svc'] = svc.score(XVali, yVali)
        #print 'svc:', scores['svc']
        

# with novelty detection
#Positive Accuracy: 0.498606091895
#Negative Accuracy: 0.835002581311
#createModels(True, posToUse = 48428) # it sucks


# keep scores
scores = {}

# 48428
# try with differing ratios of positive to negative instances
for i in [1]:#, 2, 4, 7, 10]:
    print 'With Ratio:', 10
    scores[str(i)] = {}
    createModels(False, posNegRatio = 10, posToUse = 139869, scores = scores[str(i)])
    #print scores
    print 'Ratio ' + str(i) + ' done.\n\n'

With Ratio: 10
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
   7.01311452e-05   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   1.00000000e+00   0.00000000e+00]
bagging:
Ratio 1 done.




In [11]:
%%writefile el-model-create.py 
# originally model-create.py

# dataX from file is (id, isTrueEntity, popularity, context1, context2, word2vec, coherence, mentionId)

linesToUse = 10000000 # limit amount of total data

allX = []
allY = []
allMId = []

# first try with just getting all data
with open('/users/cs/amaral/wikisim/wikification/learning-data/el-10000-hybridgen.txt', 'r') as f:
    for line in f:
        if len(allX) >= linesToUse:
            break
        data = line.split(',')
        # the word2vec column (data[5]) is deliberately replaced by a constant 0
        allX.append([float(data[2]), float(data[3]), float(data[4]), 0, float(data[6])])
        allY.append(int(data[1]))
        allMId.append(long(data[7]))

# Number of rows actually loaded. Counting lines inside the loop was one too
# high whenever the linesToUse limit was hit, which could push the split
# indices below past the end of the data.
totalLines = len(allX)

# split 60,20,20 or 80,20 with bigTrain
trainLines = int(totalLines * 0.6)
valiLines = int(totalLines * 0.2)
testLines = int(totalLines * 0.2)

# end index (exclusive) of each consecutive part
trainEnd = trainLines
valiEnd = trainEnd + valiLines
testEnd = valiEnd + testLines

# NOTE(review): the cut points are plain row indices, so the rows of one
# mention id may land in two different parts -- confirm this is acceptable.
trainX = allX[:trainEnd]
trainY = allY[:trainEnd]
trainMId = allMId[:trainEnd]

# train and validation parts together
bigTrainX = allX[:valiEnd]
bigTrainY = allY[:valiEnd]
bigTrainMId = allMId[:valiEnd]

valiX = allX[trainEnd:valiEnd]
valiY = allY[trainEnd:valiEnd]
valiMId = allMId[trainEnd:valiEnd]

testX = allX[valiEnd:testEnd]
testY = allY[valiEnd:testEnd]
testMId = allMId[valiEnd:testEnd]
    
print 'about to start training'

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
#from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import sys
sys.path.append('./pyltr/')
import pyltr
"""
abc = AdaBoostClassifier(n_estimators=300)
abc.fit(bigTrainX, bigTrainY)

print 'adaboost done'

bgc = BaggingClassifier(verbose=1, n_estimators=300)
bgc.fit(bigTrainX, bigTrainY)

print 'bagging done'

etc = ExtraTreesClassifier(verbose=1, n_estimators=300, min_samples_split=5)
etc.fit(bigTrainX, bigTrainY)

print 'extra trees done'

gbc = GradientBoostingClassifier(verbose=1, n_estimators=300, min_samples_split=5)
gbc.fit(bigTrainX, bigTrainY)

print 'gradient boosting done'

rfc = RandomForestClassifier(verbose=1, n_estimators=300, min_samples_split=5)
rfc.fit(bigTrainX, bigTrainY)

print 'random forest done'

lsvc = LinearSVC(verbose=1)
lsvc.fit(bigTrainX, bigTrainY)

print 'linear svc done'

#nsvc = NuSVC(verbose=True)
#nsvc.fit(bigTrainX, bigTrainY)

#print 'nusvc done'

svc = SVC(verbose=True)
svc.fit(bigTrainX, bigTrainY)

print 'svc done'"""

monitor = pyltr.models.monitors.ValidationMonitor(
    valiX, valiY, valiMId, metric=pyltr.metrics.NDCG(k=10), stop_after=250)
lmart = pyltr.models.LambdaMART(n_estimators=300, learning_rate=0.1, verbose = 1)
lmart.fit(trainX, trainY, trainMId, monitor=monitor)

print 'lmart done'

"""
Save the model.
"""

import pickle

#pickle.dump(abc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-abc-10000-pop.pkl', 'wb'))
#pickle.dump(bgc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-bgc-10000-pop.pkl', 'wb'))
#pickle.dump(etc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-etc-10000-pop.pkl', 'wb'))
#pickle.dump(gbc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-gbc-10000-pop.pkl', 'wb'))
#pickle.dump(rfc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-rfc-10000-pop.pkl', 'wb'))
#pickle.dump(lsvc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-lsvc-10000-pop.pkl', 'wb'))
#pickle.dump(nsvc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-nsvc-10000-hyb.pkl', 'wb'))
#pickle.dump(svc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-svc-10000-pop.pkl', 'wb'))
pickle.dump(lmart, open('/users/cs/amaral/wikisim/wikification/ml-models/model-lmart-10000-hyb-no-w2v.pkl', 'wb'))

print 'models saved'

Overwriting el-model-create.py


In [20]:
"""
This cell is to get all the data for the ml model
"""

allX = []
allY = []
allMId = []

trainX = []
trainY = []
trainMId = []
valiX = []
valiY = []
valiMId = []
testX = []
testY = []
testMId = []

linesToUse = 1000000 # limit amount of total data
totalLines = 0
# first try with just getting all data
with open('/users/cs/amaral/wikisim/wikification/learning-data/el-5000.txt', 'r') as f:
    for line in f:
        totalLines += 1
        if totalLines > linesToUse:
            break
        data = line.split(',')
        allX.append([float(data[2]), float(data[3]), float(data[4]), float(data[5]), float(data[6])])
        allY.append(int(data[1]))
        allMId.append(long(data[7]))
        
# split 75, 25
trainLines = int(totalLines * 0.75)
valiLines = int(totalLines * 0.0)
testLines = int(totalLines * 0.25)

for i in range(0, trainLines):
    trainX.append(allX[i])
    trainY.append(allY[i])
    trainMId.append(allMId[i])

for i in range(trainLines, trainLines + valiLines):
    valiX.append(allX[i])
    valiY.append(allY[i])
    valiMId.append(allMId[i])
    
for i in range(trainLines + valiLines, trainLines + valiLines + testLines):
    testX.append(allX[i])
    testY.append(allY[i])
    testMId.append(allMId[i])
    
print len(trainX)
print len(testX)

 170097
56699


In [None]:
""" This cell helped by: https://github.com/ogrisel/notebooks/blob/master/Learning%20to%20Rank.ipynb
This cell is to train the model.
"""

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
import sys
sys.path.append('./pyltr/')
import pyltr

#etr = ExtraTreesRegressor(n_estimators=200, min_samples_split=5, random_state=1, n_jobs=-1)
#etr.fit(trainX, trainY)

#rfr = RandomForestRegressor(n_estimators=200, min_samples_split=5, random_state=1, n_jobs=-1)
#rfr.fit(trainX, trainY)

#gbr = GradientBoostingRegressor(n_estimators=300, max_depth=3, learning_rate=0.1, loss='ls', random_state=1)
#gbr.fit(trainX, trainY)

#gbc = GradientBoostingClassifier(n_estimators=200, min_samples_split=5, random_state=1)
#gbc.fit(trainX, trainY)

# Fit LambdaMART on the train split; the mention ids are passed as third argument.
lmart = pyltr.models.LambdaMART(
    n_estimators=300,
    learning_rate=0.1,
    verbose=1,
)
lmart.fit(trainX, trainY, trainMId)

 Iter  Train score    Remaining                           Monitor Output 
    1       0.8987       88.16m                                         
    2       0.9178       86.97m                                         
    3       0.9288       86.27m                                         
    4       0.9290       85.70m                                         
    5       0.9356       85.35m                                         
    6       0.9362       84.96m                                         
    7       0.9387       84.65m                                         
    8       0.9411       84.34m                                         
    9       0.9412       84.02m                                         
   10       0.9414       83.73m                                         
   15       0.9439       82.20m                                         
   20       0.9479       80.73m                                         
   25       0.9519       79.28m                   

In [77]:
"""
This cell tells the accuracy of the model.
"""

import numpy as np

model = etr
print 'Extra Trees Regressor:'
print 'R^2 Score:', model.score(testX, testY)
print
for i in np.arange(0.0, 1.0, 0.1):
    printEval(model, testX, testY, i)
    print
print

model = rfr
print 'Random Forest Regressor:'
print 'R^2 Score:', model.score(testX, testY)
print
for i in np.arange(0.0, 1.0, 0.1):
    printEval(model, testX, testY, i)
    print
print

model = gbr
print 'Gradient Boosting Regressor:'
print 'R^2 Score:', model.score(testX, testY)
print
for i in np.arange(0.0, 1.0, 0.1):
    printEval(model, testX, testY, i)
    print
print

model = gbc
print 'Gradient Boosting Classifier:'
print 'R^2 Score:', model.score(testX, testY)
print

Extra Trees Regressor:
R^2 Score: 0.843993888891

BDB =  0.0 
TP: 6134 0.993199481865 
FP: 8991 0.17795855353 
TN: 41532 0.82204144647 
FN: 42 0.00680051813472

BDB =  0.1 
TP: 5993 0.970369170984 
FP: 1909 0.0377847712923 
TN: 48614 0.962215228708 
FN: 183 0.0296308290155

BDB =  0.2 
TP: 5897 0.954825129534 
FP: 1204 0.0238307305584 
TN: 49319 0.976169269442 
FN: 279 0.0451748704663

BDB =  0.3 
TP: 5788 0.937176165803 
FP: 878 0.0173782237793 
TN: 49645 0.982621776221 
FN: 388 0.0628238341969

BDB =  0.4 
TP: 5679 0.919527202073 
FP: 667 0.0132019080419 
TN: 49856 0.986798091958 
FN: 497 0.0804727979275

BDB =  0.5 
TP: 5533 0.895887305699 
FP: 496 0.0098173109277 
TN: 50027 0.990182689072 
FN: 643 0.104112694301

BDB =  0.6 
TP: 5372 0.86981865285 
FP: 374 0.00740256912693 
TN: 50149 0.992597430873 
FN: 804 0.13018134715

BDB =  0.7 
TP: 5177 0.838244818653 
FP: 270 0.00534410070661 
TN: 50253 0.994655899293 
FN: 999 0.161755181347

BDB =  0.8 
TP: 4894 0.792422279793 
FP: 174 0.00

In [48]:
from __future__ import division
def printEval(model, X, y, bdb = 0.5):
    """
    Description:
        Prints the confusion counts and rates of model on (X, y), treating a
        predicted value strictly greater than bdb as a positive prediction.
    Args:
        model: Object with a predict(X) method returning one value per row.
        X: The rows to predict on.
        y: The true labels. Only labels equal to 1 or 0 are counted.
        bdb: Decision boundary applied to the predicted values.
    Return:
        Nothing. Prints TP / FP / TN / FN, each with its count and its rate.
        TP and FN rates are relative to the positives, FP and TN rates to the
        negatives. A rate is nan when its class does not occur in y.
    """

    predY = model.predict(X)

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for pred, truth in zip(predY, y):
        if pred > bdb and truth == 1:
            tp += 1
        elif pred > bdb and truth == 0:
            fp += 1
        elif pred <= bdb and truth == 1:
            fn += 1
        elif pred <= bdb and truth == 0:
            tn += 1

    def rate(count, total):
        # without any example of the class the rate is undefined, not an error
        return count / float(total) if total else float('nan')

    positives = tp + fn
    negatives = fp + tn

    # Joined with single spaces, which is exactly what the print statement
    # produced for the comma separated items used here before.
    fields = ['BDB = ', bdb,
              '\nTP:', tp, rate(tp, positives),
              '\nFP:', fp, rate(fp, negatives),
              '\nTN:', tn, rate(tn, negatives),
              '\nFN:', fn, rate(fn, positives)]
    print(' '.join(str(field) for field in fields))

In [78]:
"""
Save the model.
"""

import pickle

#pickle.dump(etr, open('/users/cs/amaral/wikisim/wikification/ml-models/model-etr-1.pkl', 'wb'))
#pickle.dump(rfr, open('/users/cs/amaral/wikisim/wikification/ml-models/model-rfr-1.pkl', 'wb'))
#pickle.dump(gbr, open('/users/cs/amaral/wikisim/wikification/ml-models/model-gbr-1.pkl', 'wb'))
#pickle.dump(gbc, open('/users/cs/amaral/wikisim/wikification/ml-models/model-gbc-1.pkl', 'wb'))
#pickle.dump(lmart, open('/users/cs/amaral/wikisim/wikification/ml-models/model-lmart-1.pkl', 'wb'))

In [37]:
import pickle
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
import sys
sys.path.append('./pyltr/')
import pyltr

# NOTE(review): unpickling executes code stored in the file, so only load
# model files that come from a trusted source.
# The with block closes the file handle once the model is loaded.
with open('/users/cs/amaral/wikisim/wikification/ml-models/model-lmart-1.pkl', 'rb') as modelFile:
    model = pickle.load(modelFile)

model.predict(testX[2:20])

array([ 5.38906408, -0.38142295,  0.14399559, -1.21259834,  6.16869182,
        1.39566366,  6.07528854,  5.27429442, -5.77873421, -3.68008578,
       -7.37840903, -6.02215263, -4.83235341, -5.99034009, -4.64082089,
       -6.54484845, -3.38575076, -5.05438867])

In [35]:
print testX[2:20]
print
print testY[2:20]

[[0.16, 0.37197801830625504, 0.75, 0.0, 0.8058462601912664], [0.12, 0.12183860865881956, 0.0, 0.0, 0.003000814788764128], [0.08, 0.15388780453451467, 0.0, 0.0, 0.0197314194384961], [0.04, 0.12183860865881956, 0.0, 0.0, 0.00597341299761156], [0.6666666666666666, 0.5524107765948493, 0.75, 0.0, 0.6424277609623221], [0.2222222222222222, 0.43160739972500395, 0.0, 0.0, 0.04122347126384451], [0.9875, 0.9595134734099676, 0.9090909090909091, 0.25918609576466367, 0.9999999999999998], [0.9585703450891164, 0.1346076964691621, 0.9090909090909091, 0.265528050509531, 0.6452690222713563], [0.010586525091644546, 0.0, 0.0, 0.13257166946600463, 0.0008528990630150002], [0.006415118189862217, 0.07584652897571567, 0.0, 0.18615200157548295, 0.021349869684028633], [0.004361016306408798, 0.0, 0.0, 0.13820647183391077, 0.0], [0.0034445708507141954, 0.0, 0.0, 0.2197684333582972, 0.001374533158081892], [0.002844141069397042, 0.06692287344969088, 0.0, 0.1855829880405545, 0.0038656172984994353], [0.0027177347996460

In [None]:
%%writefile el-data-gen.py 

"""
This file/cell is to generate training data for entity linking for a supervised model.
Each row has form: (id, isTrueEntity, popularity, context1, context2, word2vec, coherence, mentionId)
"""

from __future__ import division
import requests
import json
import os
from wikification import *
import copy
import sys
 
def normalize(nums):
    """
    Scales every value of nums by one shared divisor, sum(nums) + 1.

    Returns a new list holding each value divided by (sum(nums) + 1); the
    input list is left untouched.
    """

    divisor = 1 + sum(nums)
    return [value/divisor for value in nums]

pathStrt = '/users/cs/amaral/wsd-datasets'
dsPath = os.path.join(pathStrt,'wiki-mentions.5000.json')

maxLines = 5000 # most dataset lines to read

dataLines = []
with open(dsPath, 'r') as dataFile:
    for line in dataFile:
        # Check before appending so that at most maxLines lines are read.
        # The previous check ran after the append and let 5001 lines through.
        if len(dataLines) >= maxLines:
            break
        dataLines.append(json.loads(line.decode('utf-8').strip()))
        
cPerM = 20 # candidates per mention

allCands = []

# word2vec loading
try:
    word2vec
except:
    print 'loading word2vec'
    word2vec = gensim_loadmodel('/users/cs/amaral/cgmdir/WikipediaClean5Negative300Skip10.Ehsan/WikipediaClean5Negative300Skip10')

print 'word2vec loaded'
    
# number of dataset lines processed so far (only used for the progress print)
f = 0

# running mention id over the whole dataset; stored as last column of each row
mNum = 0
# see each line
for line in dataLines:
    
    oMentions = copy.deepcopy(line['mentions']) # mentions in original form
    oText = " ".join(copy.deepcopy(line['text']))
    
    line['mentions'] = mentionStartsAndEnds(line)
    # get what should be all candidates
    candidates = generateCandidates(line, 999, True)
    
    # Replace each mention's candidate list by at most cPerM rows of the form
    # [candidate id, isTrueEntity, popularity], keeping the original order.
    # NOTE(review): a true entity that is not among the first cPerM candidates
    # is cut off, leaving that mention without a positive row.
    i = 0
    for i in range(0, len(candidates)):
        entId = title2id(oMentions[i][1]) # id of the true entity of mention i
        j = 0 # rows kept so far for this mention
        candsRepl = []
        for cand in candidates[i]:
            if j >= cPerM:
                break
            
            if cand[0] == entId:
                candsRepl.append([entId, 1, cand[1]]) # put in correct cand id and popularity
                j += 1
            elif j < cPerM:
                candsRepl.append([cand[0], 0, cand[1]]) # put false cand in
                j += 1
        candidates[i] = candsRepl
    
    i = 0 # index of mention
    
    hasCoherence = False # whether coherence scores for this line were obtained
    
    # see each mention
    for mention in oMentions:
    
        entId = title2id(mention[1]) # id of the true entity
                
        # same list objects as in candidates, so the appends below also
        # change what candidates holds
        candList = candidates[i]
        
        # normalize popularity scores (column 2) in place
        cScrs = []
        for cand in candList:
            cScrs.append(cand[2])
        cScrs = normalize(cScrs)
        j = 0
        for cand in candList:
            cand[2] = cScrs[j]
            j += 1
          
        # get score from context1 method
        context = getMentionsInSentence(line, line['mentions'][i]) # get context for some w methods
        cScrs = getContext1Scores(line['text'][mention[0]], context, candList)
        cScrs = normalize(cScrs)
        # apply score to candList
        for j in range(0, len(candList)):
            candList[j].append(cScrs[j])
            
        # get score from context2 method
        # (the context is fetched again with the same arguments as above)
        context = getMentionsInSentence(line, line['mentions'][i]) # get context for some w methods
        cScrs = getContext2Scores(line['text'][mention[0]], context, candList)
        cScrs = normalize(cScrs)
        # apply score to candList
        for j in range(0, len(candList)):
            candList[j].append(cScrs[j])
        
        # get score from word2vec (not normalized, unlike the scores above)
        context = getMentionSentence(oText, line['mentions'][i], asList = True)
        cScrs = getWord2VecScores(context, candList)
        #cScrs = normalize(cScrs)
        # apply score to candList
        for j in range(0, len(candList)):
            candList[j].append(cScrs[j])

        # get score from coherence, computed once per line for all mentions
        # NOTE(review): the driver runs while handling the first mention, whose
        # rows already carry the extra score columns while the rows of the
        # later mentions do not yet. Whether the driver depends on the row
        # width cannot be told from here -- confirm.
        if hasCoherence == False:
            cohScores = coherence_scores_driver(candidates, 5, method='rvspagerank', direction=DIR_BOTH, op_method="keydisamb")
            hasCoherence = True
        for j in range(0, len(candList)):
            candList[j].append(cohScores[i][j])
            
        # put the mention id
        for j in range(len(candList)):
            candList[j].append(mNum)
            
        allCands.append(candList)
        
        mNum += 1
        
        i += 1
    f += 1
    print 'Line: ' + str(f)

# One output line per candidate row: str(list) with the brackets stripped.
with open('/users/cs/amaral/wikisim/wikification/learning-data/el-5000-hybridgen.txt', 'w') as outFile:
    for candRows in allCands:
        outFile.writelines(str(row)[1:-1] + '\n' for row in candRows)