In [2]:
import findspark
findspark.init()
import pyspark
# Reuse the already-running SparkContext when this cell is executed again:
# only one context may be active per process, so calling the SparkContext()
# constructor a second time in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate()
# SQL context created on top of the Spark context above.
sqlContext = pyspark.sql.SQLContext(sc)

In [3]:
import os

In [4]:
from sift.corpora import wikipedia, wikidata
from sift.models import text, links
# Input locations. Each can be overridden through an environment variable so
# the notebook can run on another machine without editing this cell; the
# defaults are the values that were previously hard-coded here.
wikipedia_base_path = os.environ.get('WIKIPEDIA_BASE_PATH', 'data')
wikidata_base_path = os.environ.get(
    'WIKIDATA_BASE_PATH',
    '/n/schwa11/data0/linking/wikidata/dumps/20150713'
)

In [5]:
# Build the Wikipedia corpus from `wikipedia_base_path` on the Spark context.
wikipedia_corpus = wikipedia.WikipediaCorpus()(
    sc,
    wikipedia_base_path
)
# Turn the corpus into article records; cached because `docs` is reused by
# several of the cells that follow.
docs = (
    wikipedia.WikipediaArticles()(wikipedia_corpus)
    .cache()
)

In [6]:
# Peek at one article record. The recorded output shows an `_id` plus a list
# of `links`, each with `start`/`stop` offsets and a `target`.
docs.take(1)

[{'_id': 'en.wikipedia.org/wiki/Anarchism',
  'links': [{'start': 0,
    'stop': 9,
    'target': u'en.wikipedia.org/wiki/Anarchism'},
   {'start': 15,
    'stop': 35,
    'target': u'en.wikipedia.org/wiki/Political_philosophy'},
   {'start': 51,
    'stop': 64,
    'target': u'en.wikipedia.org/wiki/Self-governance'},
   {'start': 137,
    'stop': 156,
    'target': u'en.wikipedia.org/wiki/Stateless_society'},
   {'start': 248, 'stop': 260, 'target': u'en.wikipedia.org/wiki/Hierarchy'},
   {'start': 264,
    'stop': 281,
    'target': u'en.wikipedia.org/wiki/Free_association_(communism_and_anarchism)'},
   {'start': 303,
    'stop': 308,
    'target': u'en.wikipedia.org/wiki/State_(polity)'},
   {'start': 360,
    'stop': 383,
    'target': u'en.wikipedia.org/wiki/Anti-statism'},
   {'start': 548,
    'stop': 556,
    'target': u'en.wikipedia.org/wiki/Far-left_politics'},
   {'start': 578,
    'stop': 597,
    'target': u'en.wikipedia.org/wiki/Anarchist_economics'},
   {'start': 602,
 

In [7]:
# Prefix carried by the link targets seen in the docs.take(1) output; passed
# as `filter_target` to the count models built below.
wikipedia_pfx = 'en.wikipedia.org/wiki/'

In [8]:
# Entity count model built from `docs` with min_count=5 and
# filter_target=wikipedia_pfx, then formatted item by item for storage.
ec_model = (
    links.EntityCounts(min_count=5, filter_target=wikipedia_pfx)
    .build(docs)
    .map(links.EntityCounts.format_item)
)

In [9]:
# Entity-name count model built from `docs` with lowercase=True and
# filter_target=wikipedia_pfx.
enc_model = (
    links.EntityNameCounts(lowercase=True, filter_target=wikipedia_pfx)
    .build(docs)
    # Keep only names whose counts sum to more than one. Each item is a
    # (name, counts) pair; it is indexed instead of being unpacked in the
    # lambda signature because tuple parameters are Python 2-only (PEP 3113),
    # and `.values()` replaces the Python 2-only `itervalues()`.
    .filter(lambda name_counts: sum(name_counts[1].values()) > 1)
    .map(links.EntityNameCounts.format_item)
)

In [10]:
# Inspect one formatted entity-count record; the recorded output shows the
# fields `_id` and `count`.
ec_model.take(1)

[{'_id': u'en.wikipedia.org/wiki/Australian_federal_election,_1977',
  'count': 15}]

In [11]:
# Inspect five formatted name-count records; the recorded output shows a
# lowercased name as `_id`, a `counts` dict keyed by target, and a `total`.
enc_model.take(5)

[{'_id': u'biennials',
  'counts': {u'en.wikipedia.org/wiki/Biennial_plant': 3},
  'total': 3},
 {'_id': u'seven-day weeks',
  'counts': {u'en.wikipedia.org/wiki/Week': 2},
  'total': 2},
 {'_id': u'ernst gr\xfcnfeld',
  'counts': {u'en.wikipedia.org/wiki/Ernst_Gr\xfcnfeld': 2},
  'total': 2},
 {'_id': u'down beat jazz hall of fame',
  'counts': {u'en.wikipedia.org/wiki/Down_Beat': 1,
   u'en.wikipedia.org/wiki/Down_Beat_Jazz_Hall_of_Fame': 1},
  'total': 2},
 {'_id': u'vani',
  'counts': {u'en.wikipedia.org/wiki/Vani': 1,
   u'en.wikipedia.org/wiki/Vani_Municipality': 1},
  'total': 2}]

In [12]:
#start
from nel.model import data
from nel.model.store import file

In [28]:
# Point nel at a file-based data store (presumably read by the
# data.ObjectStore.Get(...) calls that follow — confirm).
# NOTE(review): the recorded output of those calls logs
# "Unsupported data store proto (/data)", which does not match this file://
# URI, and the execution counts here (28-31) are out of order with the cells
# after them (19-27). Rerun from a fresh kernel to confirm which value was
# actually in effect.
os.environ['NEL_DATASTORE_URI'] = 'file:///data0/nel/'

In [29]:
# Note: if a model is too large to collect() into driver memory, iterate over it with model.toLocalIterator() instead.

In [30]:
# Save the entity-count model under the 'models:ecounts[wikipedia]' store id.
ecounts_store = data.ObjectStore.Get('models:ecounts[wikipedia]')
# collect() brings the whole model to the driver before it is written.
ecounts_store.save_many(ec_model.collect())

2018-02-14 10:54:59,057|ERROR|data|Unsupported data store proto (/data), choose from (mongodb,redis,file)


NotImplementedError: 

In [31]:
# Save the name-count model under the 'models:necounts[wikipedia]' store id.
necounts_store = data.ObjectStore.Get('models:necounts[wikipedia]')
# collect() brings the whole model to the driver before it is written.
necounts_store.save_many(enc_model.collect())

2018-02-14 10:55:40,093|ERROR|data|Unsupported data store proto (/data), choose from (mongodb,redis,file)


NotImplementedError: 

In [19]:
from nel.doc import Doc

In [20]:
from nel.harness.format import from_sift

In [21]:
from nel.process.pipeline import Pipeline
from nel.process.candidates import NameCounts
from nel.features.probability import EntityProbability, NameProbability

In [22]:
# Candidate generator for the 'wikipedia' model; the log emitted when this
# cell ran reports the second argument as limit=10.
candidate_generation = [NameCounts('wikipedia', 10)]

# Feature extractors for the 'wikipedia' model; their ids are what the ranker
# trainer is later given as its feature list.
feature_extraction = [EntityProbability('wikipedia'), NameProbability('wikipedia')]

2018-02-13 11:15:33,333|INFO|candidates|Preparing name model candidate generator (model=wikipedia, limit=10)...
2018-02-13 11:15:33,335|DEBUG|data|Using file object store for (models:necounts[wikipedia])...
2018-02-13 11:15:33,339|DEBUG|data|Using file object store for (models:ecounts[wikipedia])...
2018-02-13 11:15:33,343|DEBUG|data|Using file object store for (models:necounts[wikipedia])...


In [23]:
# Training only needs candidates and their features, in that order.
training_steps = candidate_generation + feature_extraction
training_pipeline = Pipeline(training_steps)

In [24]:
# Draw 100 articles without replacement and convert them to nel documents.
# The seed is fixed so the same training set is drawn on every run; without
# it each execution trains on a different random sample.
training_docs = [from_sift(doc) for doc in docs.takeSample(False, 100, seed=42)]

In [25]:
# Run every sampled document through the training pipeline, keeping order.
train = list(map(training_pipeline, training_docs))

In [26]:
from nel.learn import ranking
from nel.features import meta
from nel.model import resolution
from nel.process import resolve

In [27]:
# The ranker is trained on the features produced by the extractors above,
# identified by their ids.
feature_ids = [extractor.id for extractor in feature_extraction]
train_ranker = ranking.TrainLinearRanker(name='ranker', features=feature_ids)
# NOTE(review): the recorded run of this cell logged "Fitting model over 0
# instances" and then raised ValueError, so the processed training documents
# yielded no training instances — investigate before relying on `ranker`.
ranker = train_ranker(train)

2018-02-13 11:15:50,064|INFO|train|Computing feature statistics over 100 documents...
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
2018-02-13 11:15:50,225|INFO|train|Building training set, feature mapping = PolynomialMapper...
2018-02-13 11:15:50,283|INFO|train|Fitting model over 0 instances...


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# Score feature backed by the trained ranker.
classifier_feature = meta.ClassifierScore(ranker)
# Resolver keyed on that score feature's id.
rank_resolver = resolve.FeatureRankResolver(classifier_feature.id)
# Linking stage: compute the score first, then resolve using it.
linking = [classifier_feature, rank_resolver]

In [None]:
# Candidates, then features, then the linking stage.
linking_steps = candidate_generation + feature_extraction + linking
linking_pipeline = Pipeline(linking_steps)

In [None]:
# Draw 10 articles without replacement and convert them to nel documents.
# The seed is fixed so the same documents are linked and displayed on every
# run; without it the outputs below change with each execution.
sample = [from_sift(doc) for doc in docs.takeSample(False, 10, seed=7)]

In [None]:
def _clear_links(documents):
    """Reset `resolution` to None on every chain and mention of each document."""
    for document in documents:
        for chain in document.chains:
            chain.resolution = None
            for mention in chain.mentions:
                mention.resolution = None


# clear existing links
_clear_links(sample)

In [None]:
# Apply the linking pipeline to each sampled document, keeping order.
linked_sample = list(map(linking_pipeline, sample))

In [None]:
# List the ids of the linked documents.
[linked_doc.id for linked_doc in linked_sample]

In [None]:
# Show the id of the resolution on the first chain of the first document.
# NOTE(review): this reads from `sample`, not `linked_sample`, so it reflects
# the linker's output only if the pipeline annotates documents in place —
# confirm. It also raises if the document has no chains or the chain was left
# unresolved (resolution is None).
sample[0].chains[0].resolution.id

In [None]:
from nel.harness.format import inject_markdown_links
from IPython.display import display, Markdown

In [None]:
# Render the first linked document as Markdown with its links injected.
# Link targets in this corpus carry no URL scheme ('en.wikipedia.org/wiki/...'),
# so 'https://' is passed as the third argument, as the full-pipeline rendering
# later in this notebook does; without it the links would be relative.
# NOTE(review): assumes the third argument is prepended to each link target —
# confirm against inject_markdown_links.
first_linked = linked_sample[0]
display(Markdown(inject_markdown_links(first_linked.text, first_linked, 'https://')))

In [None]:
from nel.process import tag, coref

# Mention detection stage: tagger first, then span-overlap coreference.
mention_detection = [tag.SpacyTagger(), coref.SpanOverlap()]

In [None]:
# End-to-end pipeline: mention detection, candidates, features, linking.
full_steps = mention_detection + candidate_generation + feature_extraction + linking
full_pipeline = Pipeline(full_steps)

In [None]:
# Run the end-to-end pipeline over the sampled documents, keeping order.
# Note that this rebinds `linked_sample`.
linked_sample = list(map(full_pipeline, sample))

In [None]:
# Render the first document from the end-to-end run as Markdown, passing
# 'https://' as the third argument to the link injector.
first_linked = linked_sample[0]
rendered = inject_markdown_links(first_linked.text, first_linked, 'https://')
display(Markdown(rendered))