In [22]:
import nltk
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import reuters
import re, string

In [23]:
import re, string

punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))

def strip_punc(corpus):
    """ Removes all punctuation from a string.

        Parameters
        ----------
        corpus : str

        Returns
        -------
        str
            the corpus with all punctuation removed"""
    # substitute all punctuation marks with ""
    return punc_regex.sub('', corpus)

In [24]:
def to_counter(doc):
    """ 
    Produce word-count of document, removing all punctuation
    and making all the characters lower-cased.
    
    Parameters
    ----------
    doc : str
    
    Returns
    -------
    collections.Counter
        lower-cased word -> count"""
    doc = sorted(strip_punc(doc).lower().split())
    return Counter(doc)

In [25]:
def to_vocab(counters, k=None, stop_words=None):
    """ 
    [word, word, ...] -> sorted list of top-k unique words
    Excludes words included in `stop_words`
    
    Parameters
    ----------
    counters : Iterable[Iterable[str]]
    
    k : Optional[int]
        If specified, only the top-k words are returned
    
    stop_words : Optional[Collection[str]]
        A collection of words to be ignored when populating the vocabulary
    """
    unique = Counter()
    for c in counters:
        unique.update(c)
    if stop_words is not None:
        for word in stop_words:
            del unique[word]
    if k is not None:
        unique = set(unique.most_common(k))
        return sorted(list(unique))
    return sorted(list(unique))

In [26]:
def to_tf(counter, vocab):
    """
    Parameters
    ----------
    counter : collections.Counter
        The word -> count mapping for a document.
    vocab : Sequence[str]
        Ordered list of words that we care about.
    
    Returns
    -------
    numpy.ndarray
        The TF descriptor for the document, whose components represent
        the frequency with which each term in the vocab occurs
        in the given document."""
    total = 0.0
    for key in counter:
        if key in vocab:
            total += counter[key]
    tf = []
    for word in vocab:
        if counter[word] is None:
            tf.append(0)
        else:
            tf.append(1.0 * counter[word] / total)
    return np.array(tf)

In [27]:
def to_idf(vocab, counters):
    """ 
    Given the vocabulary, and the word-counts for each document, computes
    the inverse document frequency (IDF) for each term in the vocabulary.
    
    Parameters
    ----------
    vocab : Sequence[str]
        Ordered list of words that we care about.

    counters : Iterable[collections.Counter]
        The word -> count mapping for each document.
    
    Returns
    -------
    numpy.ndarray
        An array whose entries correspond to those in `vocab`, storing
        the IDF for each term `t`: 
                           log10(N / nt)
        Where `N` is the number of documents, and `nt` is the number of 
        documents in which the term `t` occurs.
    """
    N = 1.0 * len(counters)
    idf = []
    for i in range(len(vocab)):
        word = vocab[i]
        docs = 0.0
        for countmap in counters:
            if word in countmap:
                docs += 1.0
        idf.append(N/docs)
    return np.array(np.log10(idf))

In [50]:
doc_1 = '** BEGIN ARTICLE: ** \"' + reuters.raw(reuters.fileids()[0])[:500] + ' [...]\"'
doc_2 = '** BEGIN ARTICLE: ** \"' + reuters.raw(reuters.fileids()[1])[:500] + ' [...]\"'
doc_3 = '** BEGIN ARTICLE: ** \"' + reuters.raw(reuters.fileids()[2])[:500] + ' [...]\"'
doc_4 = '** BEGIN ARTICLE: ** \"' + reuters.raw(reuters.fileids()[3])[:500] + ' [...]\"'
doc_5 = '** BEGIN ARTICLE: ** \"' + reuters.raw(reuters.fileids()[4])[:500] + ' [...]\"'
print(doc_4)
print(doc_2)

** BEGIN ARTICLE: ** "THAI TRADE DEFICIT WIDENS IN FIRST QUARTER
  Thailand's trade deficit widened to 4.5
  billion baht in the first quarter of 1987 from 2.1 billion a
  year ago, the Business Economics Department said.
      It said Janunary/March imports rose to 65.1 billion baht
  from 58.7 billion. Thailand's improved business climate this
  year resulted in a 27 pct increase in imports of raw materials
  and semi-finished products.
      The country's oil import bill, however, fell 23 pct in the
  first quarte [...]"
** BEGIN ARTICLE: ** "CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of China's grain
  stocks, the China Daily said.
      It also said that each year 1.575 mln tonnes, or 25 pct, of
  China's fruit output are left to rot, and 2.1 mln tonnes, or up
  to 30 pct, of its vegetables. The paper blamed the waste on
  inadequate storage and bad preservation methods.
      It sa

In [62]:
doc_1 = """We present a database of well-determined orbital parameters of exoplanets, and their host stars’
properties. This database comprises spectroscopic orbital elements measured for 427 planets orbiting 363 stars from
radial velocity and transit measurements as reported in the literature. We have also compiled fundamental transit
parameters, stellar parameters, and the method used for the planets discovery. This Exoplanet Orbit Database includes all planets with robust, well measured orbital parameters reported in peer-reviewed articles. The database is
available in a searchable, filterable, and sortable form online through the Exoplanets Data Explorer table, and the
data can be plotted and explored through the Exoplanet Data Explorer plotter. We use the Data Explorer to generate
publication-ready plots, giving three examples of the signatures of exoplanet migration and dynamical evolution:
We illustrate the character of the apparent correlation between mass and period in exoplanet orbits, the different
selection biases between radial velocity and transit surveys, and that the multiplanet systems show a distinct
semimajor-axis distribution from apparently singleton systems."""

doc_2 = """For the Exoplanet Orbit Database, we have dropped the 
200 pc limit from the old catalog and now include all robustly
detected planets appearing in the peer-reviewed literature with
well-determined orbital parameters. We have retained the generous upper mass limit of 24 Jupiter masses in our definition of a
“planet,” for the same reasons as in the catalog: at the moment,
any mass limit is arbitrary and will serve little practical function,
both because of the sin i ambiguity in radial velocity masses and
because of the lack of physical motivation.13 We therefore err on
the side of inclusiveness by admitting the long high-mass tail of
the exoplanet population at the risk of having a few bona fide
brown dwarfs in the sample."""

doc_3 = """We have opted to use these classical SB1 orbital parameters,
rather than using mean longitude at epoch, because they are
more frequently reported in the literature and the latter is
trivially computed from the former. In those cases (especially
for multiplanet systems or transiting systems) where the phase
of a planet is reported as the mean anomaly at epoch, or epoch
of transit center, or in some similar way, we have converted the
quantities to ω and T0 for consistency. We recognize that for
circular orbits the uncertainty in mean longitude is better behaved than those in T0 and ω, and we note that the uncertainty
in mean longitude can be estimated from the period uncertainty
and the span of the observations. We plan to incorporate mean
longitude at epoch, transit time predictions, and robust uncertainties for these quantities in the future, but in the meantime,
any application requiring more precision should calculate the
quantity explicitly from the radial velocities or from the source
article."""

doc_4 = """The EOD can be explored and displayed using the Exoplanet
Data Explorer table and plotter.
The Table Explorer allows for the user to dynamically create
a sorted table of planets and selected properties, including a
choice of units and parameter uncertainties. Once a table has
been generated, it may be exported as a custom text file. References are linked to their corresponding URLs; we provide columns for links to SIMBAD, NStED, and Exoplanet Transit
Database; and planets are linked to “one-up” planet pages that
contain all fields and values for a given set of planets. Both
pages as illustrated in Figure 1.
These one-up pages include a link to the publicly available
velocities of each star, stored at NStED, and a plot showing
these published velocities as a function of time or phase (as appropriate), along with a velocity curve generated from the listed
orbital solution. Note that we have not attempted to fit the
velocities and generate our own solution; we solve only for
the velocity offset γ and simply overplot the solution and data.
This serves as a check on the accuracy of our transcription of
orbital elements"""

In [101]:
docs = [doc_1, doc_2, doc_3, doc_4]
counter = [to_counter(i) for i in docs]

stops = nltk.corpus.stopwords.words('english')
vocab = to_vocab(counter, stop_words = stops)
print(vocab)
tf = tuple(to_tf(to_counter(i), vocab) for i in docs)

idf = to_idf(vocab, counter)

tf_idf = tf * idf
print(tf_idf.shape)
cos_sim = cosine_similarity(tf_idf)

['1', '200', '24', '363', '427', 'accuracy', 'admitting', 'allows', 'along', 'also', 'ambiguity', 'anomaly', 'apparent', 'apparently', 'appearing', 'application', 'appropriate', 'arbitrary', 'article', 'articles', 'attempted', 'available', 'behaved', 'better', 'biases', 'bona', 'brown', 'calculate', 'cases', 'catalog', 'center', 'character', 'check', 'choice', 'circular', 'classical', 'columns', 'compiled', 'comprises', 'computed', 'consistency', 'contain', 'converted', 'correlation', 'corresponding', 'create', 'curve', 'custom', 'data', 'database', 'definition', 'detected', 'different', 'discovery', 'displayed', 'distinct', 'distribution', 'dropped', 'dwarfs', 'dynamical', 'dynamically', 'elements', 'eod', 'epoch', 'err', 'especially', 'estimated', 'evolution', 'examples', 'exoplanet', 'exoplanets', 'explicitly', 'explored', 'explorer', 'exported', 'fide', 'fields', 'figure', 'file', 'filterable', 'fit', 'form', 'former', 'frequently', 'function', 'fundamental', 'future', 'generate', 

In [90]:
doc_1_split = " ".join(doc_1.split("\n")).split(". ")
doc_2_split = " ".join(doc_2.split("\n")).split(". ")
doc_3_split = " ".join(doc_3.split("\n")).split(". ")
doc_4_split = " ".join(doc_4.split("\n")).split(". ")
split_docs = [doc_1_split, doc_2_split, doc_3_split, doc_4_split]

In [100]:
print(doc_1_split)

['We present a database of well-determined orbital parameters of exoplanets, and their host stars’ properties', 'This database comprises spectroscopic orbital elements measured for 427 planets orbiting 363 stars from radial velocity and transit measurements as reported in the literature', 'We have also compiled fundamental transit parameters, stellar parameters, and the method used for the planets discovery', 'This Exoplanet Orbit Database includes all planets with robust, well measured orbital parameters reported in peer-reviewed articles', 'The database is available in a searchable, filterable, and sortable form online through the Exoplanets Data Explorer table, and the data can be plotted and explored through the Exoplanet Data Explorer plotter', 'We use the Data Explorer to generate publication-ready plots, giving three examples of the signatures of exoplanet migration and dynamical evolution: We illustrate the character of the apparent correlation between mass and period in exopla

In [118]:
for sentence in doc_1_split:
    sentence = sentence.split(" ")
    for word in sentence:
        print(word)

We
present
a
database
of
well-determined
orbital
parameters
of
exoplanets,
and
their
host
stars’
properties
This
database
comprises
spectroscopic
orbital
elements
measured
for
427
planets
orbiting
363
stars
from
radial
velocity
and
transit
measurements
as
reported
in
the
literature
We
have
also
compiled
fundamental
transit
parameters,
stellar
parameters,
and
the
method
used
for
the
planets
discovery
This
Exoplanet
Orbit
Database
includes
all
planets
with
robust,
well
measured
orbital
parameters
reported
in
peer-reviewed
articles
The
database
is
available
in
a
searchable,
filterable,
and
sortable
form
online
through
the
Exoplanets
Data
Explorer
table,
and
the
data
can
be
plotted
and
explored
through
the
Exoplanet
Data
Explorer
plotter
We
use
the
Data
Explorer
to
generate
publication-ready
plots,
giving
three
examples
of
the
signatures
of
exoplanet
migration
and
dynamical
evolution:
We
illustrate
the
character
of
the
apparent
correlation
between
mass
and
period
in
exoplanet
orbits,
the
d

In [120]:
for doc in split_docs:
    doc_stats = []
    for sentence in doc:
        sentence = sentence.split(" ")
        sentence_tfidf = 0
        for word in sentence:
            if word in vocab:
                word_stat = tf_idf[split_docs.index(doc),vocab.index(word)]
                sentence_tfidf += word_stat
        doc_stats.append(sentence_tfidf)
    print(doc_stats)
    print(doc[doc_stats.index(min(doc_stats))])
    print()

[0.02930867794824781, 0.07731802790630285, 0.04683068263156166, 0.04232953871911201, 0.04731325359804639, 0.1397959515346763]
We present a database of well-determined orbital parameters of exoplanets, and their host stars’ properties

[0.1254058111337211, 0.38063642235275746]
For the Exoplanet Orbit Database, we have dropped the  200 pc limit from the old catalog and now include all robustly detected planets appearing in the peer-reviewed literature with well-determined orbital parameters

[0.1410253484370681, 0.16824667412311894, 0.2606479230749105, 0.16609921974439107]
We have opted to use these classical SB1 orbital parameters, rather than using mean longitude at epoch, because they are more frequently reported in the literature and the latter is trivially computed from the former

[0.02605067270169068, 0.07596698174446567, 0.040523268647074395, 0.10272713727462676, 0.028945191890767424, 0.13265603152015149, 0.07766016692769338, 0.02605067270169068]
The EOD can be explored and displ

In [104]:
import datetime, re, sys
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

token_dict = {}
for article in reuters.fileids():
    token_dict[article] = reuters.raw(article)
        
tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words='english', decode_error='ignore')
print ('building term-document matrix... [process started: ' + str(datetime.datetime.now()) + ']')
sys.stdout.flush()

tdm = tfidf.fit_transform(token_dict.values()) # this can take some time (about 60 seconds on my machine)
print ('done! [process finished: ' + str(datetime.datetime.now()) + ']')

building term-document matrix... [process started: 2019-07-31 15:10:54.120645]


  'stop_words.' % sorted(inconsistent))


done! [process finished: 2019-07-31 15:12:01.222208]


In [107]:
from random import randint

feature_names = tfidf.get_feature_names()
print ('TDM contains ' + str(len(feature_names)) + ' terms and ' + str(tdm.shape[0]) + ' documents')

print ('first term: ' + feature_names[0])
print ('last term: ' + feature_names[len(feature_names) - 1])

for i in range(0, 4):
    print ('random term: ' + feature_names[randint(1,len(feature_names) - 2)])

TDM contains 25827 terms and 10788 documents
first term: 'd
last term: zzzz
random term: carlson
random term: marriag
random term: caspian
random term: dual


In [112]:
from __future__ import division
import math


article_id = randint(0, tdm.shape[0] - 1)
article_text = reuters.raw(reuters.fileids()[article_id])

sent_scores = []
for sentence in nltk.sent_tokenize(article_text):
    score = 0
    sent_tokens = tokenize_and_stem(sentence)
    for token in (t for t in sent_tokens if t in feature_names):
        score += tdm[article_id, feature_names.index(token)]
    sent_scores.append((score / len(sent_tokens), sentence))

summary_length = int(math.ceil(len(sent_scores) / 5))
sent_scores.sort(key=lambda sent: sent[0], reverse=True)

print ('*** SUMMARY ***')
for summary_sentence in sent_scores[:summary_length]:
    print (summary_sentence[1])

print ('\n*** ORIGINAL ***')
print (article_text)

*** SUMMARY ***
AUSTRALIA SAID TO RELY TOO MUCH ON OIL TAXES
  The government's
  over-reliance on revenue from crude oil is adversely affecting
  Australia's economic performance, Australian Petroleum
  Exploration Association (APEA) chairman Dennis Benbow said.
Domestic oil output from existing fields is expected to
  fall to 280,000 barrels per day (bpd) in fiscal 1992/93 from
  546,000 bpd in 1985/86, reflecting mainly the decline of the
  Bass Strait fields, he said.
Bass Strait reserves are now two-thirds depleted, with the
  three largest fields 80 pct depleted, he said.

*** ORIGINAL ***
AUSTRALIA SAID TO RELY TOO MUCH ON OIL TAXES
  The government's
  over-reliance on revenue from crude oil is adversely affecting
  Australia's economic performance, Australian Petroleum
  Exploration Association (APEA) chairman Dennis Benbow said.
      Over one-third of Australia's indirect tax income is
  derived from oil at a time of falling domestic output and weak
  crude prices, he told t

In [40]:
import numpy as np
arr = np.array([.1, .6, .3, .2, .9, .74, .7])
sort_ind = np.argsort(arr)
print(sort_ind)
sort_arr = np.array(sorted(arr))
print(sort_arr)
std = np.std(np.array(arr))
print(std)
diff = sort_arr[1:] - sort_arr[:-1]
print(diff)
where = np.where(diff>std)
print(where)
if len(where[0]) != 0:
    ind = sort_ind[where[0][0] + 1]
    print(ind)
    indices = np.where(arr >= arr[ind])
print(indices[0])

[0 3 2 1 6 5 4]
[0.1  0.2  0.3  0.6  0.7  0.74 0.9 ]
0.28217811714574603
[0.1  0.1  0.3  0.1  0.04 0.16]
(array([2], dtype=int64),)
1
[1 4 5 6]


In [41]:
a = [1,2,3,4,5,6]
print(a[:-1])

[1, 2, 3, 4, 5]
