In [1]:
cd ..

/Users/nguyen/projects/reinforce-ml


In [6]:
from src.scorer import Scorer
import numpy as np

In [7]:
model = Scorer()

In [8]:
definition = 'In statistics, overfitting is "the production of an analysis that corresponds too closely ' \
             'or exactly to a particular set of data, and may therefore fail to fit additional data or predict future observations reliably"'

In [9]:
def_nlp = model.nlp(definition)

In [10]:
def_nlp

In statistics, overfitting is "the production of an analysis that corresponds too closely or exactly to a particular set of data, and may therefore fail to fit additional data or predict future observations reliably"

In [11]:
user_subm = 'When the model memorizes the training data and performs badly with the test data'

In [12]:
uq_nlp = model.nlp(user_subm)

In [13]:
model.score(definition, user_subm)

root score: 0.38894438104970114
sentence similarity score: 0.8831680417060852
bonus score: 0.5019652446111044
total calculated score: 0.8454684919553543


0.84546849195535434

In [15]:
# NOTE(review): presumably this unpickles the file into `model.raw_data` (used in the
# next cell) -- confirm against Scorer.load_pickle. Only load pickles from a trusted source.
model.load_pickle('data/180530_def.pkl')

In [21]:
data = model.raw_data.to_json(orient='index')

In [25]:
type(data)

str

In [26]:
import json
json1_data = json.loads(data)

In [38]:
for row, val in json1_data.items():
    print(val)

{'definition': 'Machine learning is a field of computer science that often uses statistical techniques to give computers the ability to "learn" (i.e., progressively improve performance on a specific task) with data, without being explicitly programmed.[1]', 'href': '/wiki/Machine_learning', 'section': 'Introduction and Main Principles', 'title': 'Machine learning'}
{'definition': 'Numerical analysis\xa0· Simulation', 'href': '/wiki/Data_analysis', 'section': 'Introduction and Main Principles', 'title': 'Data analysis'}
{'definition': 'Occam\'s razor (also Ockham\'s razor or Ocham\'s razor; Latin: lex parsimoniae "law of parsimony") is the problem-solving principle that, when presented with competing hypothetical answers to a problem, one should select the answer that makes the fewest assumptions. The idea is attributed to William of Ockham (c. 1287–1347), who was an English Franciscan friar, scholastic philosopher, and theologian.', 'href': '/wiki/Occam%27s_razor', 'section': 'Introduc

In [18]:
def cos_sim(query_vec, def_vec):
    """Get the cosine similarity between two vectors.

    Computed as sum(q * d) / (||q|| * ||d||).

    Arguments:
        query_vec (array-like): first vector
        def_vec (array-like): second vector, same shape as query_vec

    Returns:
        similarity (float): cosine similarity of the two vectors, or 0.0
            when either vector has zero norm
    """
    query_vec = np.asarray(query_vec)
    def_vec = np.asarray(def_vec)

    norm_product = (np.sqrt(np.sum(query_vec ** 2))
                    * np.sqrt(np.sum(def_vec ** 2)))

    # A zero vector has no direction; dividing here would yield nan and a
    # RuntimeWarning, so report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return np.sum(query_vec * def_vec) / norm_product

In [22]:
# `dog` and `cat` are not defined in any surviving cell, so this cell fails on a
# fresh kernel. Presumably they were single-word docs -- TODO confirm.
dog = model.nlp('dog')
cat = model.nlp('cat')
cos_sim(dog.vector, cat.vector)

0.80168551

In [23]:
cos_sim(def_nlp.vector, uq_nlp.vector)

0.88316804

In [24]:
bhaskar = "Overfitting is where the model fits to each and every point in the feature set."

In [26]:
b_nlp = model.nlp(bhaskar)

In [28]:
cos_sim(def_nlp.vector, b_nlp.vector)

0.90095067

In [29]:
emily = "Overfitting is when your predictive model is trying to be too close to too many points and so it loses its predictive power"
e_nlp = model.nlp(emily)

In [30]:
cos_sim(def_nlp.vector, e_nlp.vector)

0.9176628

In [40]:
random = "Costco sells a hot dog and a soda for $1.50"
r_nlp = model.nlp(random)

In [41]:
cos_sim(def_nlp.vector, r_nlp.vector)

0.60963613

In [106]:
hack = 'statistics overfitting set data observations'
hack_nlp = model.nlp(hack)

In [107]:
cos_sim(def_nlp.vector, hack_nlp.vector)

0.66674113

In [109]:
# Materialize the chunks: `noun_chunks` is a one-shot generator, so keeping a bare
# reference would leave later cells with nothing after the first one iterates it.
nc = list(def_nlp.noun_chunks)

In [51]:
list(nc)

[statistics,
 overfitting,
 the production,
 an analysis,
 a particular set,
 data,
 additional data,
 future observations]

In [65]:
nc_root = [chunk.root.lemma_ for chunk in list(nc)]

In [66]:
nc_root = set(nc_root)

In [67]:
nc_root

{'analysis',
 'datum',
 'observation',
 'overfitting',
 'production',
 'set',
 'statistic'}

In [75]:
sents = list(def_nlp.sents)

In [76]:
sents

[In statistics, overfitting is "the production of an analysis that corresponds too closely or exactly to a particular set of data, and may therefore fail to fit additional data or predict future observations reliably"]

In [78]:
sent = sents[0]

In [79]:
sent.root

is

In [87]:
for token in sent:
    print(token, token.dep_)

In prep
statistics pobj
, punct
overfitting nsubj
is ROOT
" punct
the det
production attr
of prep
an det
analysis pobj
that nsubj
corresponds relcl
too advmod
closely advmod
or cc
exactly conj
to prep
a det
particular amod
set pobj
of prep
data pobj
, punct
and cc
may aux
therefore advmod
fail conj
to aux
fit xcomp
additional amod
data dobj
or cc
predict conj
future amod
observations dobj
reliably advmod
" punct


In [90]:
pobjs = [token.lemma_ for token in sent if token.dep_ == 'pobj']

In [91]:
pobjs

['statistic', 'analysis', 'set', 'datum']

get:
* list of multiword noun chunks (remove det tokens)
* list of lemmatized roots of noun chunks
* overall sentence embedding

score:
(top noun chunks that match / total number of noun chunks in the definition) * sentence similarity + number of multiword noun chunks that match

In [105]:
cos_sim(model.nlp('rain').vector, model.nlp('umbrella').vector)

0.36585009

In [122]:
def get_mw_nc(doc):
    """Get list of multiword noun chunks.

    Tokens with dependency of 'det' are removed and the remaining tokens are
    replaced by their lemmas. Chunks left with fewer than two tokens are
    dropped.

    Arguments:
        doc (spaCy doc object)

    Returns:
        pruned_mw (list): list of multiword noun chunks, each a string of
            space-joined lemmas
    """
    pruned_mw = []

    for chunk in doc.noun_chunks:
        # Compare by value: `is not` tests object identity, which equal
        # strings are not guaranteed to share, so 'det' tokens could slip
        # through.
        lemmas = [token.lemma_ for token in chunk if token.dep_ != 'det']
        if len(lemmas) > 1:
            pruned_mw.append(" ".join(lemmas))

    return pruned_mw

In [163]:
def get_lemma_roots(doc):
    """Collect the lemma of each noun chunk's root token.

    One entry is produced per noun chunk, in document order; repeated
    lemmas are kept.

    Arguments:
        doc (spaCy doc object)

    Returns:
        lemma_roots (list): lemmatized root of every noun chunk in doc
    """
    lemma_roots = []
    for chunk in doc.noun_chunks:
        lemma_roots.append(chunk.root.lemma_)
    return lemma_roots

In [164]:
get_lemma_roots(def_nlp)

['statistic',
 'overfitting',
 'production',
 'analysis',
 'set',
 'datum',
 'datum',
 'observation']