In [29]:
import nltk
import re
from nltk.corpus import wordnet as wn
from itertools import product
from nltk.corpus import wordnet_ic


def get_sim_score(word_1, word_2, info_content):
    """ 
    Calculate the highest path similarity among all pairs. 
    """

    if word_1 == word_2:
        return 1
    else:
        max_sim = -1.0
        synsets_1 = wn.synsets(word_1)
        synsets_2 = wn.synsets(word_2)
        if synsets_1 and synsets_2:
            for synset_1, synset_2 in product(synsets_1, synsets_2):
                try:
                    sim = wn.lin_similarity(synset_1, synset_2, info_content)
                    #sim = wn.wup_similarity(synset_1, synset_2)
                    if sim > max_sim:
                        max_sim = sim
                except:
                    continue

            return max_sim
        return max_sim


def mySim(text1, text2, sigma=0.85, w=0.3, corpus='ic-brown-resnik.dat'):
    # set stop words
    stopwords = nltk.corpus.stopwords.words('english')
    # set variables
    x = []
    y = []
    dic = {}
    info_content = wordnet_ic.ic(corpus)

    # clean raw text
    text1 = re.sub('[^a-zA-Z]', ' ', text1).lower()
    text2 = re.sub('[^a-zA-Z]', ' ', text2).lower()

    # tokenize inputs into vectors
    token_p = nltk.word_tokenize(text1, language='english')
    token_r = nltk.word_tokenize(text2, language='english')

    concept_p = [words for words in token_p if words not in stopwords]
    concept_r = [words for words in token_r if words not in stopwords]

    m = len(concept_p)
    n = len(concept_r)


    for w1, w2 in product(concept_p, concept_r):
        sim = get_sim_score(w1, w2, info_content)
        if sim >= sigma:
            x.append(w1)
            y.append(w2)
            dic[w1] = w2

    concept_x = [words for words in token_p if words in x]
    concept_y = [words for words in token_r if words in y]

    print(concept_x, concept_y)
    count = len(concept_x)
    total = 0

    if count == 0:
        sim_score_0 = 0

    elif count % 2 == 0 or count == 1:
        for position, word in enumerate(concept_x):
            total += abs(position - concept_y.index(dic[word]))

        sim_score_0 = 1 - 2 * total / count ** 2

    else:
        for position, word in enumerate(concept_x):
            total += abs(position - concept_y.index(dic[word]))

        sim_score_0 = 1 - 2 * total / (count ** 2 - 1)

    S = ((m + n) / (2 * m * n)) * (count * (1 - w * (1 - sim_score_0)))

    return S

In [30]:
mySim('Many consider Malin as the best player in PingPong history', 'Malin is one of the best PingPong players')

['malin', 'best', 'player', 'pingpong'] ['malin', 'best', 'pingpong', 'players']


0.6342857142857143

In [16]:
from nltk import tokenize
with open('/Users/Maxwell/PycharmProjects/Github/Rapid_Assessment_Tools/Bhutan_Input.txt') as f:
    data = f.read()
    sen = tokenize.sent_tokenize(data)

In [31]:
target = 'By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than $1.25 a day'

In [32]:
test = 'By 2017, the proportion of severely poor individuals has dropped from 15.8% in 2009/10 to below 10%.'

In [33]:
mySim(target, test)

[] []


0.0

In [19]:
brown_ic = wordnet_ic.ic('ic-brown-resnik.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

In [10]:
from nltk.corpus import wordnet as wn
from itertools import product

synsets_1 = wn.synsets('poverty')
synsets_2 = wn.synsets('poor')
if synsets_1 and synsets_2:
    for synset_1, synset_2 in product(synsets_1, synsets_2):
        try:
            sim = wn.jcn_similarity(synset_1, synset_2, brown_ic)
            print(sim, synset_1, synset_2)
        except:
            continue

0.06838066838413086 Synset('poverty.n.01') Synset('poor_people.n.01')


In [28]:
for s in sen: 
    score = mySim(target, s)
    if score > 0.7:
        print(s)

['extreme', 'people', 'people', 'living'] ['goals', 'people']


['poverty', 'people', 'people', 'living'] ['poverty', 'come', 'people', 'population']


['people', 'people', 'living'] ['citizenry', 'rights']


['people', 'people', 'living', 'day'] ['realization', 'society']


[] []


['day'] ['tertiary']


['people', 'people', 'living'] ['labour', 'resources']


['living'] ['provision']


[] []


['people', 'people', 'living', 'day'] ['free', 'classes', 'age']


['eradicate', 'extreme', 'people', 'people', 'living'] ['end', 'class', 'board']


['people', 'people', 'living'] ['class']


['people', 'people', 'living'] ['classes']


['people', 'measured', 'people', 'living'] ['classes', 'marking']


['people', 'people', 'living'] ['country']


['eradicate', 'extreme', 'day'] ['months', 'level']


['people', 'people', 'living'] ['crafts']


['people', 'people', 'living'] ['class', 'labour', 'market']


['extreme', 'people', 'people', 'living', 'day'] ['tertiary', 'parts', 'country']


[] []


['extreme', 'people', 'people', 'living'] ['degree', 'resources', 'business']


['people', 'people', 'living', 'day'] ['post', 'business']


['extreme', 'people', 'people', 'living', 'day'] ['post', 'degree', 'public']


[] []


[] []


['living'] ['net', 'net']


['extreme'] ['high']


['extreme', 'measured', 'living'] ['gross', 'grade', 'reach', 'grade']


['measured'] ['rate']


[] []


[] []


['less'] ['parity']


['extreme', 'living', 'day'] ['opportunities', 'quality']


['eradicate', 'extreme', 'measured', 'living', 'day'] ['key', 'last', 'levels', 'net']


['living', 'day'] ['going', 'age']


['extreme', 'poverty', 'people', 'people', 'living', 'less'] ['reach', 'disabilities', 'needs', 'poor']


['measured', 'living'] ['rates', 'exist']


['living'] ['net']


['measured', 'living', 'less'] ['parity', 'scores', 'exists']


['eradicate', 'extreme'] ['level']


['eradicate', 'people', 'measured', 'people', 'living'] ['move', 'classes', 'drop', 'rates']


['eradicate', 'extreme', 'poverty', 'measured', 'living', 'less'] ['last', 'levels', 'needs', 'disabilities']


['extreme', 'living'] ['quality', 'quality', 'issue']


['eradicate', 'extreme', 'poverty', 'people', 'measured', 'people', 'living', 'day'] ['drop', 'rates', 'rates', 'rates', 'survival', 'rates', 'quality', 'rates', 'rates', 'class', 'high', 'need']


['day'] ['tertiary']


['extreme'] ['hand', 'face']


['eradicate', 'extreme', 'people', 'measured', 'people', 'living', 'day'] ['high', 'levels', 'youth', 'time']


['measured', 'living'] ['scale', 'met']


['extreme', 'people', 'people', 'living', 'day'] ['fact', 'meet', 'top', 'tertiary', 'poor', 'youth', 'today']


['extreme', 'people', 'people', 'living', 'day'] ['free', 'quality', 'people', 'potential']


['extreme', 'people', 'people', 'living', 'day'] ['today', 'population', 'youth', 'opportunity', 'quality']


[] []


['eradicate', 'extreme', 'people', 'people', 'living'] ['countries', 'high', 'level']


['eradicate', 'extreme', 'people', 'people', 'living', 'day'] ['even', 'english', 'levels']


['extreme'] ['quality', 'quality']


['eradicate', 'extreme', 'poverty', 'people', 'measured', 'people', 'living', 'day'] ['key', 'quality', 'level', 'grades', 'level', 'way', 'low', 'quality', 'resources', 'initiate', 'infrastructure', 'support', 'goals', 'goals', 'need', 'degree', 'resources', 'quality']


['people', 'people', 'living', 'day'] ['resources', 'poor', 'service']


['extreme', 'measured'] ['recommended', 'high']


['extreme', 'living'] ['infrastructure', 'quality']


[] []


['extreme', 'day'] ['today', 'high']


['people', 'people', 'living'] ['poor']


['living'] ['resource']


['living'] ['resource']


['extreme', 'living', 'day'] ['tertiary', 'issue', 'quality']


['people', 'people', 'living'] ['completing', 'classes', 'labour', 'market']


['extreme', 'day'] ['tertiary', 'quality', 'post']


['people', 'people', 'living'] ['trade']


['extreme', 'poverty', 'day'] ['view', 'need', 'quality', 'tertiary']


['measured', 'living'] ['share', 'last', 'allocation']


[] []


['day'] ['th']


[] []


['extreme', 'living'] ['capital']


['eradicate', 'extreme', 'living', 'day'] ['coming', 'maintenance', 'past', 'infrastructure', 'investments', 'capital', 'investments', 'levels']


['poverty', 'living'] ['need', 'take']


['measured'] ['rates']


['eradicate', 'extreme', 'poverty', 'people', 'people', 'living', 'day'] ['key', 'key', 'key', 'level', 'take', 'stock', 'parts', 'needs', 'quality', 'support']


[] []


[] []


['extreme', 'living', 'less', 'day'] ['experience', 'top', 'gestation']


['living'] ['resources', 'support']


['eradicate', 'extreme', 'measured', 'living', 'day'] ['tertiary', 'goals', 'goals', 'goals', 'quality', 'goals', 'principals', 'quality', 'resources', 'principals', 'support', 'promote', 'quality', 'infrastructure', 'support', 'execute', 'life', 'life', 'meeting', 'tertiary']


[] []


['extreme'] ['reach']


['extreme', 'people', 'people', 'living', 'day'] ['infrastructure', 'service', 'high', 'boarding']


['living', 'day'] ['past', 'infrastructure']


[] []


['living'] ['issue']


['extreme', 'measured', 'living'] ['scale', 'infrastructure', 'resources', 'grade', 'quality']


['extreme'] ['quality']


['living', 'day'] ['service', 'resources', 'realities']


['less', 'day'] ['less', 'future']


['currently', 'less'] ['currently', 'less']


[] []


['extreme'] ['quality', 'face']


['eradicate', 'extreme', 'people', 'people', 'living', 'day'] ['quality', 'resource', 'service', 'drawn']


['measured'] ['promoting']


['day'] ['service']


['people', 'people', 'living', 'day'] ['public', 'service']


['day'] ['tertiary']


['eradicate', 'extreme', 'poverty', 'living', 'day'] ['key', 'key', 'need', 'levels', 'provision', 'minimum']


['eradicate', 'living', 'day'] ['infrastructure', 'provision', 'store', 'quarters', 'hall']


['eradicate', 'extreme', 'poverty', 'people', 'people', 'living', 'day'] ['rationalizing', 'provision', 'quarters', 'allowances', 'quality', 'level', 'provision', 'finding', 'tertiary', 'tertiary', 'quality', 'tertiary', 'meet', 'needs', 'society']


['extreme', 'living'] ['quality', 'infrastructure']


[] []


['eradicate', 'extreme', 'people', 'people', 'living', 'day'] ['free', 'services', 'tertiary', 'level', 'right', 'free', 'public', 'services']


[] []


['people', 'people', 'living'] ['domain', 'contribution', 'domains']


['poverty', 'living', 'day'] ['provision', 'services', 'need', 'issues']


[] []


['extreme', 'living', 'day'] ['coverage', 'quality', 'services']


['extreme'] ['goals']


['living', 'less'] ['investments', 'diseases']


[] []


['less'] ['tb', 'malaria']


['less'] ['hiv', 'aids']


['less'] ['hiv', 'aids']


['living'] ['coverage']


['currently'] ['currently']


['day'] ['services']


['day'] ['services']


['living'] ['support']


['people', 'people', 'living'] ['country']


['measured', 'living'] ['last']


['less'] ['growth']


['measured', 'living'] ['percentage', 'stands']


[] []


['living', 'less', 'day'] ['coming', 'services', 'diseases', 'diseases']


['living'] ['resource']


['extreme', 'people', 'people', 'living', 'day'] ['key', 'quality', 'public', 'quality']


['people', 'people', 'living'] ['net', 'population', 'public']


['day'] ['service']


['eradicate', 'extreme', 'poverty', 'day'] ['hand', 'services', 'levels', 'service', 'services', 'needs']


['extreme', 'measured'] ['measure', 'quality']


['living'] ['resource', 'suffers']


[] []


[] []


['extreme', 'day'] ['even', 'region']


['extreme', 'poverty', 'living', 'day'] ['face', 'resource', 'even', 'face', 'need', 'services']


['living'] ['infrastructure']


['measured'] ['rate']


['poverty', 'living', 'day'] ['need', 'service', 'resource']


['people', 'people', 'living'] ['people', 'people', 'people', 'people', 'people']


['extreme', 'living', 'less', 'day'] ['diseases', 'presence', 'high', 'consumption']


['extreme', 'less'] ['diabetes', 'hypertension', 'cancers', 'injuries']


['extreme', 'people', 'people', 'living', 'day'] ['quality', 'life', 'families', 'society']


['poverty'] ['need']


['less'] ['hiv', 'aids', 'tb', 'malaria', 'diseases', 'hiv', 'aids']


['poverty', 'people', 'people', 'living', 'less'] ['affluence', 'labour', 'people', 'hiv', 'aids']


['less'] ['tb']


['less'] ['tb', 'hiv', 'tb', 'infection']


['people', 'people', 'living', 'less'] ['malaria', 'free', 'disease']


['eradicate', 'extreme', 'living', 'less'] ['eliminate', 'malaria', 'border', 'issue']


['less'] ['dengue']


[] []


['extreme', 'people', 'people', 'living'] ['countries', 'region', 'low', 'allocations']


[] []


[] []


['poverty'] ['need']


['people', 'people', 'living'] ['people']


['eradicate', 'people', 'people', 'living', 'less'] ['malnutrition', 'wasting', 'public', 'issue']


['people', 'people', 'living', 'less'] ['anemia', 'women']


['living', 'day'] ['experiencing', 'occurrences']


['poverty', 'people', 'people', 'living'] ['need', 'agriculture']


['people', 'people', 'living'] ['free', 'coverage']


['currently'] ['currently']


['extreme', 'people', 'measured', 'people', 'living', 'day'] ['recommend', 'country', 'allocation', 'quality', 'services']


['living'] ['fund', 'support']


['measured', 'living'] ['endowment', 'fund', 'stands']


['poverty', 'measured', 'day'] ['need', 'promote', 'services', 'promoted']


['eradicate', 'extreme', 'people', 'people', 'living', 'day'] ['service', 'line', 'take', 'services', 'level']


['extreme', 'living', 'day'] ['service', 'quality', 'assurance']


['extreme', 'day'] ['quality', 'services', 'emphasis']


[] []


['living'] ['resource']


['eradicate', 'extreme', 'poverty', 'people', 'measured', 'people', 'living', 'day'] ['line', 'coverage', 'services', 'level', 'standardized', 'services', 'need']


['living', 'less'] ['investment', 'diseases']


['extreme', 'people', 'people', 'living', 'less'] ['emphasis', 'public', 'injuries', 'hiv', 'aids', 'tb', 'malaria', 'diseases']


['people', 'people', 'living'] ['resources', 'resource', 'line']


['day'] ['service']


['poverty'] ['need']


['measured', 'living', 'day'] ['push', 'go', 'coverage']


['poverty', 'people', 'people', 'living'] ['country', 'needs']


['people', 'measured', 'people', 'living', 'day'] ['measures', 'free', 'services']


['measured'] ['measures']


[] []


['living', 'day'] ['potential']


['poverty'] ['needs']


[] []


['day'] ['services']


['extreme', 'measured', 'living', 'day'] ['infrastructure', 'resource', 'pace', 'infrastructure', 'resources', 'infrastructure', 'quality', 'services']


['living'] ['existing']


['living'] ['infrastructure', 'infrastructure']


['day'] ['key', 'services']


[] []


[] []


[] []


[] []


['people', 'people', 'living'] ['public']


[] []


[] []


['measured', 'day'] ['services', 'scaling']


['people', 'measured', 'people', 'living', 'day'] ['country', 'time']


['people', 'people', 'living', 'day'] ['public', 'services']


['living'] ['resources']


['day'] ['services']


['people', 'people', 'living'] ['resource', 'resources', 'society']


['extreme', 'people', 'people', 'living', 'day'] ['high', 'resource', 'investments', 'country', 'country', 'resource', 'services', 'resource']


['extreme', 'people', 'people', 'living'] ['quality', 'resources', 'high', 'public']


['extreme', 'less'] ['dependency', 'high']


['living'] ['pool']


['people', 'people', 'living', 'day'] ['labour', 'resources', 'service']


['living', 'day'] ['service', 'security', 'opportunities']


[] []


['living', 'day'] ['service', 'support']


['less', 'day'] ['less', 'service']


['extreme', 'living', 'day'] ['bachelors', 'degree', 'post', 'masters', 'degree']


['extreme', 'living'] ['resources', 'issue', 'meet', 'minimum']


['living'] ['percentage', 'meeting', 'support']


['eradicate', 'extreme', 'living', 'day'] ['key', 'service', 'percentage', 'levels', 'meeting', 'minimum', 'high']


['extreme', 'living'] ['issue', 'high']


['poverty', 'living'] ['resource', 'needs']


['extreme', 'people', 'people', 'living'] ['hand', 'hand', 'youth', 'high']


['people', 'people'] ['fill']


[] []


['day'] ['service', 'service']


['extreme'] ['high']


['people', 'measured', 'people', 'living', 'day'] ['last', 'months', 'lost']


['poverty', 'living'] ['resource', 'issues', 'need']


['day'] ['key', 'service']


['currently', 'living'] ['currently', 'allowance']


['living', 'less', 'day'] ['present', 'growth', 'service', 'coming']


['people', 'people', 'living', 'day'] ['key', 'key', 'resource', 'service', 'labour', 'resources']


['day'] ['service', 'services']


['extreme', 'living'] ['resource', 'emphasis']


['living'] ['support']


['living', 'day'] ['resources', 'hr']


['extreme', 'day'] ['open', 'past']


['living', 'day'] ['hr', 'resources']


[] []


['living'] ['resource']


[] []


['living'] ['resource']


['people', 'people', 'living', 'less'] ['labour', 'conditions']


[] []


['measured'] ['rate']


['people', 'people', 'living', 'day'] ['period', 'hour', 'business', 'going']


['day'] ['period', 'week', 'date']


['extreme', 'people', 'people', 'living'] ['labour', 'force', 'labour', 'resources']


['measured'] ['rates']


['eradicate', 'extreme', 'people', 'people', 'living'] ['levels', 'youth']


['people', 'people', 'living', 'less'] ['agriculture', 'public', 'less']


['people', 'people', 'living'] ['percentage', 'agriculture', 'trade']


['people', 'people', 'living'] ['agriculture', 'family']


['eradicate', 'extreme', 'measured'] ['level', 'rate', 'rates']


['measured', 'less'] ['rate', 'less']


['extreme', 'measured', 'day'] ['key', 'quality', 'rates', 'quality']


['people', 'measured', 'people', 'living', 'less'] ['agriculture', 'growth', 'rate']


['eradicate', 'extreme', 'less', 'day'] ['growth', 'tertiary', 'levels', 'growth']


['people', 'people', 'living', 'day'] ['even', 'income', 'security', 'agriculture', 'family']


['people', 'people', 'living'] ['women']


['extreme', 'people', 'people', 'living', 'day'] ['agriculture', 'hours', 'hours', 'earnings', 'low', 'income']


['living'] ['incomes']


[] []


ZeroDivisionError: division by zero