In [26]:
import nltk
import re
from nltk.corpus import wordnet as wn
from itertools import product
from nltk.corpus import wordnet_ic


def get_sim_score(word_1, word_2, info_content):
    """Return the best Jiang-Conrath similarity between two words.

    Every WordNet synset of ``word_1`` is compared with every synset of
    ``word_2`` via ``wn.jcn_similarity`` and the largest score is kept.

    Args:
        word_1: First word.
        word_2: Second word.
        info_content: Information-content table handed straight to
            ``wn.jcn_similarity`` (e.g. the result of ``wordnet_ic.ic(...)``).

    Returns:
        ``1`` when both words are the same string. Otherwise the highest
        score found, or ``0.0`` when either word has no synsets or no synset
        pair could be scored. The score is not capped at 1.
    """
    if word_1 == word_2:
        return 1

    # Function-scope import: only needed on the WordNet path, and it keeps
    # this block independent of the file's top-level import list.
    from nltk.corpus.reader.wordnet import WordNetError

    max_sim = 0.0
    # product() over an empty synset list yields nothing, so words unknown to
    # WordNet simply fall through to the 0.0 result.
    for synset_1, synset_2 in product(wn.synsets(word_1), wn.synsets(word_2)):
        try:
            sim = wn.jcn_similarity(synset_1, synset_2, info_content)
        except WordNetError:
            # Pair cannot be scored (e.g. differing parts of speech or no
            # information content for that part of speech): skip it. Anything
            # else, including KeyboardInterrupt, is allowed to propagate.
            continue
        # Guard against measures that return None for unrelated synsets, in
        # case the similarity function above is swapped for another one.
        if sim is not None and sim > max_sim:
            max_sim = sim

    return max_sim

def remove_dup(seq):
    """Return the items of ``seq`` as a list with repeated items dropped.

    Only the first occurrence of each item is kept, in its original
    position relative to the other kept items. Items must be hashable
    because a set tracks what has been seen.
    """
    unique = []
    already_seen = set()
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique.append(item)
    return unique


# Information-content tables already loaded, keyed by IC file name.
_INFO_CONTENT_CACHE = {}


def _load_info_content(corpus):
    """Return the information-content table for ``corpus``, loading it once."""
    if corpus not in _INFO_CONTENT_CACHE:
        _INFO_CONTENT_CACHE[corpus] = wordnet_ic.ic(corpus)
    return _INFO_CONTENT_CACHE[corpus]


def _extract_concepts(text, stopwords):
    """Return the noun/adjective tokens of ``text`` followed by their bigrams."""
    # Keep ASCII letters only, then lowercase, before tokenizing.
    cleaned = re.sub('[^a-zA-Z]', ' ', text).lower()
    tokens = nltk.word_tokenize(cleaned, language='english')
    content_words = [token for token in tokens if token not in stopwords]

    concepts = [
        word
        for word, tag in nltk.pos_tag(content_words)
        if tag.startswith(('NN', 'JJ'))
    ]
    # Bigrams are built from the already-filtered tokens, so they join
    # neighbours in the filtered list rather than in the raw text.
    bigrams = [' '.join(pair).strip() for pair in nltk.ngrams(concepts, 2)]
    return concepts + bigrams


def mySim(text1, text2, sigma=0.85, w=0.3, corpus='ic-brown-resnik.dat'):
    """Report whether two texts share at least one similar concept.

    Each text is reduced to its noun/adjective tokens plus the bigrams of
    those tokens. Every concept of ``text1`` is compared with every concept
    of ``text2``: the ``get_sim_score`` values of all word pairs across the
    two concepts are summed, and the pair counts as a match when that sum
    reaches ``sigma``. The matched concepts of both texts are printed.

    Args:
        text1: First raw text.
        text2: Second raw text.
        sigma: Minimum summed similarity for two concepts to match.
        w: Currently unused; kept so existing callers keep working.
        corpus: Name of the WordNet information-content file to use.

    Returns:
        ``1`` if at least one concept of ``text1`` matched, otherwise ``0``.
    """
    # A set makes each stop-word test O(1) instead of scanning the list.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Loading the IC file is the expensive part of a call; reuse it so that
    # calling mySim once per sentence does not reload it each time.
    info_content = _load_info_content(corpus)

    pos_p = _extract_concepts(text1, stopwords)
    pos_r = _extract_concepts(text2, stopwords)

    matched_p = set()
    matched_r = set()
    for w1, w2 in product(pos_p, pos_r):
        # A concept may be a bigram; score every word pair across the two.
        sim = sum(
            get_sim_score(t1, t2, info_content)
            for t1, t2 in product(set(w1.split()), set(w2.split()))
        )
        if sim >= sigma:
            matched_p.add(w1)
            matched_r.add(w2)

    # Report matches in the order the concepts appear in each text.
    concept_x = [words for words in pos_p if words in matched_p]
    concept_y = [words for words in pos_r if words in matched_r]

    print(concept_x, concept_y)

    return 1 if concept_x else 0


# NOTE: disabled draft of an order-weighted score. It is a bare module-level
# string expression (it starts at column 0, after mySim has ended), so none of
# it executes. It refers to names such as concept_x, concept_y and w that only
# exist inside mySim.
'''
    count = len(concept_x)
    total = 0
    
    m = len(concept_p)
    n = len(concept_r)

    if count == 0:
        sim_score_0 = 0

    elif count % 2 == 0 or count == 1:
        for position, word in enumerate(concept_x):
            total += abs(position - concept_y.index(dic[word]))

        sim_score_0 = 1 - 2 * total / count ** 2

    else:
        for position, word in enumerate(concept_x):
            total += abs(position - concept_y.index(dic[word]))

        sim_score_0 = 1 - 2 * total / (count ** 2 - 1)

    S = ((m + n) / (2 * m * n)) * (count * (1 - w * (1 - sim_score_0)))

    return S
'''

'\n    count = len(concept_x)\n    total = 0\n    \n    m = len(concept_p)\n    n = len(concept_r)\n\n    if count == 0:\n        sim_score_0 = 0\n\n    elif count % 2 == 0 or count == 1:\n        for position, word in enumerate(concept_x):\n            total += abs(position - concept_y.index(dic[word]))\n\n        sim_score_0 = 1 - 2 * total / count ** 2\n\n    else:\n        for position, word in enumerate(concept_x):\n            total += abs(position - concept_y.index(dic[word]))\n\n        sim_score_0 = 1 - 2 * total / (count ** 2 - 1)\n\n    S = ((m + n) / (2 * m * n)) * (count * (1 - w * (1 - sim_score_0)))\n\n    return S\n'

In [3]:
# Example: compare two sentences that both mention Malin and PingPong, using
# mySim's default threshold.
mySim('Many consider Malin as the best player in PingPong history', 'Malin is one of the best PingPong players')

['many', 'malin', 'best', 'player', 'pingpong', 'history', 'many malin', 'malin best', 'best player', 'player pingpong', 'pingpong history'] ['malin', 'best', 'pingpong', 'players', 'malin best', 'best pingpong', 'pingpong players']


0.0 many | malin 		 {'many'} | {'malin'}
0.0 many | best 		 {'many'} | {'best'}
0.0 many | pingpong 		 {'many'} | {'pingpong'}
0.0 many | players 		 {'many'} | {'players'}
0.0 many | malin 		 {'many'} | {'malin', 'best'}
0.0 many | best 		 {'many'} | {'malin', 'best'}
0.0 many | best 		 {'many'} | {'best', 'pingpong'}
0.0 many | pingpong 		 {'many'} | {'best', 'pingpong'}
0.0 many | players 		 {'many'} | {'players', 'pingpong'}
0.0 many | pingpong 		 {'many'} | {'players', 'pingpong'}
1 malin | malin 		 {'malin'} | {'malin'}
0.0 malin | best 		 {'malin'} | {'best'}
0.0 malin | pingpong 		 {'malin'} | {'pingpong'}
0.0 malin | players 		 {'malin'} | {'players'}
1 malin | malin 		 {'malin'} | {'malin', 'best'}
1.0 malin | best 		 {'malin'} | {'malin', 'best'}
0.0 malin | best 		 {'malin'} | {'best', 'pingpong'}
0.0 malin | pingpong 		 {'malin'} | {'best', 'pingpong'}
0.0 malin | players 		 {'malin'} | {'players', 'pingpong'}
0.0 malin | pingpong 		 {'malin'} | {'players', 'pingpong'}
0.0 

0.0 pingpong | malin 		 {'pingpong', 'player'} | {'malin', 'best'}
0.0 pingpong | best 		 {'pingpong', 'player'} | {'malin', 'best'}
0.0 player | malin 		 {'pingpong', 'player'} | {'malin', 'best'}
0.09319338523851901 player | best 		 {'pingpong', 'player'} | {'malin', 'best'}
0.0 pingpong | best 		 {'pingpong', 'player'} | {'best', 'pingpong'}
1.0 pingpong | pingpong 		 {'pingpong', 'player'} | {'best', 'pingpong'}
1.093193385238519 player | best 		 {'pingpong', 'player'} | {'best', 'pingpong'}
1.093193385238519 player | pingpong 		 {'pingpong', 'player'} | {'best', 'pingpong'}
0.0 pingpong | players 		 {'pingpong', 'player'} | {'players', 'pingpong'}
1.0 pingpong | pingpong 		 {'pingpong', 'player'} | {'players', 'pingpong'}
1e+300 player | players 		 {'pingpong', 'player'} | {'players', 'pingpong'}
1e+300 player | pingpong 		 {'pingpong', 'player'} | {'players', 'pingpong'}
0.0 history | malin 		 {'history', 'pingpong'} | {'malin'}
0.0 pingpong | malin 		 {'history', 'pingpong'} | {

1

In [24]:
from nltk import tokenize
# Read the whole input document first, then split it into sentences once the
# file has been closed.
with open('/Users/maxwelllee54/GitHubs/Rapid_Assessment_Tools/Bhutan_Input.txt') as f:
    data = f.read()

sen = tokenize.sent_tokenize(data)

In [18]:
# Reference statement passed as text1 in the mySim(target, test) calls.
target = 'By 2030, reduce at least by half the proportion of men, women and children of all ages living in poverty in all its dimensions according to national definitions'

In [21]:
# Candidate statement passed as text2 in the mySim(target, test) calls.
test = 'Expand the social protection system to cover children in all poor households'

In [22]:
# Compare target and test with the match threshold lowered from mySim's
# default of 0.85 to 0.6.
mySim(target, test, sigma=0.6)

['least', 'half', 'proportion', 'men', 'women', 'children', 'ages', 'poverty', 'dimensions', 'national', 'definitions', 'least half', 'half proportion', 'proportion men', 'men women', 'women children', 'children ages', 'ages poverty', 'poverty dimensions', 'dimensions national', 'national definitions'] ['social', 'protection', 'system', 'cover', 'children', 'poor', 'households', 'social protection', 'protection system', 'system cover', 'cover children', 'children poor', 'poor households']
1 		 {'children'} | {'children'}
1.0950701259887032 		 {'children'} | {'cover', 'children'}
1.066734669871967 		 {'children'} | {'poor', 'children'}


1.1212713746416614 		 {'women', 'children'} | {'children'}
1.3179252615015744 		 {'women', 'children'} | {'cover', 'children'}
1.343778961002156 		 {'women', 'children'} | {'poor', 'children'}
1.0728309862672214 		 {'ages', 'children'} | {'children'}
1.2473288368461346 		 {'ages', 'children'} | {'cover', 'children'}
1.2103216199212885 		 {'ages', 'children'} | {'poor', 'children'}
['children', 'women children', 'children ages'] ['children', 'cover children', 'children poor']


1

In [33]:
# Compare target and test using mySim's default threshold (sigma=0.85).
mySim(target, test)

[] []


0.0

In [19]:
# Load the information-content tables from 'ic-brown-resnik.dat' and
# 'ic-semcor.dat'. Neither variable is referenced by the other cells shown in
# this notebook.
brown_ic = wordnet_ic.ic('ic-brown-resnik.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

In [13]:
from nltk.corpus import wordnet as wn
from itertools import product

# Print the wup_similarity score for every synset pair of two probe words.
synsets_1 = wn.synsets('poverty')
synsets_2 = wn.synsets('individuals')

# product() over an empty list yields nothing, so no emptiness guard is needed.
for synset_1, synset_2 in product(synsets_1, synsets_2):
    try:
        sim = wn.wup_similarity(synset_1, synset_2)
    except Exception:
        # Best-effort exploration: skip pairs that cannot be scored, but let
        # KeyboardInterrupt/SystemExit through (a bare except would not).
        continue
    print(sim, synset_1, synset_2)

0.18181818181818182 Synset('poverty.n.01') Synset('person.n.01')
0.14285714285714285 Synset('poverty.n.01') Synset('individual.n.02')


In [28]:
# Print each sentence of the input document for which mySim reports a match
# against the query, using a threshold of 1.
for sentence in sen:
    is_match = mySim('poverty, dimensions', sentence, sigma=1)
    if is_match:
        print(sentence)

[] []


['poverty', 'dimensions', 'poverty dimensions'] ['dimensions', 'poverty', 'education dimensions', 'dimensions development', 'sustainable poverty', 'poverty alleviation']
Education virtually impacts all dimensions of development and is critical for ensuring sustainable poverty alleviation which can only come through the empowerment of our people, particularly the remote and most vulnerable sections of our population.


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


['dimensions', 'poverty dimensions'] ['proportion', 'ratios proportion', 'proportion pupils']
In terms of the MDGs, the targets for gross primary enrolment ratios, proportion of pupils starting grade 1 who reach grade 5, and the ratio of girls to boys in primary schools and secondary schools have already been achieved while the other targets are well on track to be achieved.


[] []


[] []


[] []


[] []


[] []


[] []


[] []


['poverty', 'poverty dimensions'] ['needs', 'learning needs', 'needs children']
This 1.5 percent are assumed to be children in remote and hard to reach areas, children of nomadic communities, children with learning disabilities whose special learning needs are yet to be catered for and children of the urban poor.


[] []


[] []


[] []


[] []


[] []


['poverty', 'poverty dimensions'] ['needs', 'special needs', 'needs children']
Addressing the last mile challenge of enrollment, including disparities at local levels and catering to the special needs of children with disabilities will require innovative and cost effective strategies in the delivery of education.


[] []


['poverty', 'poverty dimensions'] ['need', 'high need', 'need improvement']
Judging solely by improvements in efficiency indicators such as drop-out rates, repetition rates, completion rates and survival rates, the quality of education has certainly improved over the years, although some of these rates, such as repetition and dropout rates in class IV and VII, are still undesirably high and need further improvement.


[] []


[] []


['dimensions', 'poverty dimensions'] ['time', 'shortage time']
This has resulted in a paradoxical situation of relatively high levels of youth unemployment and a critical skills shortage at the same time.


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


['poverty', 'poverty dimensions'] ['needs', 'developmental needs', 'needs clear']
5.1.3  Sector Key Result Areas

The sector key result areas and key performance indicators for TVET and Education sector are as follows:

5.1.4  Strategies

Any strategy to strengthen our education system and institutions to the level required to bring about the transformation in our socio-economic development process must ensure that we take a comprehensive stock of the state of our entire education system (spanning school, TVET, higher education, and continuing education) as an integrated whole rather than disparate parts (including regular assessment and diagnostics of student performance and identify the root causes of the mismatch between educational outcomes and our developmental needs); establish clear quality criteria in terms of the desired outcomes that are responsive to the changing expectations of students and are regularly benchmarked to global standards (such as student outcomes and employme

[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


[] []


['poverty', 'poverty dimensions'] ['need', 'schools need', 'need increase']
5.1.5  Key Programmes

Some of the key programmes for the Eleventh Plan:

Education

i)	Construction of new schools based on need to increase space especially at the secondary levels;

ii)	Provision of minimum standard facilities, wherever required, to ensure a positive environment for teaching-learning process.


[] []


['poverty', 'poverty dimensions'] ['needs', 'programmes needs', 'needs individuals']
in nearby schools and rationalizing smaller schools;

iv)	Provision of staff quarters/allowances especially in remote and rural areas to make rural posting attractive and improve quality of education in the remote locations;

v)	Addressing gender gaps at higher secondary level through provision of gender responsive facilities such as hostels and hostel toilets for girls' toilets and housing for female teachers;

vi)	Strengthening curriculum research and development and finding mechanisms to enhance efficiency, accountability and retain specialized skills;

vii)	Enhance capacity of tertiary education to achieve GER at tertiary education of 35 percent by 2017-18;

viii)	Enhance quality and relevance of tertiary education programmes to meet the needs of individuals as well as the industry and society at large.


[] []


[] []


[] []


[] []


[] []


KeyboardInterrupt: 