In [3]:
import pandas as pd
import re

path = 'Namibia.txt'

with open(path, 'r') as source_file:
    text = source_file.read()

# The file is a sequence of sections separated by '*****'. Within a section,
# the first space separates the sector label from the section's body text.
section_rows = []
for context in text.split(sep='*****'):
    parts = context.strip().split(' ', 1)
    # A blank section (e.g. after a trailing separator) or a label with no
    # body yields a single element; such a row would get text=None and break
    # any later string processing, so it is skipped.
    if len(parts) == 2:
        section_rows.append(parts)

df = pd.DataFrame(section_rows, columns=['sector', 'text'])
df.head()

Unnamed: 0,sentence,sector
0,Institutional Environment\n\nThere is a genera...,1
1,"Very broadly defined, institutional environmen...",1
2,"Arguably, the private sector has the greatest ...",1
3,Business activities can create jobs and entrep...,1
4,The private sector has a role to play not only...,1


In [17]:
# Split each section's raw text into sentences and keep the sector label.
# A boundary is whitespace preceded by a full stop, except after patterns
# such as "e.g." or "Mr." (the two negative look-behinds). The pattern is a
# raw string so that \w, \. and \s reach the regex engine without triggering
# invalid-escape warnings.
SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.)\s+')

# Build all (sentence, sector) records in one pass instead of concatenating
# one Series per row; sections whose text is not a string are skipped.
df_new = pd.DataFrame(
    [
        (sentence, sector)
        for sector, section_text in zip(df['sector'], df['text'])
        if isinstance(section_text, str)
        for sentence in SENTENCE_BOUNDARY.split(section_text)
    ],
    columns=['sentence', 'sector'],
)
df_new.head()

Unnamed: 0,sentence,sector
0,Institutional Environment\n\nThere is a genera...,1
1,"Very broadly defined, institutional environmen...",1
2,"Arguably, the private sector has the greatest ...",1
3,Business activities can create jobs and entrep...,1
4,The private sector has a role to play not only...,1


In [18]:
import accessment_model

# Load the SDG targets and give the two columns short names.
GOALS_CSV = 'SDG_Goal.csv'
goals = pd.read_csv(GOALS_CSV)
goals.columns = ['goal', 'text']

# Positional rows 10 through 18; in the displayed output these are the
# targets numbered 3.1 to 3.9.
HEALTH_ROWS = slice(10, 19)
health = goals.iloc[HEALTH_ROWS, :]
health

Unnamed: 0,goal,text
10,3.1,"By 2030, reduce the global maternal mortality ..."
11,3.2,"By 2030, end preventable deaths of newborns an..."
12,3.3,"By 2030, end the epidemics of AIDS, tuberculos..."
13,3.4,"By 2030, reduce by one third premature mortali..."
14,3.5,Strengthen the prevention and treatment of sub...
15,3.6,"By 2020, halve the number of global deaths and..."
16,3.7,"By 2030, ensure universal access to sexual and..."
17,3.8,"Achieve universal health coverage, including f..."
18,3.9,"By 2030, substantially reduce the number of de..."


In [19]:
# Score every sentence against each health target. Each target adds one
# column to df_new, labelled with the target's goal number. The column is
# assigned in a single step rather than cell by cell through .loc, which is
# much slower and yields the same values.
sentences = df_new['sentence']
for goal, target in zip(health['goal'], health['text']):
    df_new[goal] = [accessment_model.mySim(sentence, target) for sentence in sentences]

In [20]:
# Persist the scored sentences; with the default settings the row index is
# written as the first column.
scores_csv = 'test0423_6.csv'
df_new.to_csv(scores_csv)

In [None]:
# Read the sentence table with Spark, using the first line as column names.
# NOTE(review): `spark` is not created in the visible cells; presumably the
# kernel provides the session. Confirm before a fresh run.
sentence_csv = 'bhutan_sentence.csv'
df = spark.read.csv(sentence_csv, header=True)
df.show()

In [41]:
from pyspark.sql.functions import udf
import nltk
from nltk.corpus import wordnet as wn
from itertools import product
from nltk.corpus import wordnet_ic


def get_sim_score(word_1, word_2, info_content):
    """
    Return the highest Jiang-Conrath similarity over all synset pairs of two words.

    Parameters
    ----------
    word_1, word_2 : str
        Words to compare. Identical words short-circuit to 1.0 without
        consulting WordNet.
    info_content
        Information-content table, passed unchanged to ``wn.jcn_similarity``.

    Returns
    -------
    float
        1.0 for identical words; otherwise the maximum score found, or 0.0
        when either word has no synsets or no synset pair could be scored.

    Notes
    -----
    NOTE(review): per the NLTK documentation ``jcn_similarity`` is not bounded
    by 1 (it returns a very large value for identical synsets), so synonyms
    may score far above the 1.0 used for identical words. Confirm that this
    is intended.
    """
    if word_1 == word_2:
        return 1.0

    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)

    max_sim = 0.0
    # product() over an empty list yields nothing, so a word without synsets
    # falls straight through to the 0.0 result.
    for synset_1, synset_2 in product(synsets_1, synsets_2):
        try:
            sim = wn.jcn_similarity(synset_1, synset_2, info_content)
        except Exception:
            # Best effort: pairs WordNet cannot score are skipped. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit stop a long-running loop.
            continue
        if sim is not None and sim > max_sim:
            max_sim = sim

    return max_sim

def f(text):
    """Tokenize ``text`` with NLTK's word tokenizer, using the English models."""
    tokens = nltk.word_tokenize(text, language='english')
    return tokens

# Information-content tables already loaded, keyed by corpus file name.
_INFO_CONTENT_CACHE = {}


def _load_info_content(corpus):
    """Return the information-content table for ``corpus``, loading it at most once."""
    if corpus not in _INFO_CONTENT_CACHE:
        _INFO_CONTENT_CACHE[corpus] = wordnet_ic.ic(corpus)
    return _INFO_CONTENT_CACHE[corpus]


def _content_terms(text, stopwords):
    """Return the noun/adjective tokens of ``text`` followed by their bigrams."""
    # Missing values (None, NaN from pandas, Spark nulls) contribute no terms.
    if not isinstance(text, str):
        return []

    # Keep letters only, lower-cased, then drop stop words.
    cleaned = re.sub('[^a-zA-Z]', ' ', text).lower()
    tokens = [tok for tok in nltk.word_tokenize(cleaned, language='english')
              if tok not in stopwords]

    # Keep tokens tagged as nouns (NN*) or adjectives (JJ*).
    terms = [word for word, tag in nltk.pos_tag(tokens)
             if tag.startswith(('NN', 'JJ'))]
    # Bigrams are built from the filtered tokens and appended as
    # space-joined strings.
    terms.extend([' '.join(pair).strip() for pair in nltk.ngrams(terms, 2)])
    return terms


def mySim(text1, text2 = 'poverty', sigma=0.85, w=0.3, corpus='ic-brown-resnik.dat'):
    """
    Score how closely ``text1`` relates to ``text2``.

    Both texts are reduced to noun/adjective terms plus their bigrams. Every
    term of ``text1`` is paired with every term of ``text2``; a pair's score
    is the sum of ``get_sim_score`` over the tokens of the two terms. The
    result is the mean score of the pairs scoring at least ``sigma``.

    Parameters
    ----------
    text1 : str
        Text to score. Non-string values are treated as having no terms.
    text2 : str
        Reference text.
    sigma : float
        Minimum pair score for a pair to be counted.
    w : float
        Unused; kept so existing positional and keyword callers keep working.
    corpus : str
        Information-content file name given to ``wordnet_ic.ic``. The table
        is loaded once per corpus and reused on later calls.

    Returns
    -------
    float
        Mean score of the qualifying pairs, or 0.0 when no pair qualifies.
    """
    # A set gives constant-time stop-word lookups.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    info_content = _load_info_content(corpus)

    pos_p = _content_terms(text1, stopwords)
    pos_r = _content_terms(text2, stopwords)

    # The same token pair recurs across unigrams and bigrams; score it once.
    token_scores = {}

    count = 0
    total_sim = 0.0
    for w1, w2 in product(pos_p, pos_r):
        sim = 0.0
        for t1, t2 in product(set(w1.split()), set(w2.split())):
            key = (t1, t2)
            if key not in token_scores:
                token_scores[key] = get_sim_score(t1, t2, info_content)
            sim += token_scores[key]

        if sim >= sigma:
            count += 1
            total_sim += sim

    return total_sim / count if count else 0.0

KeyboardInterrupt: 

In [42]:
# Declare the UDF's return type: without one, `udf` produces a string column,
# so the similarity scores would be stored as text. float() guards against an
# integer result (e.g. 0), which a double column would otherwise return as
# null.
udf_f = udf(lambda sentence: float(mySim(sentence)), 'double')

df.withColumn("category", udf_f('sentence')).show(10)

KeyboardInterrupt: 