In [1]:
import pandas as pd
import sys
import unicodedata
import nltk
from SPARQLWrapper import SPARQLWrapper
from SPARQLWrapper import JSON as sqjson
#import hunspell -- installed and in package list but not working/not recognized??
# Temporarily put the directory two levels up on the import path so that
# ontology_category can be imported, then drop that entry again.
sys.path.append("../../")
import ontology_category as oc
sys.path.pop()
# Object whose is_cat() method is used by find_categories below.
svo = oc.init_svo()

In [2]:
def my_tokenizer(s):
    """Split a '/'-separated string into its parts.

    Underscores are replaced by spaces first, then the text is cut at
    every '/'. The pieces are returned as a list without further
    stripping or case changes.
    """
    with_spaces = s.replace('_', ' ')
    return with_spaces.split('/')

In [3]:
# Quick check: a '/'-separated list of answer options is split into a list.
my_tokenizer('Yes/No/Don\'t Know')

['Yes', 'No', "Don't Know"]

In [4]:
# look up term in ontology, return its class if exact match found
def search_ontology_for_class(term):
    """Return the class names of ontology entities labelled exactly ``term``.

    The SPARQL endpoint at http://sparql.geoscienceontology.org is queried
    for every entity whose ``rdfs:label`` equals ``term`` (case-sensitive),
    together with the classes that entity is an instance of.

    Parameters
    ----------
    term : str
        Label to look up.

    Returns
    -------
    list of str
        Distinct class names in the order they were first returned. A class
        name is the fragment after '#' in the class IRI; when the IRI has no
        fragment the whole IRI is used. The list is empty when nothing
        matches.
    """
    # The term is embedded in a SPARQL string literal, so escape the
    # characters that would otherwise terminate or corrupt that literal.
    literal = (term.replace('\\', '\\\\')
                   .replace('"', '\\"')
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))

    sparql = SPARQLWrapper("http://sparql.geoscienceontology.org")
    # Compare the label's lexical form directly instead of building a
    # "^term$" regex: same exact-match semantics, but characters such as
    # '(', '+' or '.' in the term are no longer read as regex syntax.
    sparql.setQuery("""
                    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

                    SELECT ?entity ?class
                    WHERE {{ ?entity a ?class .
                           ?entity rdfs:label ?label .
                           FILTER (STR(?label) = "{}") .}}
                    """.format(literal))
    sparql.setReturnFormat(sqjson)
    results = sparql.query().convert()

    data = []
    for result in results["results"]["bindings"]:
        class_iri = result["class"]["value"]
        # Keep only the fragment; fall back to the full IRI when there is
        # no '#' instead of failing with an IndexError.
        _, separator, fragment = class_iri.partition('#')
        c = fragment if separator else class_iri
        if c not in data:
            data.append(c)

    return data

In [5]:
# Quick check of the ontology lookup with a term that has an exact label match.
search_ontology_for_class('ocean')

['SpatialPhenomenon', 'Phenomenon', 'NamedIndividual', 'Body']

In [6]:
def is_noun(pos):
    """Return True when the part-of-speech tag begins with 'NN'."""
    return pos.startswith('NN')


def is_verb(pos):
    """Return True when the part-of-speech tag begins with 'VB'."""
    return pos.startswith('VB')

In [7]:
def get_nouns(phrase):
    """Return the words of ``phrase`` whose POS tag is a noun tag.

    The phrase is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; words are kept, in order, when ``is_noun`` accepts
    their tag.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(phrase))
    return [token for token, tag in tagged_tokens if is_noun(tag)]

def get_verbs(phrase):
    """Return the words of ``phrase`` whose POS tag is a verb tag.

    The phrase is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; words are kept, in order, when ``is_verb`` accepts
    their tag.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(phrase))
    return [token for token, tag in tagged_tokens if is_verb(tag)]

In [16]:
def find_categories(var):
    """Sort the nouns of each variable phrase into category columns.

    For every string in ``var`` the text is split with ``my_tokenizer`` and
    only the first '/'-separated part is analysed: anything up to the last
    ': ' is dropped and the remainder is lower-cased. Each noun of that
    phrase is looked up with ``search_ontology_for_class``; when the
    ontology has no match, ``svo.is_cat`` is consulted for each known
    category instead. Verbs are extracted and printed but not classified.

    Parameters
    ----------
    var : iterable of str
        Variable/question phrases to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per input phrase with the columns 'variable name phrase',
        'property nouns', 'process nouns', 'phenomenon nouns' and
        'state attribute or value nouns'. The noun columns hold the matching
        words, each followed by a comma ('' when there are none).
    """
    categories = ['phenomenon','process','property','quantity','state','attribute']
    columns = ['variable name phrase','property nouns','process nouns','phenomenon nouns','state attribute or value nouns']
    # Collect plain dict records and build the frame once at the end instead
    # of enlarging a DataFrame (and re-filling NaNs) on every iteration.
    rows = []
    for var_string in var:
        var_name = my_tokenizer(var_string)
        print(var_name)
        #analyze first word in sequence:
        variable = var_name[0].split(': ')[-1].lower()
        print('Var name is: ', variable)
        var_nouns = get_nouns(variable)
        print('Nouns are: ',var_nouns)
        var_verbs = get_verbs(variable)
        print('Verbs are: ',var_verbs)
        row = dict.fromkeys(columns, '')
        row['variable name phrase'] = variable
        for word in var_nouns:
            word_classes = search_ontology_for_class(word)
            if word_classes:
                # Exact label match in the ontology: trust its classes.
                if 'Phenomenon' in word_classes:
                    row['phenomenon nouns'] += word + ','
                if 'Property' in word_classes:
                    row['property nouns'] += word + ','
                if 'Process' in word_classes:
                    row['process nouns'] += word + ','
            else:
                # Not in the ontology: fall back to the svo category checks.
                # The flags make sure a word lands at most once in a column
                # that is shared by two categories.
                prop = False
                state = False
                for cat in categories:
                    if not svo.is_cat(word,cat,out='short'):
                        continue
                    if cat == 'phenomenon':
                        row['phenomenon nouns'] += word + ','
                    elif cat == 'process':
                        row['process nouns'] += word + ','
                    elif cat in ('property', 'quantity') and not prop:
                        # 'and' binds tighter than 'or', so the membership
                        # test keeps the "not prop" guard on both categories.
                        prop = True
                        row['property nouns'] += word + ','
                    elif cat in ('attribute', 'state') and not state:
                        state = True
                        row['state attribute or value nouns'] += word + ','
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)

In [9]:
#simple categorizer:
def _split_terms(cell):
    """Return the non-empty, stripped terms of a comma-separated cell."""
    if not isinstance(cell, str):
        # Missing value (None/NaN): treat as "no terms".
        return []
    return [term.strip() for term in cell.split(',') if term.strip()]


def _select_terms(found, other_lists, var_words, use_phrase_position=True):
    """Pick the terms of ``found`` that best represent one category.

    Terms that appear in none of ``other_lists`` win. If there are none,
    either the phrase position decides (word before each 'of', else the
    last word if it is in ``found``, else the first word of the phrase) or,
    when ``use_phrase_position`` is False, all of ``found`` is used.
    """
    if not found:
        return ''
    exclusive = [term for term in found
                 if not any(term in other for other in other_lists)]
    if exclusive:
        return ' '.join(exclusive)
    if not use_phrase_position:
        return ' '.join(found)
    if 'of' in var_words:
        heads = [var_words[i - 1] for i, word in enumerate(var_words)
                 if word == 'of' and i > 0 and var_words[i - 1] in found]
        return ' '.join(heads)
    if not var_words:
        # Empty phrase: nothing to fall back on.
        return ''
    if var_words[-1] in found:
        return var_words[-1]
    return var_words[0]


def _discard_terms(chosen, term_lists):
    """Remove each word of ``chosen`` once from every list that contains it."""
    for term in chosen.split():
        for terms in term_lists:
            if term in terms:
                terms.remove(term)


def simple_categorizer(variable_classification):
    """Suggest one property, process, phenomenon and attribute per row.

    Reads the comma-separated columns 'property nouns', 'process nouns',
    'phenomenon nouns' and 'state attribute or value nouns' plus
    'variable name phrase', and writes the columns 'suggested phenomenon',
    'suggested property', 'suggested process',
    'suggested attribute or value' and 'possible name' into the same frame.

    Categories are resolved in the order property, process, phenomenon,
    attribute; a word chosen for an earlier category is removed from the
    candidates of the later ones.

    Parameters
    ----------
    variable_classification : pandas.DataFrame
        Frame with the columns listed above; it is modified in place.

    Returns
    -------
    None
    """
    for index in variable_classification.index:
        # Empty entries (from '' cells and trailing commas) are dropped so
        # list lengths reflect the real number of candidate words and no
        # stray whitespace ends up in the suggestions.
        procs_found = _split_terms(variable_classification.loc[index,'process nouns'])
        props_found = _split_terms(variable_classification.loc[index,'property nouns'])
        phens_found = _split_terms(variable_classification.loc[index,'phenomenon nouns'])
        attrs_found = _split_terms(variable_classification.loc[index,'state attribute or value nouns'])
        variable = variable_classification.loc[index,'variable name phrase']
        var_words = variable.split()

        prop = _select_terms(props_found, [procs_found, phens_found, attrs_found], var_words)
        # remove property from other entries:
        _discard_terms(prop, [procs_found, phens_found, attrs_found])

        proc = _select_terms(procs_found, [phens_found, attrs_found], var_words)
        # remove process from other entries:
        _discard_terms(proc, [phens_found, attrs_found])

        # Phenomena have no phrase-position heuristic: if every candidate is
        # also an attribute, all candidates are kept.
        phen = _select_terms(phens_found, [attrs_found], var_words,
                             use_phrase_position=False)
        # remove phenomena from other entries:
        _discard_terms(phen, [attrs_found])

        # Whatever is left over is reported as attribute/value.
        attr = ' '.join(attrs_found)

        variable_classification.loc[index,'suggested phenomenon'] = phen
        variable_classification.loc[index,'suggested property'] = prop
        variable_classification.loc[index,'suggested process'] = proc
        variable_classification.loc[index,'suggested attribute or value'] = attr
        possible_name = ('_').join(phen.split())+'__'+('_').join(proc.split())+'_'+('_').join(prop.split())
        variable_classification.loc[index,'possible name'] = possible_name

In [10]:
# Load the variable tables of the four survey sources; index_col=False keeps
# pandas from using the first column as the index.
lsms, mics, ipums, dhs2 = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ('LSMS.csv', 'MICS.csv', 'IPUMS.csv', 'DHS2.csv')
)

In [11]:
def parse_qualitative_variables(var, output_file):
    """Classify the 'Question' phrases of ``var`` and save the result as CSV.

    Missing questions are dropped, the remaining phrases are passed through
    ``find_categories`` and ``simple_categorizer``, and the resulting frame
    is written to ``output_file`` and returned.
    """
    questions = var['Question'].dropna().tolist()
    classification = find_categories(questions)
    # simple_categorizer adds its suggestion columns to the frame in place.
    simple_categorizer(classification)
    classification.to_csv(output_file)
    return classification

In [17]:
# Classify the LSMS questions and write the breakdown to CSV.
lsms_classification = parse_qualitative_variables(lsms, 'breakdown_of_lsms_qualitative_variables.csv')

['Sex']
Var name is:  sex
Nouns are:  ['sex']
Verbs are:  []
['Yes', 'No']
Var name is:  yes
Nouns are:  ['yes']
Verbs are:  []
['Yes', 'No', "Don't know"]
Var name is:  yes
Nouns are:  ['yes']
Verbs are:  []
['Frequency']
Var name is:  frequency
Nouns are:  ['frequency']
Verbs are:  []
['Satisfaction']
Var name is:  satisfaction
Nouns are:  ['satisfaction']
Verbs are:  []
['Distance']
Var name is:  distance
Nouns are:  ['distance']
Verbs are:  []
['Time']
Var name is:  time
Nouns are:  ['time']
Verbs are:  []
['Time unit']
Var name is:  time unit
Nouns are:  ['time', 'unit']
Verbs are:  []
['HOUSE ROSTER']
Var name is:  house roster
Nouns are:  ['house', 'roster']
Verbs are:  []
['Part A: Household Roster']
Var name is:  household roster
Nouns are:  ['household', 'roster']
Verbs are:  []
['Relationship to head']
Var name is:  relationship to head
Nouns are:  ['relationship']
Verbs are:  ['head']
['Marital Status']
Var name is:  marital status
Nouns are:  ['status']
Verbs are:  []
['Pa

['Type of hospital room ']
Var name is:  type of hospital room 
Nouns are:  ['type', 'hospital', 'room']
Verbs are:  []
['Treatment received']
Var name is:  treatment received
Nouns are:  ['treatment']
Verbs are:  ['received']
['Health care facility']
Var name is:  health care facility
Nouns are:  ['health', 'care', 'facility']
Verbs are:  []
['Employment']
Var name is:  employment
Nouns are:  ['employment']
Verbs are:  []
['Part A: Labor Force Participation ']
Var name is:  labor force participation 
Nouns are:  ['labor', 'force', 'participation']
Verbs are:  []
['All yes', 'All No']
Var name is:  all yes
Nouns are:  ['yes']
Verbs are:  []
['Main reason for not working']
Var name is:  main reason for not working
Nouns are:  ['reason']
Verbs are:  ['working']
['Main reason for not looking for work']
Var name is:  main reason for not looking for work
Nouns are:  ['reason', 'work']
Verbs are:  ['looking']
['Standard module, Part A: Labor Force Participation ']
Var name is:  labor force p

['Time unit']
Var name is:  time unit
Nouns are:  ['time', 'unit']
Verbs are:  []
['From whom do you rent this dwelling']
Var name is:  from whom do you rent this dwelling
Nouns are:  ['dwelling']
Verbs are:  ['do', 'rent']
['Rent inclusion']
Var name is:  rent inclusion
Nouns are:  ['rent', 'inclusion']
Verbs are:  []
['Monthly payments']
Var name is:  monthly payments
Nouns are:  ['payments']
Verbs are:  []
['Non-household member pay (part of) rent']
Var name is:  non-household member pay (part of) rent
Nouns are:  ['member', 'pay', 'part', 'rent']
Verbs are:  []
['Who pay(part of) rent']
Var name is:  who pay(part of) rent
Nouns are:  ['part', 'rent']
Verbs are:  ['pay']
['Part E: Planned Moves and Upgrades']
Var name is:  planned moves and upgrades
Nouns are:  ['moves', 'upgrades']
Verbs are:  ['planned']
['Plan to move within 12 months']
Var name is:  plan to move within 12 months
Nouns are:  ['plan', 'months']
Verbs are:  ['move']
['Motivation to move to new dwelling']
Var name i

['Water charges']
Var name is:  water charges
Nouns are:  ['water', 'charges']
Verbs are:  []
['Complaints on piped water system']
Var name is:  complaints on piped water system
Nouns are:  ['complaints', 'water', 'system']
Verbs are:  []
['Method to collect water from well']
Var name is:  method to collect water from well
Nouns are:  ['method', 'water']
Verbs are:  ['collect']
['Tye of rainwater collection system used']
Var name is:  tye of rainwater collection system used
Nouns are:  ['tye', 'rainwater', 'collection', 'system']
Verbs are:  ['used']
['Module 5: Household Sanitation - Attitudes and Practices']
Var name is:  household sanitation - attitudes and practices
Nouns are:  ['household', 'sanitation', 'attitudes', 'practices']
Verbs are:  []
['Quality']
Var name is:  quality
Nouns are:  ['quality']
Verbs are:  []
['Place where toilet waste is discharged']
Var name is:  place where toilet waste is discharged
Nouns are:  ['place', 'toilet', 'waste']
Verbs are:  ['is', 'discharged

['Method used to find the first wage work ']
Var name is:  method used to find the first wage work 
Nouns are:  ['method', 'wage', 'work']
Verbs are:  ['used', 'find']
['Yes', 'No', 'Blank']
Var name is:  yes
Nouns are:  ['yes']
Verbs are:  []
['Filter for been living here for more than 5 years ']
Var name is:  filter for been living here for more than 5 years 
Nouns are:  ['filter', 'years']
Verbs are:  ['been', 'living']
['Relationship to person living in the same dwelling']
Var name is:  relationship to person living in the same dwelling
Nouns are:  ['relationship', 'person', 'dwelling']
Verbs are:  ['living']
['Main activity']
Var name is:  main activity
Nouns are:  ['activity']
Verbs are:  []
['Lives in Country', ' Lives abroad']
Var name is:  lives in country
Nouns are:  ['lives', 'country']
Verbs are:  []
['Major field of study in post secondary education ']
Var name is:  major field of study in post secondary education 
Nouns are:  ['field', 'study', 'post', 'education']
Verbs 

In [63]:
# Classify the MICS questions and write the breakdown to CSV.
mics_classification = parse_qualitative_variables(mics, 'breakdown_of_mics_qualitative_variables.csv')

In [64]:
# Classify the IPUMS questions and write the breakdown to CSV.
ipums_classification = parse_qualitative_variables(ipums, 'breakdown_of_ipums_qualitative_variables.csv')

In [60]:
# Classify the DHS2 questions and write the breakdown to CSV.
dhs2_classification = parse_qualitative_variables(dhs2, 'breakdown_of_dhs2_qualitative_variables.csv')

In [14]:
# Spot check: ask the svo helper whether 'cooking' falls in the 'process' category.
svo.is_cat('cooking','process',out='short')

True

In [15]:
# Spot check: look up the ontology classes recorded for the label 'cooking'.
search_ontology_for_class('cooking')

['Process', 'NamedIndividual']