In [1]:
import pandas as pd
import sys
import unicodedata
import nltk
from SPARQLWrapper import SPARQLWrapper
from SPARQLWrapper import JSON as sqjson
#import hunspell -- installed and in package list but not working/not recognized??
# Temporarily add the directory two levels up to the module search path so that
# ontology_category (expected to live there) can be imported.
sys.path.append("../../")
import ontology_category as oc
# Remove the entry appended above so the search path is left as it was found.
sys.path.pop()
# Module-level handle used by later cells through svo.is_cat(word, category, out='short').
svo = oc.init_svo()

In [6]:
def my_tokenizer(s):
    """Split `s` on '/' and turn every underscore in the pieces into a space.

    Example: 'Yes/No/Don_t_Know' -> ['Yes', 'No', 'Don t Know'].
    An empty string yields [''].
    """
    return [piece.replace('_', ' ') for piece in s.split('/')]

In [7]:
# Quick check: '/' separates the pieces; the apostrophe and the inner space are kept.
my_tokenizer('Yes/No/Don\'t Know')

['Yes', 'No', "Don't Know"]

In [7]:
# look up term in ontology, return its class if exact match found
def _sparql_exact_regex(term):
    """Return the body of a SPARQL string literal holding the regex ``^term$``.

    Two layers of escaping are applied so that `term` is always matched
    literally and can never break out of the quoted literal:
    1. regex metacharacters in `term` are backslash-escaped;
    2. backslashes, double quotes and line breaks are escaped for the
       SPARQL "..." string literal itself.
    """
    regex_special = '\\.?*+()[]{}^$|'
    literal_term = ''.join('\\' + ch if ch in regex_special else ch for ch in term)
    pattern = '^' + literal_term + '$'
    # Backslashes must be doubled first, otherwise the escapes added below
    # would themselves be doubled.
    return (pattern.replace('\\', '\\\\')
                   .replace('"', '\\"')
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))


def search_ontology_for_class(term):
    """Return the class names of ontology entities whose rdfs:label equals `term`.

    Parameters
    ----------
    term : str
        Label to look up. It is matched literally (anchored with ^ and $).

    Returns
    -------
    list of str
        Distinct class names in the order the endpoint returned them. A name
        is the part of the class IRI after '#'; for an IRI without '#', the
        last '/'-separated segment is used instead. Empty when nothing matches.

    Notes
    -----
    Performs one HTTP query against http://sparql.geoscienceontology.org per
    call; errors raised by SPARQLWrapper are propagated unchanged.
    """
    sparql = SPARQLWrapper("http://sparql.geoscienceontology.org")
    sparql.setQuery("""
                    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

                    SELECT ?entity ?class
                    WHERE {{ ?entity a ?class .
                           ?entity rdfs:label ?label .
                           FILTER regex(?label,"{}") .}}
                    """.format(_sparql_exact_regex(term)))
    sparql.setReturnFormat(sqjson)
    results = sparql.query().convert()

    class_names = []
    for binding in results["results"]["bindings"]:
        iri = binding["class"]["value"]
        parts = iri.split('#')
        # Slash-style IRIs have no fragment; fall back to the last path segment
        # instead of failing with IndexError.
        name = parts[1] if len(parts) > 1 else iri.rsplit('/', 1)[-1]
        if name not in class_names:
            class_names.append(name)

    return class_names

In [11]:
# Example lookup: list the ontology classes attached to the label 'ocean'.
search_ontology_for_class('ocean')

['SpatialPhenomenon', 'Phenomenon', 'NamedIndividual', 'Body']

In [21]:
def is_noun(pos):
    """Return True when the part-of-speech tag `pos` begins with 'NN'."""
    return pos[:2] == 'NN'


def is_verb(pos):
    """Return True when the part-of-speech tag `pos` begins with 'VB'."""
    return pos[:2] == 'VB'

In [24]:
def get_nouns(phrase):
    """Return, in order of appearance, the tokens of `phrase` tagged as nouns.

    The phrase is tokenized with nltk.word_tokenize, tagged with nltk.pos_tag,
    and a token is kept when is_noun accepts its tag.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(phrase))
    noun_tokens = []
    for token, tag in tagged_tokens:
        if is_noun(tag):
            noun_tokens.append(token)
    return noun_tokens

def get_verbs(phrase):
    """Return, in order of appearance, the tokens of `phrase` tagged as verbs.

    The phrase is tokenized with nltk.word_tokenize, tagged with nltk.pos_tag,
    and a token is kept when is_verb accepts its tag.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(phrase))
    verb_tokens = []
    for token, tag in tagged_tokens:
        if is_verb(tag):
            verb_tokens.append(token)
    return verb_tokens

In [57]:
def _noun_columns(word):
    """Return the output columns under which the noun `word` should be listed.

    The ontology is consulted first (search_ontology_for_class); only when it
    returns no class at all are the svo categories used as a fallback. Each
    column appears at most once in the result.
    """
    ontology_columns = (
        ('Phenomenon', 'phenomenon nouns'),
        ('Property', 'property nouns'),
        ('Process', 'process nouns'),
    )
    # 'property'/'quantity' share one column, as do 'state'/'attribute'.
    svo_columns = (
        ('phenomenon', 'phenomenon nouns'),
        ('process', 'process nouns'),
        ('property', 'property nouns'),
        ('quantity', 'property nouns'),
        ('state', 'state attribute or value nouns'),
        ('attribute', 'state attribute or value nouns'),
    )
    word_classes = search_ontology_for_class(word)
    if word_classes:
        return [column for class_name, column in ontology_columns
                if class_name in word_classes]
    hits = []
    for cat, column in svo_columns:
        # `column not in hits` keeps a word that matches both categories of a
        # shared column (e.g. state AND attribute) from being listed twice.
        if svo.is_cat(word, cat, out='short') and column not in hits:
            hits.append(column)
    return hits


def find_categories(var):
    """Classify the nouns of each variable phrase into candidate categories.

    Parameters
    ----------
    var : iterable of str
        Raw variable strings. Each one is split with my_tokenizer; only the
        first piece is analysed, after dropping everything up to the last
        ': ' and lower-casing.

    Returns
    -------
    pandas.DataFrame
        One row per input string (default integer index) with the columns
        'variable name phrase', 'property nouns', 'process nouns',
        'phenomenon nouns' and 'state attribute or value nouns'. The noun
        columns hold comma-terminated lists such as 'time,unit,' ('' when
        no noun was assigned).
    """
    columns = ['variable name phrase', 'property nouns', 'process nouns',
               'phenomenon nouns', 'state attribute or value nouns']
    # Per-call memo: every distinct noun costs one remote ontology query, and
    # survey phrases repeat the same nouns many times.
    columns_by_word = {}
    records = []
    for var_string in var:
        var_name = my_tokenizer(var_string)
        # analyze first word in sequence:
        variable = var_name[0].split(': ')[-1].lower()
        record = dict.fromkeys(columns, '')
        record['variable name phrase'] = variable
        for word in get_nouns(variable):
            if word not in columns_by_word:
                columns_by_word[word] = _noun_columns(word)
            for column in columns_by_word[word]:
                record[column] += word + ','
        records.append(record)
    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(records, columns=columns)

In [44]:
#simple categorizer:
def _split_terms(cell):
    """Parse a comma-terminated cell such as 'time,unit,' into ['time', 'unit'].

    Empty tokens (including the one after the trailing comma) are dropped;
    a non-string cell (e.g. NaN) is treated as holding no terms.
    """
    if not isinstance(cell, str):
        return []
    return [term for term in cell.split(',') if term]


def _select_terms(found, competing, var_words):
    """Pick the term(s) of one category (property or process) for a phrase.

    found     -- candidate terms of this category
    competing -- term lists of the categories that are resolved later
    var_words -- whitespace-split words of the variable phrase
    Returns a space-joined string, '' when nothing is selected.
    """
    if len(found) <= 1:
        # none, or exactly one candidate: take it as is
        return found[0] if found else ''
    # several candidates: prefer those not claimed by any other category
    exclusive = [term for term in found
                 if not any(term in other for other in competing)]
    if exclusive:
        return ' '.join(exclusive)
    if not var_words:
        return ''
    if 'of' in var_words:
        # "<term> of ...": keep candidates that directly precede an 'of'
        heads = [var_words[i - 1] for i, word in enumerate(var_words)
                 if word == 'of' and i > 0 and var_words[i - 1] in found]
        return ' '.join(heads)
    if var_words[-1] in found:
        return var_words[-1]
    return var_words[0]


def _discard_terms(chosen, *term_lists):
    """Remove each word of `chosen` (once per word) from every list that has it."""
    for term in chosen.split():
        for terms in term_lists:
            if term in terms:
                terms.remove(term)


def simple_categorizer(variable_classification):
    """Suggest one phenomenon/property/process/attribute split per variable.

    Reads, for every row, the comma-terminated noun lists in 'property nouns',
    'process nouns', 'phenomenon nouns' and 'state attribute or value nouns'
    plus the phrase in 'variable name phrase'. Categories are resolved in the
    order property -> process -> phenomenon -> attribute; a word chosen for an
    earlier category is removed from the later ones.

    The frame is modified in place (returns None). Columns written:
    'suggested phenomenon', 'suggested property', 'suggested process',
    'suggested attribute or value' and 'possible name', the latter built as
    '<phenomenon>__<process>_<property>' with inner words joined by '_'.
    """
    for index in variable_classification.index:
        procs_found = _split_terms(variable_classification.loc[index, 'process nouns'])
        props_found = _split_terms(variable_classification.loc[index, 'property nouns'])
        phens_found = _split_terms(variable_classification.loc[index, 'phenomenon nouns'])
        attrs_found = _split_terms(variable_classification.loc[index, 'state attribute or value nouns'])
        variable = variable_classification.loc[index, 'variable name phrase']
        var_words = variable.split() if isinstance(variable, str) else []

        prop = _select_terms(props_found, (procs_found, phens_found, attrs_found), var_words)
        # remove property from other entries:
        _discard_terms(prop, procs_found, phens_found, attrs_found)

        proc = _select_terms(procs_found, (phens_found, attrs_found), var_words)
        # remove process from other entries:
        _discard_terms(proc, phens_found, attrs_found)

        if len(phens_found) <= 1:
            phen = phens_found[0] if phens_found else ''
        else:
            # prefer phenomena that are not also attributes; otherwise keep all
            exclusive = [p for p in phens_found if p not in attrs_found]
            phen = ' '.join(exclusive or phens_found)
        # remove phenomena from other entries:
        _discard_terms(phen, attrs_found)

        # whatever is left is reported as attribute/value
        attr = ' '.join(attrs_found)

        variable_classification.loc[index, 'suggested phenomenon'] = phen
        variable_classification.loc[index, 'suggested property'] = prop
        variable_classification.loc[index, 'suggested process'] = proc
        variable_classification.loc[index, 'suggested attribute or value'] = attr
        possible_name = ('_').join(phen.split()) + '__' + ('_').join(proc.split()) + '_' + ('_').join(prop.split())
        variable_classification.loc[index, 'possible name'] = possible_name

In [62]:
# Variable information from 4 different files/sources, read from CSV files in
# the working directory (index_col=False: no column is used as the row index).
lsms, mics, ipums, dhs2 = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ('LSMS.csv', 'MICS.csv', 'IPUMS.csv', 'DHS2.csv')
)

In [53]:
def parse_qualitative_variables(var, output_file):
    """Classify the 'Question' phrases of `var` and save the result as CSV.

    Parameters
    ----------
    var : table-like indexed by column name
        Must provide a 'Question' column; missing entries are dropped.
    output_file : path or buffer
        Passed straight to DataFrame.to_csv.

    Returns
    -------
    The classification table produced by find_categories, after
    simple_categorizer has processed it.
    """
    questions = var['Question'].dropna().tolist()
    classification = find_categories(questions)
    # simple_categorizer's return value is not used; it is relied on to update
    # the table it is given.
    simple_categorizer(classification)
    classification.to_csv(output_file)
    return classification

In [56]:
# Classify the LSMS question phrases and write the breakdown to a CSV file.
lsms_classification = parse_qualitative_variables(lsms, 'breakdown_of_lsms_qualitative_variables.csv')

Var name is:  sex
Var name is:  yes
Var name is:  yes
Var name is:  frequency
Var name is:  satisfaction
Var name is:  distance
Var name is:  time
Var name is:  time unit
Var name is:  house roster
Var name is:  household roster
Var name is:  relationship to head
Var name is:  marital status
Var name is:  children of household members who are living elsewhere and not members of the household 
Var name is:  place of residence
Var name is:  education
Var name is:  short module (education)
Var name is:  type of school
Var name is:  general education 
Var name is:  literacy
Var name is:  numeracy
Var name is:  school
Var name is:  type of diploma attained
Var name is:  major field of study in post-secondary education
Var name is:  successfully completed school
Var name is:  public
Var name is:  education in last 12 months 
Var name is:  who paid educational expenses
Var name is:  mode of transportation
Var name is:  expanded module, additional questions for part b 
Var name is:  has a comp

Var name is:  quality
Var name is:  place members go to relieve themselves
Var name is:  fuel sources and uses
Var name is:  souce of fuel
Var name is:  primary
Var name is:  detail on fuel by source
Var name is:  fuel purpose
Var name is:  members who collects fuel source
Var name is:  proportions
Var name is:  source of dung
Var name is:  households’ willingness to pay for improved water service - an example of a contingent valuation scenario (urban) 
Var name is:  vote for the new water supply project 
Var name is:  would want to be connected to the new water supply system 
Var name is:  households’ willingness to pay for improved water service - an example of a contingent valuation scenario (rural - public taps only)
Var name is:  would pay monthly fee and use public taps
Var name is:  would buy water from public taps
Var name is:  preferred payment system for public taps 
Var name is:  households’ willingness to pay for improved sanitation - an example of a contingent valuation sc

Unnamed: 0,variable name phrase,property nouns,process nouns,phenomenon nouns,state attribute or value nouns,suggested phenomenon,suggested property,suggested process,suggested attribute or value,possible name
0,sex,"sex,","sex,",,"sex,",,sex,,,___sex
1,yes,,,,,,,,,___
2,yes,,,,,,,,,___
3,frequency,"frequency,",,,,,frequency,,,___frequency
4,satisfaction,"satisfaction,","satisfaction,",,"satisfaction,",,satisfaction,,,___satisfaction
5,distance,"distance,",,,,,distance,,,___distance
6,time,"time,",,,,,time,,,___time
7,time unit,"time,",,"unit,",,unit,time,,,unit___time
8,house roster,,"house,","house,",,,,house,,__house_
9,household roster,,,,,,,,,___


In [63]:
# Classify the MICS question phrases and write the breakdown to a CSV file.
mics_classification = parse_qualitative_variables(mics, 'breakdown_of_mics_qualitative_variables.csv')

In [64]:
# Classify the IPUMS question phrases and write the breakdown to a CSV file.
ipums_classification = parse_qualitative_variables(ipums, 'breakdown_of_ipums_qualitative_variables.csv')

In [60]:
# Classify the DHS2 question phrases and write the breakdown to a CSV file.
dhs2_classification = parse_qualitative_variables(dhs2, 'breakdown_of_dhs2_qualitative_variables.csv')

In [5]:
# Spot check of the svo fallback: is 'location' reported under the 'process' category?
svo.is_cat('location','process',out='short')

True