Skip to content

Commit

Permalink
Add a demo function for the WALS module. Patch by Michael Wayne Goodman.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexrudnick committed Nov 2, 2011
1 parent 3e9d1fa commit 09eeb91
Showing 1 changed file with 43 additions and 1 deletion.
44 changes: 43 additions & 1 deletion nltk_contrib/wals.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def open_csv(filename, remove_header=True):
def map_fields(vectors, fields):
    """Lazily turn each row of *vectors* into a dict keyed by *fields*.

    Field names and row cells are paired positionally with ``zip``, so
    if a row and *fields* differ in length the surplus items on the
    longer side are dropped.

    :param vectors: iterable of rows (each row an iterable of cells)
    :param fields: sequence of field names, one per column
    :return: a generator yielding one dict per row
    """
    for vector in vectors:
        yield dict(zip(fields, vector))

# Features
self.features = dict((f['id'], f) for f in
map_fields(open_csv('features'),
Expand All @@ -98,6 +99,10 @@ def map_fields(vectors, fields):
self.languages = dict((l['wals_code'], l) for l in
map_fields(open_csv('languages'),
language_fields))
# convert longitude and latitude to float from string
for l in self.languages.values():
l['latitude'] = float(l['latitude'])
l['longitude'] = float(l['longitude'])
# The datapoints file is more complicated. There is a column for
# every feature, and a row for every language. Each cell is either
# empty or contains a value dependent on the feature.
Expand Down Expand Up @@ -200,11 +205,48 @@ def get_languages_with_feature(self, feature, value=None, superclass=None):
subfamily, and genus.
"""

value = str(value) # In case it's an int instead of a string
if value: value = str(value) # be robust to int values
supermatch = lambda x: superclass in (x['genus'],
x['subfamily'],
x['family'])
valmatch = lambda x: self.data[l['wals_code']].get(feature) == value
return [l for l in self.feat_lg_map[feature]
if (not value or valmatch(l)) and \
(not superclass or supermatch(l))]

def demo(wals_directory=None, dialect='excel-tab', encoding='utf-8'):
    """Print a short tour of a WALS database.

    Builds a ``WALS`` object from the given arguments and prints, in
    order: counts of languages, features and values; how many languages
    specify feature 81A and how many of those have value '4'; the
    languages named 'Irish'; the languages with ISO code 'isl'; and the
    name, latitude and longitude of the languages with ISO code 'nan'.

    :param wals_directory: location of the WALS data. If it is None or
        otherwise falsy, an error message is written to stderr and the
        function returns without loading anything.
    :param dialect: forwarded unchanged to the ``WALS`` constructor
    :param encoding: forwarded unchanged to the ``WALS`` constructor
    :return: None
    """
    if not wals_directory:
        import sys
        # sys.stderr.write behaves the same on Python 2 and 3, unlike
        # the ``print >>sys.stderr`` statement form.
        sys.stderr.write('Error: No WALS data directory provided.\n')
        sys.stderr.write(' You may obtain the database from ' +
                         'http://wals.info/export' + '\n')
        return
    w = WALS(wals_directory, dialect, encoding)

    # Basic statistics
    # NOTE: every print below takes a single parenthesized expression, so
    # it is valid both as a Python 2 print statement and as a print() call.
    print('In database:\n %d\tlanguages\n %d\tfeatures ' %
          (len(w.languages), len(w.features)))
    # values are a nested dictionary (w.values[feature_id][value_id])
    num_vals = sum(map(len, w.values.values()))
    print(' %d\ttotal values (%f avg. number per feature)' %
          (num_vals, float(num_vals) / len(w.features)))
    # More statistics
    print(" %d languages specify feature 81A (order of S, O, and V)" %
          (len(w.get_languages_with_feature('81A'))))
    print(" %d languages have VOS order" %
          (len(w.get_languages_with_feature('81A', value='4'))))

    # Getting language data
    print("\nGetting data for languages named 'Irish'")
    for wals_code in w.get_wals_codes_from_name('Irish'):
        lang = w.languages[wals_code]
        print(' %s (ISO-639 code: %s WALS code: %s)' %
              (lang['name'], lang['iso_codes'], wals_code))
    print("\nGetting data for languages with ISO 'isl'")
    for wals_code in w.get_wals_codes_from_iso('isl'):
        w.show_language(wals_code)
    print("\nLocations of dialects for the Min Nan language (ISO 'nan'):")
    for wals_code in w.get_wals_codes_from_iso('nan'):
        lang = w.languages[wals_code]
        # latitude/longitude are formatted with %f, so they must be numeric
        print(" %s\tlat:%f\tlong:%f" %
              (lang['name'], lang['latitude'], lang['longitude']))

0 comments on commit 09eeb91

Please sign in to comment.