Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Branch: master
Fetching contributors…

Cannot retrieve contributors at this time

310 lines (255 sloc) 9.752 kB
#!/opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/python
# -*- coding: utf-8 -*-
import sys, os, re, csv
from itertools import tee, islice, chain, izip
#-----------------------------------------------------------------------------
# DATA STRUCTURES
#-----------------------------------------------------------------------------
class autoViv(dict):
    """Dictionary that builds nested levels on first access.

    Looking up a missing key stores a new, empty instance of the same
    class under that key and returns it, mimicking Perl's
    autovivification:

        d = autoViv()
        d['a']['b'] = 1    # d == {'a': {'b': 1}}
    """

    def __missing__(self, key):
        # dict.__getitem__ invokes this hook only when ``key`` is absent.
        child = type(self)()
        self[key] = child
        return child
class KeyList(dict):
    """Nested dictionary addressed by dotted paths.

    Intermediate containers are created implicitly as KeyList instances.

    >>> a = KeyList()
    >>> a.keylset("a.b.c", 2)
    >>> print(a)
    {'a': {'b': {'c': 2}}}
    """

    def keylset(self, path, value):
        """Store ``value`` at the location named by the dotted ``path``.

        Every path element except the last names a nested mapping; any
        level whose lookup raises KeyError is created as an empty KeyList.
        The final element is the key that receives ``value``.
        """
        keys = path.split('.')
        leaf = keys.pop()
        node = self
        for key in keys:
            try:
                child = node[key]
            except KeyError:
                # Level does not exist yet: create it and descend into it.
                child = node[key] = KeyList()
            node = child
        node[leaf] = value
#-----------------------------------------------------------------------------
# DIRECTORY & FILE HELPERS
#-----------------------------------------------------------------------------
def dirEntries(dir_name, subdir, *args):
    """Return the paths of the files found inside ``dir_name``.

    dir_name -- directory to list.
    subdir   -- when true, descend into sub-directories recursively and
                print a progress line for each directory entered.
    *args    -- optional file extensions, given without the leading dot
                (e.g. 'txt', 'py'). When supplied, only files whose
                extension is one of them are returned. The comparison is
                case-sensitive.

    Each returned path is ``dir_name`` joined with the entry name. The
    order follows os.listdir and is therefore unspecified.

    Example usage:
        l = dirEntries(r'/path/to/folder', False, 'txt', 'py')
    """
    found = []
    for entry in os.listdir(dir_name):
        path = os.path.join(dir_name, entry)
        if os.path.isfile(path):
            # splitext gives '.ext'; drop the dot before comparing.
            extension = os.path.splitext(path)[1][1:]
            if not args or extension in args:
                found.append(path)
        elif subdir and os.path.isdir(path):
            # Single pre-formatted string so the output is identical under
            # both the print statement and the print function.
            print("Accessing directory: %s" % path)
            found.extend(dirEntries(path, subdir, *args))
    return found
#-----------------------------------------------------------------------------
# STRING HELPERS
#-----------------------------------------------------------------------------
def whitespaceBegone(s):
    """Return ``s`` with every run of whitespace removed.

    NOTE - whitespace between words is removed too, so the words end up
    glued together.
    """
    # split() with no argument discards all whitespace runs.
    chunks = s.split()
    return "".join(chunks)
def tsplit(string, delimiters):
    """Split ``string`` on every delimiter in ``delimiters``.

    Like str.split but supports multiple delimiters. ``delimiters`` may be
    any iterable of separator strings; a plain string is treated as a
    sequence of single-character separators. Empty pieces are kept, exactly
    as str.split(sep) keeps them. An empty separator raises ValueError
    (from str.split).

    Example:
        s = 'thing1,thing2/thing3-thing4'
        tsplit(s, (',', '/', '-'))
        ['thing1', 'thing2', 'thing3', 'thing4']
    """
    pieces = [string]
    for delimiter in delimiters:
        # Build a fresh list for each delimiter instead of popping from and
        # inserting into the list that is being iterated.
        pieces = [part
                  for piece in pieces
                  for part in piece.split(delimiter)]
    return pieces
def replace_txt(txt, word_dic):
    """Replace every occurrence of a key of ``word_dic`` with its value.

    Keys are matched literally (they are regex-escaped) and anywhere in the
    text, not only on word boundaries. All replacements happen in a single
    left-to-right pass, so replaced text is never scanned again.

    When several keys could match at the same position the longest one
    wins, e.g. with keys 'you' and 'your' the text 'your' is replaced via
    the 'your' entry. An empty ``word_dic`` returns ``txt`` unchanged.

    EXAMPLE USAGE:
        txt = "you are the bestest"
        d = {
            'you': 'I',
            'are the': 'am',
            'bestest': 'the naughtiest'
        }
        print(replace_txt(txt, d))
        # 'I am the naughtiest'
    """
    if not word_dic:
        # An empty alternation would compile to '' and match everywhere,
        # then fail looking up word_dic[''].
        return txt
    # Regex alternation takes the first alternative that matches, so put
    # longer keys first to stop a short key shadowing a longer one.
    keys = sorted(word_dic, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys))

    def translate(match):
        return word_dic[match.group(0)]

    return pattern.sub(translate, txt)
def GetTxtBetween(txt, a, b):
    """Return the text found between the delimiters ``a`` and ``b``.

    The result is the list produced by findall: one entry per
    non-overlapping, shortest-possible stretch of text that is preceded by
    ``a`` and followed by ``b``. '.' also matches newlines (re.S).

    NOTE - ``a`` and ``b`` are inserted into the pattern as-is, so they are
    interpreted as regular expressions; escape them with re.escape when
    they must be taken literally.
    """
    between = re.compile('{0}(.*?){1}'.format(a, b), re.S)
    return between.findall(txt)
def LevDistance(s, t):
    """Compute the Levenshtein distance between the sequences s and t.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s`` into ``t``.
    Transpositions are NOT a single operation here: swapping two adjacent
    characters costs 2 (e.g. LevDistance('ab', 'ba') == 2).
    """
    # Only the previous row of the edit-distance table is ever consulted,
    # so keep two rows instead of the whole table.
    # previous[j] = distance between the first i-1 elements of s and the
    # first j elements of t.
    previous = list(range(len(t) + 1))
    for i, s_item in enumerate(s, 1):
        current = [i]  # turning s[:i] into '' takes i deletions
        for j, t_item in enumerate(t, 1):
            if s_item == t_item:
                cost = previous[j - 1]
            else:
                cost = 1 + min(previous[j],      # deletion
                               current[j - 1],   # insertion
                               previous[j - 1])  # substitution
            current.append(cost)
        previous = current
    return previous[-1]
def LCSubstr(S, T):
    """Longest Common Substring.

    Return the length, in characters, of the longest contiguous run that
    appears in both S and T (0 when they share nothing).
    """
    width = len(T) + 1
    # table[r][c] = length of the common run ending at S[r-1] and T[c-1];
    # row 0 and column 0 stay at zero as the empty-prefix border.
    table = [[0] * width for _ in range(len(S) + 1)]
    longest = 0
    for row, s_char in enumerate(S, 1):
        for col, t_char in enumerate(T, 1):
            if s_char != t_char:
                continue
            run = table[row - 1][col - 1] + 1
            table[row][col] = run
            if run > longest:
                longest = run
    return longest
#-----------------------------------------------------------------------------
# LIST HELPERS
#-----------------------------------------------------------------------------
def remove_duplicates(lst):
    """Return the items of ``lst`` as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.

    http://stackoverflow.com/questions/6197409/ordered-sets-python-2-7-1
    """
    seen = set()
    unique = []
    for item in lst:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)
    return unique
def nrange(v, lmin, lmax, rmin=0, rmax=100):
    """Linearly interpolate ``v`` from one number range to another.

    ``v`` is located within [lmin, lmax] and the value at the same relative
    position within [rmin, rmax] is returned as a float. Values outside
    [lmin, lmax] are extrapolated, not clamped. ``lmin == lmax`` raises
    ZeroDivisionError.

    NOTE - a better option is to use numpy's interp
        from numpy import interp
        a = [18, 36]
        b = [0, 100]
        interp(18, a, b) # returns 0
    """
    source_span = float(lmax - lmin)
    fraction = float(v - lmin) / source_span
    return rmin + fraction * (rmax - rmin)
def ldistance(l1, l2):
    """Score how closely the ordering of ``l2`` matches that of ``l1``.

    Every element of ``l1`` contributes len(l1) minus the distance between
    its position in ``l1`` and its first position in ``l2``; the
    contributions are summed. Higher scores therefore mean more similar
    orderings, and identical lists score len(l1) ** 2.

    NOTE - every element of l1 must also occur in l2, otherwise
    list.index raises ValueError.

    EXAMPLE USE:
        l1 = [1,2,3,4,5,6]
        l2 = [2,3,4,5,6,1]
        l3 = [3,4,5,6,1,2]
        l4 = [4,5,6,1,2,3]
        l5 = [5,6,1,2,3,4]
        l6 = [6,1,2,3,4,5]
        ldistance(l1, l1) # 36
        ldistance(l1, l2) # 26
        ldistance(l1, l3) # 20
        ldistance(l1, l4) # 18
        ldistance(l1, l5) # 20
        ldistance(l1, l6) # 26
    """
    size = len(l1)
    score = 0
    for position, value in enumerate(l1):
        displacement = abs(position - l2.index(value))
        score += size - displacement
    return score
#-----------------------------------------------------------------------------
# NLP
#-----------------------------------------------------------------------------
def triples(txt):
    """Yield every run of three consecutive words in ``txt``.

    Words are the whitespace-separated tokens of the string. Nothing is
    yielded when the text holds fewer than three words.

    EXAMPLE:
        "What a lovely day" would yield
        (What, a, lovely), (a, lovely, day)
    """
    tokens = txt.split()
    # Zipping the list against itself shifted by one and by two produces
    # the sliding window; zip stops at the shortest slice.
    for window in zip(tokens, tokens[1:], tokens[2:]):
        yield window
#-----------------------------------------------------------------------------
# PREVIOUS AND NEXT VALUES INSIDE A LOOP
#-----------------------------------------------------------------------------
# http://stackoverflow.com/questions/1011938/python-previous-and-next-values-inside-a-loop
def prev_and_next(iterable_item):
    """Iterate over (previous, item, next) triples of an iterable.

    Works on any iterable (files, lists, etc.) of any size in a single
    pass, reading one element ahead and without building intermediate
    lists. ``previous`` is None for the first item and ``next`` is None
    for the last one.

    EXAMPLE USE:
        li = ['a', 'b', 'c', 'd', 'e']
        for prev, item, nxt in prev_and_next(li):
            print('{0}-{1}-{2}'.format(prev, item, nxt))
    OUTPUT:
        None-a-b
        a-b-c
        b-c-d
        c-d-e
        d-e-None
    """
    # Obtain the iterator right away so a non-iterable argument fails at
    # call time rather than on first use of the result.
    source = iter(iterable_item)

    def _windows():
        previous = None
        try:
            current = next(source)
        except StopIteration:
            return  # empty input: nothing to yield
        for following in source:
            yield previous, current, following
            previous, current = current, following
        yield previous, current, None

    return _windows()
#-----------------------------------------------------------------------------
# UNICODE TOOLS
#-----------------------------------------------------------------------------
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    """Read UTF-8 encoded CSV data, yielding each row as a list of text cells.

    utf8_data -- iterable of lines (e.g. an open file) handed to csv.reader.
    dialect   -- csv dialect, csv.excel by default.
    **kwargs  -- forwarded unchanged to csv.reader.

    Byte-string cells are decoded from UTF-8 (UnicodeDecodeError is raised
    for invalid data). Cells that are not byte strings are passed through
    untouched, e.g. the floats the reader produces for unquoted fields when
    quoting=csv.QUOTE_NONNUMERIC.

    taken from:
    http://stackoverflow.com/questions/904041/reading-a-utf8-csv-file-with-python

    EXAMPLE USE:
        with open('yourfile.csv', 'rb') as f:
            for field1, field2, field3 in unicode_csv_reader(f):
                print(field1, field2, field3)
    """
    for row in csv.reader(utf8_data, dialect=dialect, **kwargs):
        # Only byte strings need decoding; anything else is already in its
        # final form.
        yield [cell.decode('utf-8') if isinstance(cell, bytes) else cell
               for cell in row]
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
    """THIS IS TAKEN FROM: django.utils.encoding

    Returns a bytestring version of 's', encoded as specified in 'encoding'.

    s            -- object to convert.
    encoding     -- target encoding of the returned byte string.
    strings_only -- when True, None and int values are returned unchanged
                    instead of being converted.
    errors       -- error handling scheme passed to encode()/decode().

    Byte strings are assumed to be UTF-8 already: they are returned as-is
    when 'encoding' is utf-8 and transcoded from UTF-8 otherwise.

    USAGE:
        a = u"アニメ"
        print(smart_str(a))
    """
    # type(None) instead of types.NoneType: the ``types`` module is not
    # imported by this file, so the old spelling raised NameError as soon
    # as strings_only was true.
    if strings_only and isinstance(s, (type(None), int)):
        return s
    if not isinstance(s, basestring):
        try:
            return str(s)
        except UnicodeEncodeError:
            if isinstance(s, Exception):
                # An Exception subclass containing non-ASCII data that doesn't
                # know how to print itself properly. We shouldn't raise a
                # further exception.
                return ' '.join([smart_str(arg, encoding, strings_only,
                                           errors) for arg in s])
            return unicode(s).encode(encoding, errors)
    elif isinstance(s, unicode):
        return s.encode(encoding, errors)
    elif s and encoding != 'utf-8':
        # Non-empty byte string requested in another encoding: transcode.
        return s.decode('utf-8', errors).encode(encoding, errors)
    else:
        return s
Jump to Line
Something went wrong with that request. Please try again.