In [1]:
def read_unsampled_file(filename):
    """
    Reads the unsampled export file, and produces a list of assemblages, classes, and a numpy array
    of the class counts for further sampling.

    The file is read as tab-delimited text.  The first row is a header whose first item is
    ignored and whose remaining items are the class names.  In every following row the first
    item is the assemblage name and the remaining items are that assemblage's counts.
    Completely blank lines are skipped.

    :param filename: path of the tab-delimited export file to read
    :return: tuple with a list of assemblage names, class_names, and a Numpy array of trait counts
             (dtype float32, one row per assemblage)
    """
    with open(filename, 'r') as incsv:
        csvread = csv.reader(incsv, delimiter="\t")

        # The next() builtin works on both Python 2.6+ and Python 3; the reader's
        # .next() method exists only on Python 2.
        header_row = next(csvread)
        class_names = header_row[1:]  # everything except the first item

        row_list = []
        assemblage_list = []
        for row in csvread:
            # csv.reader yields an empty list for a blank line (e.g. a trailing
            # newline at the end of the file); indexing row[0] on it would fail.
            if not row:
                continue
            assemblage_list.append(row[0])
            row_list.append(row[1:])

        # The count cells arrive as strings; numpy converts them to float32 here.
        count_arr = np.array(row_list, dtype=np.float32)

    return (assemblage_list, class_names, count_arr)

In [13]:
import numpy as np
import csv

In [8]:
import os.path
import sys
sys.path.append(os.path.abspath( ".."))
import tatome.dip as dip

In [19]:
def filter_cols_for_unimodality(count_arr, threshold):
    """
    Given an array of counts, iterates over the array and tests each column for unimodality
    using the Hartigans' dip test, as implemented by https://github.com/tatome/dip_test.  Dip
    values strictly greater than threshold are taken as evidence for multimodality.  Returns
    the indices of the columns which should be filtered out (as multimodal) and of the columns
    which are retained (as unimodal).

    Each column is also printed to stdout as it is tested.

    :param count_arr: 2-D array of counts, indexed as [row, column]
    :param threshold: dip value above which a column is rejected as multimodal
    :return: tuple of lists of column indices:  rejected_columns, retained_columns
    """
    rejected_columns = []
    retained_columns = []
    for i in range(count_arr.shape[1]):
        col = count_arr[:, i]
        # Diagnostic echo of the column under test.  The call form prints the same
        # text on Python 2 and, unlike the print statement, also parses on Python 3.
        print(col)
        # NOTE(review): element zero of the tuple returned by dip.dip is what the
        # dip_test docs call the "dip test value" (the dip statistic).  It is compared
        # directly against threshold with larger meaning multimodal, so it should not
        # be read as a p-value -- confirm against the upstream implementation.
        dip_value = dip.dip(idxs=col)[0]
        if dip_value > threshold:
            rejected_columns.append(i)
        else:
            retained_columns.append(i)

    return (rejected_columns, retained_columns)

In [14]:
# Load the tab-delimited test export: assemblage names, class names, and the count array.
assem, classes, counts = read_unsampled_file("../testdata/testdata-10.txt")

In [15]:
# Show the loaded count array.  The call form prints the same text on Python 2
# and, unlike the print statement, is also valid on Python 3.
print(counts)

[[  0.  40.  10.  50.]
 [ 20.   0.  30.  50.]
 [  0.  53.   2.  45.]
 [  0.  48.   4.  48.]
 [  0.  45.   5.  50.]
 [  0.  30.  20.  50.]
 [  0.   5.  25.  70.]
 [ 10.   0.  30.  60.]
 [ 30.   0.  40.  30.]
 [  5.   0.  28.  67.]]


In [20]:
# 0.10 is the dip threshold: columns whose dip value exceeds it are rejected as multimodal.
rej, ret = filter_cols_for_unimodality(counts, 0.10)

[  0.  20.   0.   0.   0.   0.   0.  10.  30.   5.]
[ 40.   0.  53.  48.  45.  30.   5.   0.   0.   0.]
[ 10.  30.   2.   4.   5.  20.  25.  30.  40.  28.]
[ 50.  50.  45.  48.  50.  50.  70.  60.  30.  67.]


In [17]:
# Indices of the columns rejected as multimodal.  The call form prints the same
# text on Python 2 and, unlike the print statement, is also valid on Python 3.
print(rej)

[0, 1, 2, 3]


In [18]:
# Indices of the columns retained as unimodal.  The call form prints the same
# text on Python 2 and, unlike the print statement, is also valid on Python 3.
print(ret)

[]
