# Pandas

In [45]:
from nose.tools import assert_true, assert_false, \
    assert_almost_equal, assert_equal, assert_raises

**Problem 1 (25 points).** Use Pandas to read in the file pruning_machine_learning_10292014.txt. Create a pandas DataFrame that has all the columns in the text file EXCEPT for "edge_label" and "minmaxdfe." Also all missing (NaN) values should be replaced with the mean value for the relevant column. **Hint**: read up on [``fillna``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html) and [``drop``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html)

In [46]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, OrderedDict
import datetime
import numpy as np
from dateutil import parser

In [47]:
# Load the tab-delimited file, treating the literal string 'None' as a
# missing value, and drop the two columns excluded by the problem statement.
data1 = pd.read_table("data.txt", na_values=['None']).drop(
    ['edge_label', 'minmaxdfe'], axis=1)
# Replace missing values with the mean of the relevant column.
# numeric_only=True keeps the string column (basename) out of the mean
# computation; without it, pandas >= 2.0 raises TypeError here.
data1 = data1.fillna(data1.mean(numeric_only=True))
data1.head(10)


Unnamed: 0,basename,prune_label,pathlength,exterior2surface,surface2volume,mindfe,maxdfe,length2width,depth,angle
0,671_PV_skeleton_graphs.pckle,0,23,0.775,0.101429,4.0,7.0,4.181818,50.074449,0.024576
1,671_PV_skeleton_graphs.pckle,0,45,0.910923,0.075812,2.0,7.0,10.0,50.074449,0.021655
2,671_PV_skeleton_graphs.pckle,0,137,0.922793,0.063619,2.0,9.0,24.909091,14.517342,0.03048
3,202_PV_skeleton_graphs.pckle,0,3,0.713208,0.345953,5.0,5.0,0.6,60.433135,0.019737
4,202_PV_skeleton_graphs.pckle,0,6,0.732123,0.377787,4.0,5.0,1.333333,60.433135,0.016531
5,202_PV_skeleton_graphs.pckle,0,113,0.922076,0.200871,3.0,9.0,18.833333,10.903907,0.021772
6,849_PV_skeleton_graphs.pckle,0,21,0.658359,0.03006,4.0,9.0,3.230769,85.675047,0.016949
7,849_PV_skeleton_graphs.pckle,0,17,0.742336,0.033536,4.0,7.0,3.090909,95.765084,0.019737
8,849_PV_skeleton_graphs.pckle,0,22,0.847027,0.024682,5.0,7.0,3.666667,95.765084,0.019737
9,849_PV_skeleton_graphs.pckle,0,18,0.62244,0.036421,3.0,7.0,3.6,95.765084,0.015989


In [48]:
assert_equal(type(data1), pd.core.frame.DataFrame)

In [49]:
assert_equal(data1.shape, (753, 10))

In [50]:
assert_false(pd.isnull(data1["exterior2surface"][70]))

In [51]:
assert_almost_equal(data1["mindfe"][732], 4.1495063469675602)

In [52]:
assert_true("maxdfe" in data1.columns)

In [53]:
assert_false("edge_label" in data1.columns)

**Problem 2 (10 points):** Write a function `get_date` that takes a single positional argument and uses the parser defined in the [dateutil](http://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse) package to convert the argument to a [`datetime`](https://docs.python.org/3/library/datetime.html) [`date`](https://docs.python.org/3/library/datetime.html#datetime.date) object. If the argument cannot be converted to a `date` object, the function should return a [Pandas `NaT`](https://pandas.pydata.org/pandas-docs/stable/missing_data.html#datetimes) (not a time) object.


In [54]:
# YOUR CODE HERE
def get_date(date):
    """
    Convert a date string to a datetime date object using the dateutil parser.

    Arguments:
        date: the date, in string format, to be converted.
    Returns:
        The parsed datetime date object, or pd.NaT if the argument cannot
        be converted to a date.
    """
    try:
        return parser.parse(date, dayfirst=False, yearfirst=False).date()
    except Exception:
        # Any conversion failure maps to NaT. Catching Exception instead of
        # using a bare except lets KeyboardInterrupt and SystemExit propagate
        # rather than being silently turned into NaT.
        return pd.NaT
    

In [55]:
assert_equal(get_date("5/17/2007"), datetime.date(2007, 5, 17))

In [56]:
assert_true(pd.isnull(get_date("Hello")))

**Problem 3 (5 points):** Write a function `get_time` that takes a single positional argument and uses the parser defined in the dateutil package to convert the argument to a datetime time object. If the argument cannot be converted to a time object, the function should return a Pandas NaT (not a time) object.

In [57]:
# YOUR CODE HERE
def get_time(time):
    """
    Convert a time string to a datetime time object using the dateutil parser.

    Arguments:
        time: the time, in string format, to be converted.
    Returns:
        The parsed datetime time object, or pd.NaT if the argument cannot
        be converted to a time.
    """
    try:
        return parser.parse(time, dayfirst=False, yearfirst=False).time()
    except Exception:
        # Any conversion failure maps to NaT. Catching Exception instead of
        # using a bare except lets KeyboardInterrupt and SystemExit propagate
        # rather than being silently turned into NaT.
        return pd.NaT

In [58]:
assert_true(pd.isnull(get_time("Hello")))

**Problem 4 (15 points):** Write a function `get_range` that takes a positional argument containing a string from which to extract a range and a keyword argument named `delimiter` that has a string indicating the delimiter between the low and high values of the range. The function should **try** to return a tuple of two floating point values (the low range value and the high range value). 

The function should return a two-tuple of `np.nan` values (`(np.nan, np.nan)`) under either of the following conditions:

1. The string does not contain exactly one instance of `delimiter`.
1. The attempt to extract two floating point values fails.   

If either the positional or the keyword argument is not a string, raise a `TypeError`.

In [285]:
# YOUR CODE HERE
def get_range(range_in, delimiter="-", delimeter=None):
    """
    Extract a (low, high) numeric range from a string.

    Arguments:
        range_in: string from which to extract a range, e.g. "0.5-1.4".
        delimiter: string separating the low and high values of the range.
        delimeter: legacy misspelled alias of `delimiter`, still accepted so
            existing keyword callers keep working. When given, it takes
            precedence over `delimiter`.
    Returns:
        A tuple of two floats (low, high). (np.nan, np.nan) is returned when
        range_in does not contain exactly one instance of the delimiter, or
        when either side cannot be converted to a float.
    Raises:
        TypeError: if range_in or the delimiter is not a string.
    """
    if delimeter is not None:
        delimiter = delimeter
    if not isinstance(range_in, str) or not isinstance(delimiter, str):
        raise TypeError("range_in and delimiter must both be strings")

    no_range = (np.nan, np.nan)
    # Exactly one delimiter is required, so "12-24-1972" is not a range.
    if range_in.count(delimiter) != 1:
        return no_range
    try:
        low, high = (float(part) for part in range_in.split(delimiter))
    except ValueError:
        # Non-numeric side (e.g. "a-b"), or an empty delimiter passed to split.
        return no_range
    return (low, high)

In [234]:
assert_raises(TypeError, get_range, 5.4)

In [235]:
assert_raises(TypeError, get_range, 5.4, delimiter=(5,4,3))

In [236]:
assert_true(np.isnan(get_range("12-24-1972")).all())

In [237]:
assert_equal(get_range("15-20"), (15,20))

**Problem 5 (10 points):** Write a function `is_number` that takes a single positional argument and returns `True` if that argument can be converted to a `float` and `False` if it cannot.

In [64]:
# YOUR CODE HERE
def is_number(number):
    """
    Report whether the argument can be converted to a float.

    Argument:
        number: the value to test.
    Returns:
        True if float(number) succeeds, False if it cannot be converted.
    """
    try:
        float(number)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None or a list).
        # ValueError: unparseable string (e.g. ">5000").
        # OverflowError: an integer too large to fit in a float.
        return False
    return True


In [65]:
assert_true(is_number("5.3e4"))

In [66]:
assert_false(is_number(">5000"))

**Problem 6 (5 points):** Write a function `get_value` that takes a single positional argument (e.g. `x`) and returns `x` converted to a float. If the conversion fails, return `np.nan`.

In [67]:
# YOUR CODE HERE
def get_value(x):
    """
    Convert the argument to a float.

    Argument:
        x: the value to convert.
    Returns:
        float(x), or np.nan if the conversion fails.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None or a list).
        # ValueError: unparseable string (e.g. "[5.4]").
        # OverflowError: an integer too large to fit in a float.
        return np.nan

In [68]:
assert_true(np.isnan(get_value("[5.4]")))

In [69]:
assert_equal(get_value("5.7"), 5.7)

### For the following problems we will use the `sampleLabs.txt` file
#### In this directory is a file (`sampleLabs.txt`) with a set of lab values. The file includes lab values obtained on different individuals during their in-patient hospitalizations. 

**Problem 7 (15 points)**: Use `is_number()`, list comprehension, and the Pandas Series `unique` method to create a list of all non-numeric values in the `result` Series of `lab_data`. By non-numeric, I mean any value that cannot be converted to a `float`.

In [238]:
# Read the lab results, then keep every distinct `result` value that
# is_number() rejects, i.e. that cannot be converted to a float.
lab_data = pd.read_table("samplelabs.txt")
uniques = lab_data['result'].unique()
non_number_results = [value for value in uniques if not is_number(value)]

print(non_number_results)
    

['MISL', 'ID', '>5000', '<2', 'RQ', 'CNTM', 'A', 'Mislabeled', 'RQER', 'LERR', 'NCAL', 'NDA', 'QNS']


In [239]:
assert_true(type(non_number_results)==list)

**Problem 8. (40 Points):**  Use Pandas to write a function named `read_labs` that does the following:

* Takes the name of the file to read as a positional argument
* Takes as a keyword argument named `converters` a dictionary with keys equal to the test component names (columns in DataFrame) and as values, functions that convert the expected column value to the appropriate type. You should use the ordered dictionary (`converters`) I define below.
* Reads the contents of the input file into a **default dictionary** keyed by **visitid**. The value for each **visitid** should be a **list** of tests. For each row, use `converters` to convert the row value from a string to the appropriate data type (e.g., float, datetime.date). Each test result should be stored as a tuple of 2-tuples with the first element being the explanatory metadata (i.e., the column name) and the second element being the converted value.

#### Hints

1. Use the DataFrame `iterrows` method to iterate over each row in the DataFrame.
1. Use the function `get_test_values` to convert the data types of each row.
1. Use slicing on the row to grab the test values (`visitid` is going to be the key for the dictionary and should not be part of the test result tuple).
1. A row is a Pandas Series and we can access the elements of the row using either an index or the column name:

```Python
print(row)
print()
print(row[2])
print(row["ctime"])

visitid    OMHioJh8XEeq7152
cdate             6/13/2007
ctime                 06:30
pqno       1181750718122403
test                  CREAT
result                  1.0
unit                  mg/dL
range               0.5-1.4
Name: 5, dtype: object

06:30
06:30
```

Your resulting output should look something like this:

```Python
defaultdict(list,
            {'+yhZLyY5Uqra5115': [[('cdate', datetime.date(2005, 4, 29)),
               ('ctime', datetime.time(6, 30)),
               ('pqno', 1114780124780442),
               ('test', 'CREAT'),
               ('result', 3.4),
               ('unit', 'mg/dL'),
               ('range', (0.5, 1.4))],
              [('cdate', datetime.date(2005, 4, 28)),
               ('ctime', datetime.time(6, 45)),
               ('pqno', 1114692308737109),
               ('test', 'CREAT'),
               ('result', 8.6),
               ('unit', 'mg/dL'),
               ('range', (0.5, 1.4))],
              [('cdate', datetime.date(2005, 5, 2)),
               ('ctime', datetime.time(4, 0)),
               ('pqno', 1115041801503288),
               ('test', 'CREAT'),
               ('result', 1.6),
               ('unit', 'mg/dL'),
               ('range', (0.5, 1.4))],
              [('cdate', datetime.date(2005, 4, 30)),
               ('ctime', datetime.time(4, 0)),
               ('pqno', 1114875733484671),
               ('test', 'CREAT'),
               ('result', 1.9),
               ('unit', 'mg/dL'),
               ('range', (0.5, 1.4))],
              [('cdate', datetime.date(2005, 4, 26)),
               ('ctime', datetime.time(16, 5)),
               ('pqno', 1114551787814371),
               ('test', 'CREAT'),
               ('result', 16.0),
               ('unit', 'mg/dL'),
               ('range', (0.5, 1.4))],
              [('cdate', datetime.date(2005, 5, 1)),
               ('ctime', datetime.time(4, 0)),
               ('pqno', 1114957388877371),
               ('test', 'CREAT'),
               ('result', 1.7),
               ('unit', 'mg/dL'),
               ('range', (0.5, 1.4))]],}
```

In [286]:
from collections import OrderedDict
# Ordered mapping from lab-file column name to the function that converts
# that column's raw value; columns without a dedicated converter are passed
# through unchanged.
converters = OrderedDict([
    ('cdate', get_date),
    ('ctime', get_time),
    ('pqno', lambda value: value),
    ('test', lambda value: value),
    ('result', get_value),
    ('unit', lambda value: value),
    ('range', get_range),
])

In [287]:
import datetime
import os
import pandas as pd
from collections import defaultdict

        
def get_test_values(row, converters):
    """
    Convert the values of one row using the per-column converters.

    Arguments:
        row: A row from a Pandas DataFrame
        converters: A Dictionary with keys the DataFrame column names and 
            values functions to convert each column value
    Returns:
        A tuple of two-tuples. Each two-tuple consists of the column name
            and the converted column value, in the iteration order of
            converters
    """
    # Build a tuple (not a list) so the result matches the documented
    # contract of "a tuple of two-tuples".
    return tuple(
        (column, convert(row[column])) for column, convert in converters.items()
    )
    
    
def read_labs(fname, converters):
    """
    Read a tab-delimited lab file and group the converted test results by
    the value in its first column.

    Arguments:
        fname: Path to a file containing lab values
        converters: A Dictionary with keys the DataFrame column names and 
            values functions to convert each column value
    Returns:
        A default dictionary with keys being the visitid and values
            being a list of test results, one per row, each produced by
            get_test_values
    """
    labs_by_visit = defaultdict(list)
    lab_frame = pd.read_table(fname)
    for _, row in lab_frame.iterrows():
        test_values = get_test_values(row, converters)
        # The grouping key is the first column of the row. .iloc makes the
        # positional access explicit; row[0] on a string-labelled Series
        # relies on a deprecated integer-key fallback.
        labs_by_visit[row.iloc[0]].append(test_values)
    return labs_by_visit

In [288]:
sorted_labs = read_labs("samplelabs.txt", converters)

In [289]:
assert_equal(sorted_labs['X4HZ5/mbdrls7055'][8][1][1], datetime.time(4, 4))

In [290]:
assert_almost_equal(sorted_labs["3WobBYVBHdIa6106"][5][6][1],(0.5,1.4))

In [291]:
assert_true('QLE6UIMptiKf5228' in sorted_labs)

In [292]:
assert_equal(len(sorted_labs['n9b8R+Aynzy+5257']),  31)

In [293]:
assert_equal(sorted_labs['yyutduwkJpYJ7131'][1][1], 
             ('ctime', datetime.time(6, 15)))

In [294]:
# NOTE(review): assert_true's second argument is the failure message, so this
# only checks that the type object is truthy and always passes; it does not
# compare the type against tuple. An assert_equal/isinstance check is
# presumably what was intended -- confirm before relying on it.
assert_true(type(sorted_labs['yyutduwkJpYJ7131'][1]),tuple)

In [295]:
assert_true(np.isnan(sorted_labs['deaOcpSo0N617039'][14][4][1]))

In [296]:
assert_true(np.isnan(sorted_labs['QLE6UIMptiKf5228'][17][6][1]).all())

**Problem 9 (20 points)**: Use Pandas to read in the file ~/Data/Numerics/mimic2/bp/subjects/19468.txt. Rename the columns to be "systolic" and "diastolic". Create a new column named "difference" with values equal to the difference between the "systolic" and "diastolic" values.

In [44]:
# COULDN'T FIND THE FILE, CREATED AND USED BP.TXT INSTEAD
def difference(row):
    """
    Return the 'systolic' value minus the 'diastolic' value.

    Arguments:
        row: any object indexable by the 'systolic' and 'diastolic' labels
            whose values support subtraction, e.g. a single DataFrame row
            or a whole DataFrame (giving an element-wise column difference).
    Returns:
        row['systolic'] - row['diastolic']
    """
    return row['systolic'] - row['diastolic']


# The file named in the problem statement
# (~/Data/Numerics/mimic2/bp/subjects/19468.txt) could not be found, so the
# locally created bp.txt is read instead. The file has no header row.
bpdata = pd.read_csv("bp.txt", header=None)
bpdata.columns = ['systolic', 'diastolic']
# Subtract the two columns in one vectorized operation rather than calling
# difference() once per row through DataFrame.apply.
bpdata['difference'] = difference(bpdata)
bpdata.head()

Unnamed: 0,systolic,diastolic,difference
0,121,24,97
1,131,35,96
2,123,42,81


In [38]:
assert_equal(bpdata.shape, (35,3))

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
assert_equal(bpdata.columns.tolist(), ['systolic', 'diastolic', 'difference'])

In [None]:
assert_almost_equal(bpdata["systolic"].mean(), 142.71428571428572)

In [None]:
assert_equal(bpdata["systolic"].dtype, np.int64)