# Setup and get data

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import datasets
import math
# Load the iris data as scikit-learn's Bunch container; it gets its own name so that
# `iris` only ever refers to the DataFrame built from it.
iris_bunch = sklearn.datasets.load_iris()
# convert to pandas df: the feature columns plus the class label as a last `target` column
# (concatenating the labels with the float features stores them as floats, e.g. 0.0)
iris = pd.DataFrame(np.concatenate((iris_bunch.data, iris_bunch.target.reshape(-1, 1)), axis=1),
                    columns=list(iris_bunch.feature_names) + ['target'])
# clean col names: spaces -> underscores, then drop the '_(cm)' unit suffix
iris.columns = [c.replace(' ', '_') for c in iris.columns]
iris = iris.rename(columns={'sepal_length_(cm)': 'sepal_length',
                            'sepal_width_(cm)': 'sepal_width',
                            'petal_length_(cm)': 'petal_length',
                            'petal_width_(cm)': 'petal_width'})

# Run the local script pandas_startup.py in this notebook's own namespace (-i); it is
# expected to define pandas_startup(), which is called below. The script is not shown here.
%run -i pandas_startup.py

pandas_startup() #set pandas options
# NOTE(review): this asserts that the *value* of display.max_rows is truthy, so it would
# also fail if the option were set to None or 0 -- confirm that is the intended check.
assert(pd.get_option('display.max_rows')) #make sure options exist

In [9]:
# Preview the first 10 rows of the frame.
# NOTE(review): the saved output already shows a `test_condition` column, which is only
# assigned in cells further down -- this output comes from out-of-order execution and
# should be refreshed with "Restart & Run All".
iris.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,test_condition
0,5.1,3.5,1.4,0.2,0.0,0
1,4.9,3.0,1.4,0.2,0.0,0
2,4.7,3.2,1.3,0.2,0.0,0
3,4.6,3.1,1.5,0.2,0.0,0
4,5.0,3.6,1.4,0.2,0.0,0
5,5.4,3.9,1.7,0.4,0.0,0
6,4.6,3.4,1.4,0.3,0.0,0
7,5.0,3.4,1.5,0.2,0.0,0
8,4.4,2.9,1.4,0.2,0.0,0
9,4.9,3.1,1.5,0.1,0.0,0


### Iterate outside the function with apply() and lambda

Assertions and summary printing have to go outside the function, because the function only ever sees the values of one row at a time.

In [23]:
def test_condition(petal_width, petal_length):
    if petal_width < .5*petal_length and petal_width > .2:
        return 1
    else:
        return 0

# Row-by-row evaluation: apply(axis=1) hands each row to the scalar function.
iris['test_condition'] = iris.apply(
    lambda row: test_condition(row['petal_width'], row['petal_length']),
    axis=1,
)

## assertions and printing info ##
flags = iris['test_condition']
assert ((flags == 0) | (flags == 1)).all()  # every flag is 0 or 1
n_met = len(flags[flags == 1])
n_not_met = len(flags[flags == 0])
print(len(iris),
      "cases processed.",
      n_met,
      "meet the condition and",
      n_not_met,
      "do not meet the condition")
    

150 cases processed. 116 meet the condition and 34 do not meet the condition


### Without explicit iteration: fails with `truth value of a Series is ambiguous`

In [25]:
def test_condition(data):
    if data['petal_width'] < .5*data['petal_length'] and data['petal_width'] > .2:
        return 1
    else:
        return 0

# The failure is the point of this cell: catch the expected ValueError and show its
# message, so that "Restart & Run All" can continue past the demonstration.
try:
    test_condition(iris)
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

### Iterate *inside the function*
Looping inside the function means it sees the whole dataset, so it can also run assertions and print information about its input and output.

In [18]:
def test_condition(data):
    test_condition = [None] * len(data) # initialize empty
    for i in data.index:
    
        if (data['petal_width'][i] < .5*data['petal_length'][i]) and (data['petal_width'][i] > .2):
            test_condition[i] = 1
        else:
            test_condition[i] = 0
    test_condition = pd.Series(test_condition)
    assert((test_condition == 0) | (test_condition == 1)).all() # all values are 0 or 1
    print(len(test_condition),
          "cases processed.",
          len(test_condition[test_condition == 1]), 
          "are previously paid and",
          len(test_condition[test_condition == 0]),
          "are not previously paid.")
    return test_condition

# Store the flags returned for the whole frame as the `test_condition` column;
# the call itself also prints the summary line.
iris['test_condition'] = test_condition(iris)


150 cases processed. 116 are previously paid and 34 are not previously paid.
