## Problem 1

In [1]:
from nose.tools import assert_equal
import numpy as np
import pandas as pd
from numpy.testing import assert_almost_equal, assert_array_equal

In [2]:
# Ten-row sample frame used as the shared fixture by the cells below.
df = pd.DataFrame(
    {
        "Origin": [
            "ORD", "DFW", "ATL", "LAX", "PHX",
            "STL", "DTW", "MSP", "LAS", "BOS",
        ],
        "Distance": [1050, 370, 228, 678, 873, 472, 370, 1182, 745, 448],
        "ArrDelay": [-3, 4, 23, 10, 20, -3, -10, -12, -9, -1],
        "DepDelay": [-4, -5, 11, -3, 0, -3, -8, -6, 2, 2],
        "Cancelled": [0, 0, 0, 0, 1, 0, 1, 0, 1, 1],
    }
)

print(df)

   ArrDelay  Cancelled  DepDelay  Distance Origin
0        -3          0        -4      1050    ORD
1         4          0        -5       370    DFW
2        23          0        11       228    ATL
3        10          0        -3       678    LAX
4        20          1         0       873    PHX
5        -3          0        -3       472    STL
6       -10          1        -8       370    DTW
7       -12          0        -6      1182    MSP
8        -9          1         2       745    LAS
9        -1          1         2       448    BOS


In [4]:
def get_log10(df, column):
    '''
    Compute the base-10 logarithm of a single column of a data frame.

    Parameters
    ----------
    df: A pandas data frame.
    column: A string naming the column to transform.

    Returns
    -------
    A Pandas Series with np.log10 applied to df[column].
    '''

    selected = df[column]
    log_values = selected.apply(np.log10)
    return log_values

In [5]:
# Check get_log10 on the "Distance" column against precomputed log10 values.
log_10_distance = get_log10(df, "Distance")
answer1 = np.array([
    3.0211893, 2.56820172, 2.35793485, 2.83122969,  2.94101424,
    2.673942, 2.56820172, 3.07261748, 2.87215627, 2.65127801
])

# assert_almost_equal compares to 7 decimal places by default, which is why
# the expected values only need to be written to that precision.
assert_almost_equal(log_10_distance.values, answer1)

In [10]:
def get_sum(df, columns):
    '''
    Takes a Pandas DataFrame as its first argument and a list of columns as its second argument.
    Returns a Pandas Series with the row-wise sum of the columns listed in "columns".

    Parameters
    ----------
    df: A pandas DataFrame.
    columns: A list of strings (column names present in df).

    Returns
    -------
    A pandas Series, indexed like df, holding the sum across the selected
    columns for each row. A missing value (NaN) in any selected column makes
    that row's sum NaN, exactly as plain addition would.
    '''

    # Vectorized reduction instead of calling Python's builtin sum once per
    # row. skipna=False keeps NaN propagation identical to plain addition
    # (the pandas default would silently treat NaN as 0).
    return df[columns].sum(axis=1, skipna=False)

In [11]:
# Summing two columns must give the row-wise total of ArrDelay and DepDelay.
total_delay = get_sum(df, ["ArrDelay", "DepDelay"])
answer2 = np.array([ -7,  -1,  34,   7,  20,  -6, -18, -18,  -7,   1])
assert_almost_equal(answer2, total_delay.values)

# Summing a one-column list must reproduce that column's values unchanged.
arr_delay = get_sum(df, ["ArrDelay"])
answer3 = np.array(df.ArrDelay)
assert_almost_equal(answer3, arr_delay.values)

In [32]:
def f(x):
    '''Return x['Origin'] when x['Cancelled'] equals 1, otherwise the string 'NA'.'''
    if x['Cancelled'] == 1:
        return x['Origin']
    return 'NA'

In [27]:
def get_cancelled_origin(s):
    '''
    Return the origin of a record only when it is flagged as cancelled.

    Parameters
    ----------
    s: A mapping-like record (e.g. a pandas Series) with 'Cancelled' and 'Origin' entries.

    Returns
    -------
    s['Origin'] when s['Cancelled'] equals 1; the string 'NA' for any other value.
    '''
    return s['Origin'] if s['Cancelled'] == 1 else 'NA'

In [29]:
# (Cancelled flag, expected result): only a flag of exactly 1 yields the
# origin; 0 and any other value (here -1) must fall back to "NA".
for cancelled_flag, expected in [(1, "A"), (0, "NA"), (-1, "NA")]:
    s = pd.Series({
        'foo': 'bar',
        "Origin": 'A',
        'Cancelled': cancelled_flag,
    })
    assert_equal(expected, get_cancelled_origin(s))

In [36]:
# Run the rule over every row: axis=1 hands each row to the function as a Series.
df.apply(get_cancelled_origin,axis=1)

0     NA
1     NA
2     NA
3     NA
4    PHX
5     NA
6    DTW
7     NA
8    LAS
9    BOS
dtype: object

In [46]:
def apply_userdefined_func(df, f, axis=1):
    '''
    Thin wrapper that forwards a callable to DataFrame.apply along one axis.

    Parameters
    ----------
    df: A pandas DataFrame.
    f: The callable handed to df.apply.
    axis: Forwarded as the axis argument of df.apply (defaults to 1).

    Returns
    -------
    Whatever df.apply returns for f; a pandas Series in this notebook's examples.
    '''

    return df.apply(f, axis=axis)

In [47]:
# With the default axis=1 the wrapper must reproduce the per-row
# cancelled-origin labels.
ans4 = pd.Series(
    ["NA", "NA", "NA", "NA", "PHX", "NA", "DTW", "NA", "LAS", "BOS"]
    )
assert_array_equal(ans4, apply_userdefined_func(df, get_cancelled_origin).values)


# Drop the string column so that a numeric function can be applied per row.
df2 = df.drop('Origin', axis=1)
# NOTE: this rebinds the notebook-level name `f`.
f = lambda x: (np.mean(x) + 1 )
# Expected values: mean of the four remaining columns of each row, plus 1.
ans5 = pd.Series(
    [261.75, 93.25, 66.50, 172.25, 224.50,
    117.5, 89.25, 292.0, 185.75, 113.5]
)
assert_almost_equal(ans5, apply_userdefined_func(df2, f, axis=1).values)

In [48]:
# Copy of df without the string column "Origin"; the remaining columns are numeric.
df2 = df.drop('Origin', axis=1)
# Per-column interval (mean - 2*std, mean + 2*std), returned as a tuple.
# NOTE: this rebinds the notebook-level name `f`.
f = lambda x: (np.mean(x) - 2 * np.std(x), np.mean(x) + 2 * np.std(x))
# axis=0 applies f to each column, giving one tuple per column.
apply_userdefined_func(df2, f, axis=0)

ArrDelay      (-21.362845913602232, 25.16284591360223)
Cancelled    (-0.5797958971132712, 1.3797958971132713)
DepDelay      (-11.761467077590895, 8.961467077590894)
Distance      (38.531711992746295, 1244.6682880072538)
dtype: object