In [69]:
import os
import numpy as np
import pandas as pd
from nose.tools import ok_, assert_equal

import warnings
# Blanket filter: with no category given, every warning raised after this
# point is suppressed (deprecation notices included).
warnings.filterwarnings("ignore")

In [74]:
# Load just the 'Year' column of the 2001 data file, decoding it as latin-1.
# NOTE(review): fixed_df is not referenced by any of the cells visible here.
fixed_df = pd.read_csv(
    '~/w7p3/2001.csv',
    encoding='latin-1',
    usecols=['Year'],
)

## Problem 1

### Function: get_column()

In [75]:
def get_column(filename, column, encoding='latin-1'):
    '''
    Reads a single column from an airline on-time performance CSV file
    and returns it as a one-column Pandas DataFrame.
    
    Only the requested column is parsed (via ``usecols``), so the rest of
    the file is never materialized in memory.
    
    Parameters
    ----------
    filename(str): The file name.
    column(str): The column header.
    encoding(str): Text encoding of the file. Defaults to 'latin-1',
        the encoding of the airline data files.
    
    Returns
    -------
    A pandas.DataFrame object that has only one column.
    
    Examples
    --------
    arr_delay = get_column('/home/data_scientist/data/2001.csv', 'ArrDelay')
    '''
    
    return pd.read_csv(filename, encoding=encoding, usecols=[column])

In [77]:
# header cases: each column read back from the test file must equal a
# hand-built single-column frame with the same header.
header_file = '~/w7p3/test.header.csv'
expected_columns = [
    ('Year', [2001] * 4),
    ('DayofMonth', list(range(17, 21))),
    ('DepTime', [1806, 1805, 1821, 1807]),
    ('SecurityDelay', [np.nan] * 4),
    ('LateAircraftDelay', [1, 1, np.nan, np.nan]),
]
for header, values in expected_columns:
    expected = pd.DataFrame(data=values, columns=[header])
    ok_(get_column(header_file, header).equals(expected))


### Function: get_stats()

In [94]:
def get_stats(df, column):
    '''
    Summarizes one column of a Pandas DataFrame with four statistics:
    minimum, maximum, mean, and median.
    
    Each statistic comes from the corresponding pandas Series method
    called with its default arguments.
    
    Parameters
    ----------
    df(pandas.DataFrame): A Pandas DataFrame.
    column(str): The column header.
    
    Returns
    -------
    A 4-tuple, in this order:
    minimum(float)
    maximum(float)
    mean(float)
    median(float)
    '''
    
    values = df[column]
    return (values.min(), values.max(), values.mean(), values.median())

In [91]:
# Small fixture frame for exercising get_stats().
# Column B ends in NaN so that all four columns have the same length.
data1 = dict(
    A=[0, 1, 2, 3, 4],
    B=[1, 2, 3, 4, np.nan],
    C=[4, 3, 2, 1, 0],
    D=[4, 1, 0, 2, 3],
)
df1 = pd.DataFrame(data1)

In [96]:
import warnings
warnings.filterwarnings("ignore")

# Fixture: four short columns; B ends in NaN so all columns have five entries.
data1 = {
    'A': [0, 1, 2, 3, 4],
    'B': [1, 2, 3, 4, np.nan], # append NaN since we need same number of elements
    'C': [4, 3, 2, 1, 0],
    'D': [4, 1, 0, 2, 3]
    }
df1= pd.DataFrame(data1)

# Expected tuples are (minimum, maximum, mean, median).
assert_equal(get_stats(df1, 'A'), (0, 4, 2, 2))
assert_equal(get_stats(df1, 'B'), (1, 4, 2.5, 2.5))
assert_equal(get_stats(df1, 'C'), (0, 4, 2, 2))
assert_equal(get_stats(df1, 'D'), (0, 4, 2, 2))

# Larger fixture: E is 0..50 followed by NaN, F is 0..51.
data2 = {
    'E': np.append(np.arange(51), np.nan), # append NaN since we need same number of elements
    'F': np.arange(52)
}
df2 = pd.DataFrame(data2)

assert_equal(get_stats(df2, 'E'), (0, 50, 25.0, 25.0))
assert_equal(get_stats(df2, 'F'), (0, 51, 25.5, 25.5))

# shuffle rows in df2; the statistics must not depend on row order,
# so the checks run against the shuffled frame df3 (not df2 again)
df3 = df2.reindex(np.random.permutation(df2.index))
assert_equal(get_stats(df3, 'E'), (0, 50, 25.0, 25.0))
assert_equal(get_stats(df3, 'F'), (0, 51, 25.5, 25.5))