## Decision Trees

In [2]:
import numpy as np
import pandas as pd

import os
import time
import requests
import json
from pprint import pprint

from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.utils import check_random_state
from sklearn.metrics import accuracy_score

from nose.tools import assert_equal, assert_is_not, assert_is_instance
from numpy.testing import assert_array_equal, assert_array_almost_equal, assert_almost_equal
from pandas.util.testing import assert_frame_equal

import warnings
warnings.filterwarnings("ignore")

In [3]:
!ls weather | head

weather_kord_2001_0101.json
weather_kord_2001_0102.json
weather_kord_2001_0103.json
weather_kord_2001_0104.json
weather_kord_2001_0105.json
weather_kord_2001_0106.json
weather_kord_2001_0107.json
weather_kord_2001_0108.json
weather_kord_2001_0109.json
weather_kord_2001_0110.json


### Load JSON files

In [4]:
def from_json_to_dict(date, path='/Users/Constance/weather/', prefix='weather_kord_2001_'):
    '''
    Reads the weather JSON file for one day and returns its parsed content.

    The file name is built as "prefix" + "date" + ".json" and is looked up
    inside the directory "path".

    Parameters
    ----------
    date: A string in the format MMDD where M = month, D = day of month.

    Optional
    --------
    path: A string. Directory that holds the JSON files.
          A trailing path separator is accepted but not required.
    prefix: A string. Leading part of every file name.

    Returns
    -------
    A dict (the parsed JSON document).

    Raises
    ------
    OSError: if the file cannot be opened (e.g. FileNotFoundError).
    ValueError: if the file does not contain valid JSON.
    '''

    filename = prefix + date + '.json'
    # os.path.join only inserts a separator when "path" does not already end
    # with one, so both '/some/dir' and '/some/dir/' resolve to the same file.
    full_path = os.path.join(path, filename)
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(full_path, 'r', encoding='utf-8') as fin:
        return json.load(fin)

In [5]:
# Sanity checks for from_json_to_dict(): the parsed files for '0101' and '0103'
# must be dicts with a 'history' entry whose 'observations' value is a list.
test_0101_dict = from_json_to_dict('0101')
assert_is_instance(test_0101_dict, dict)
assert_equal('history' in test_0101_dict, True)
assert_equal('observations' in test_0101_dict['history'], True)
assert_is_instance(test_0101_dict['history']['observations'], list)

# Same checks on a second day; test_0103_dict is reused by later test cells.
test_0103_dict = from_json_to_dict('0103')
assert_is_instance(test_0103_dict, dict)
assert_equal('history' in test_0103_dict, True)
assert_equal('observations' in test_0103_dict['history'], True)
assert_is_instance(test_0103_dict['history']['observations'], list)

In [97]:
# Inspect the top-level keys of one day's parsed JSON document.
data = from_json_to_dict('0101')
print(data.keys())

dict_keys(['response', 'current_observation', 'history'])


In [7]:
# Inspect the keys nested under 'history'; 'observations' is the one parsed below.
print(data['history'].keys())

dict_keys(['date', 'utcdate', 'observations', 'dailysummary'])


### Parse time and visibility from JSON

In [8]:
def from_dict_to_visibility(json_data):
    '''
    Extracts the timestamp fields and the visibility of every observation
    stored under json_data['history']['observations'].

    Parameters
    ----------
    json_data: A dict. Each observation must provide a 'date' dict with the
               keys 'mon', 'mday', 'hour' and 'min', plus a 'visi' entry.

    Returns
    -------
    A list of 5-tuples (Month, Day, Hour, Minute, Visibility), one tuple per
    observation, in the order the observations appear. The values are passed
    through exactly as stored in the input.
    '''

    return [
        (
            entry['date']['mon'],
            entry['date']['mday'],
            entry['date']['hour'],
            entry['date']['min'],
            entry['visi'],
        )
        for entry in json_data['history']['observations']
    ]

In [9]:
# Checks for from_dict_to_visibility(): the result is a list of 5-tuples whose
# first two fields (month, day) match the day the file was loaded for.
test_0101_visi = from_dict_to_visibility(test_0101_dict)
assert_is_instance(test_0101_visi, list)
assert_equal(len(test_0101_visi), 24)
for item in test_0101_visi:
    assert_is_instance(item, tuple)
    assert_equal(len(item), 5) # month, day, hour, minute, visibility
    assert_equal(item[0], '01')
    assert_equal(item[1], '01')
    
test_0103_visi = from_dict_to_visibility(test_0103_dict)
assert_is_instance(test_0103_visi, list)
assert_equal(len(test_0103_visi), 34) # some days have more than one measurement per hour
for item in test_0103_visi:
    assert_is_instance(item, tuple)
    assert_equal(len(item), 5)
    assert_equal(item[0], '01')
    assert_equal(item[1], '03')

### Process all 365 files

In [10]:
# Build every "MMDD" string of a non-leap year (2001 has 365 days).
# Months with 31 days:
dates = ['{0:0>2}{1:0>2}'.format(m, d + 1) for m in [1, 3, 5, 7, 8, 10, 12] for d in range(31)]
# Months with 30 days:
dates.extend(['{0:0>2}{1:0>2}'.format(m, d + 1) for m in [4, 6, 9, 11] for d in range(30)])
# February with 28 days:
dates.extend(['02{0:0>2}'.format(d + 1) for d in range(28)])
# Zero-padded strings sort lexicographically into calendar order.
dates.sort()

assert_equal(len(dates), 365)

print("The first five elements are {}".format(dates[:5]))
print("The last five elements are {}".format(dates[-5:]))

The first five elements are ['0101', '0102', '0103', '0104', '0105']
The last five elements are ['1227', '1228', '1229', '1230', '1231']


In [11]:
def collect_365_days(dates):
    '''
    Loads the weather file of every requested day with from_json_to_dict(),
    extracts its observations with from_dict_to_visibility(), and returns all
    observations as one flat list.

    Parameters
    ----------
    dates: A list of strings "MMDD"

    Returns
    -------
    A list of 5-tuples (Month, Day, Hour, Minute, Visibility), ordered first by
    the position of the day in "dates" and then by observation order in the file.
    '''

    visibilities = []
    for mmdd in dates:
        daily_observations = from_dict_to_visibility(from_json_to_dict(mmdd))
        # extend() flattens: each day's tuples are appended individually.
        visibilities.extend(daily_observations)
    return visibilities

In [12]:
# Gather the observations of all days listed in "dates" into one flat list.
visibilities = collect_365_days(dates)

print("The length of visibilities is {}.".format(len(visibilities)))
print("The first five elements of visibilities are {}".format(visibilities[:5]))

The length of visibilities is 10168.
The first five elements of visibilities are [('01', '01', '00', '56', '9.0'), ('01', '01', '01', '56', '7.0'), ('01', '01', '02', '56', '10.0'), ('01', '01', '03', '56', '10.0'), ('01', '01', '04', '56', '9.0')]


In [13]:
# Regression checks on the collected observations: total count plus the exact
# first and last five tuples (i.e. the start of Jan 1 and the end of Dec 31).
assert_is_instance(visibilities, list)
assert_equal(len(visibilities), 10168)
assert_equal(visibilities[:5],
    [('01', '01', '00', '56', '9.0'),
     ('01', '01', '01', '56', '7.0'),
     ('01', '01', '02', '56', '10.0'),
     ('01', '01', '03', '56', '10.0'),
     ('01', '01', '04', '56', '9.0')]
    )
assert_equal(visibilities[-5:],
    [('12', '31', '19', '56', '10.0'),
     ('12', '31', '20', '56', '10.0'),
     ('12', '31', '21', '56', '10.0'),
     ('12', '31', '22', '56', '10.0'),
     ('12', '31', '23', '56', '10.0')]
    )

In [14]:
# Load six columns, selected by position, from '2001.csv'. Later cells rely on
# 'Month', 'DayofMonth', 'CRSDepTime', 'UniqueCarrier', 'DepDelay' and 'Origin'
# being present (presumably these are exactly the six selected -- verify
# against the CSV header).
df = pd.read_csv('2001.csv', encoding='latin-1', usecols=(1, 2, 5, 8, 15, 16))

In [15]:
# Keep only rows whose origin is 'ORD' and whose carrier is 'AA'; the two
# filter columns are constant afterwards, so they are dropped.
local = df[(df['Origin'] == 'ORD') & (df['UniqueCarrier'] == 'AA')]
local = local.drop(['UniqueCarrier', 'Origin'], axis=1)
# Binary label: 1 when DepDelay is greater than 15, otherwise 0.
# The builtin int is used because the np.int alias was deprecated in
# NumPy 1.20 and removed in 1.24; it was the same type as the builtin.
local['Delayed'] = (local['DepDelay'] > 15).astype(int)
# NOTE(review): DepDelay is dropped *before* dropna(), so rows with a missing
# DepDelay are not removed here; they were labelled 0 above because a
# comparison with NaN is False. Kept as-is because downstream reference
# values depend on it -- confirm this is intended.
local = local.drop('DepDelay', axis=1).dropna()

### Convert strings to numbers

In [16]:
def from_string_to_numbers(visibilities):
    '''
    Takes a list of 5-tuples of strings (Month, Day, Hour, Minute, Visibility).
    Converts the first four fields into one integer in the form `mmddHHMM`,
    where `m` is month, `d` is day of month, `H` is hour, and `M` is minute,
    and converts the visibility into a float.
    Returns a pandas.DataFrame with two columns "Time" and "Visibility".

    Each date/time field is zero-padded to two digits before concatenation, so
    the result does not depend on the fields already being two characters wide.
    (Leading zeros of the month vanish in the integer: '01010600' -> 1010600.)

    Parameters
    ----------
    visibilities: A list of 5-tuple of strings.

    Returns
    -------
    A pandas.DataFrame with columns "Time" (int64) and "Visibility" (float64),
    one row per input tuple, in input order.

    Raises
    ------
    ValueError: if a tuple does not have exactly five elements, or if a field
                cannot be converted to a number.
    '''

    rows = []
    for month, day, hour, minute, visi in visibilities:
        # Unpack by position instead of slicing the joined string at a fixed
        # offset, which breaks as soon as any field is not two characters.
        stamp = ''.join(part.zfill(2) for part in (month, day, hour, minute))
        rows.append((stamp, visi))

    result = pd.DataFrame(rows, columns=['Time', 'Visibility'])
    result['Time'] = result['Time'].astype('int64')
    result['Visibility'] = result['Visibility'].astype('float64')
    return result

In [127]:
# Convert the collected string tuples into a numeric DataFrame.
time_visi = from_string_to_numbers(visibilities)
# Note: only the last expression of a cell is displayed, so the unique()
# result below is computed but not shown; the cell output is the Time dtype.
time_visi.Visibility.unique()
time_visi.Time.dtype

dtype('int64')

In [18]:
# Hand-made input for from_string_to_numbers(). The dates need not be real
# calendar days ('02', '31' is not); only the mmddHHMM formatting is tested.
visi0 = [
    ('01', '01', '06', '00', '1.0'),
    ('02', '31', '08', '00', '2.0'),
    ('10', '05', '07', '00', '3.0'),
    ('12', '29', '09', '00', '4.0'),
    ('09', '30', '23', '00', '5.0'),
    ('07', '04', '12', '00', '6.0'),
    ('05', '12', '15', '00', '7.0'),
    ('11', '11', '18', '00', '8.0')
]

# Expected frame: months below 10 lose their leading zero in the integer form.
visi_answer = pd.DataFrame({
    'Time': [1010600, 2310800, 10050700, 12290900,
             9302300, 7041200, 5121500, 11111800],
    'Visibility': [1., 2., 3., 4., 5., 6., 7., 8.]
    })

assert_frame_equal(from_string_to_numbers(visi0), visi_answer)

In [19]:
# Preview the filtered flight rows before the Time column is built.
local.head()

Unnamed: 0,Month,DayofMonth,CRSDepTime,Delayed
398444,1,1,1905,1
398445,1,2,1905,1
398446,1,3,1905,1
398447,1,4,1905,0
398448,1,5,1905,1


### Create a Time column

In [20]:
def combine_time(df):
    '''
    Builds an integer "Time" column of the form mmddHHMM from the columns
    "Month", "DayofMonth" and "CRSDepTime".

    The input frame is not modified. Every column of the returned frame,
    including the pre-existing ones, is cast to int64.

    Parameters
    ----------
    df: A pandas.DataFrame with integer-like columns "Month", "DayofMonth"
        and "CRSDepTime" (HHMM).

    Returns
    -------
    A new pandas.DataFrame: the input columns plus "Time", all as int64.
    '''

    as_text = df.astype(str)
    # Pad day to 2 digits and departure time to 4 so the pieces line up.
    # The month needs no padding: its leading zero would vanish in the integer.
    as_text['DayofMonth'] = as_text['DayofMonth'].str.zfill(2)
    as_text['CRSDepTime'] = as_text['CRSDepTime'].str.zfill(4)
    as_text['Time'] = as_text['Month'] + as_text['DayofMonth'] + as_text['CRSDepTime']
    return as_text.astype('int64')

In [21]:
# Add the mmddHHMM "Time" column to the flight rows and confirm its dtype.
time_delayed = combine_time(local)
print(time_delayed.head())
time_delayed.Time.dtype

        Month  DayofMonth  CRSDepTime  Delayed     Time
398444      1           1        1905        1  1011905
398445      1           2        1905        1  1021905
398446      1           3        1905        1  1031905
398447      1           4        1905        0  1041905
398448      1           5        1905        1  1051905


dtype('int64')

In [22]:
# Hand-made input for combine_time(); as in the visibility test, the dates
# only exercise the formatting and need not be real calendar days.
df0 = pd.DataFrame({
    'Month':      [  1,   2,  10,   12,   9,     7,    5,   11],
    'DayofMonth': [  1,  31,   5,   29,  30,     4,   12,   11],
    'CRSDepTime': [600, 800, 700,  900, 2300, 1200, 1500, 1800]
    })

# Expected result: the input columns plus the combined "Time" column.
df_answer = df0.join(pd.DataFrame({
    'Time': [1010600, 2310800, 10050700, 12290900, 9302300, 7041200, 5121500, 11111800]
    }))

# combine_time() must return a new object rather than the frame it was given.
assert_is_not(combine_time(df0), df0)
assert_frame_equal(combine_time(df0), df_answer)

In [23]:
def match_visibility(df_delayed, df_visibility):
    '''
    Attaches to every row of "df_delayed" the visibility of the observation
    whose "Time" value is numerically closest to the row's "Time".

    Closeness is the absolute difference of the integer Time values; when
    several observations are equally close, the first one in "df_visibility"
    is used. The input frames are not modified.

    Parameters
    ----------
    df_delayed: A pandas.DataFrame with a "Time" column.
    df_visibility: A pandas.DataFrame with "Time" and "Visibility" columns.

    Returns
    -------
    A copy of "df_delayed" with an additional "Visibility" column.
    '''
    observation_times = df_visibility['Time']
    matched_frame = df_delayed.copy()
    # One pass per row: every row is compared against all observation times.
    for label, departure_time in df_delayed['Time'].items():
        nearest = (departure_time - observation_times).abs().idxmin()
        matched_frame.loc[label, 'Visibility'] = df_visibility.loc[nearest, 'Visibility']
    return matched_frame

# Pair every flight with the closest visibility observation.
# match_visibility() iterates over the flights row by row, so this can be slow.
local_visi = match_visibility(time_delayed, time_visi)

print(local_visi.head())

        Month  DayofMonth  CRSDepTime  Delayed     Time  Visibility
398444      1           1        1905        1  1011905        10.0
398445      1           2        1905        1  1021905         9.0
398446      1           3        1905        1  1031905         5.0
398447      1           4        1905        0  1041905         7.0
398448      1           5        1905        1  1051905        10.0


In [24]:
# Drop the date and matching-key columns; what remains is the departure time,
# the visibility, and the "Delayed" label.
local_visi = local_visi.drop(['Month', 'DayofMonth', 'Time'], axis=1)
print(local_visi.head())

        CRSDepTime  Delayed  Visibility
398444        1905        1        10.0
398445        1905        1         9.0
398446        1905        1         5.0
398447        1905        0         7.0
398448        1905        1        10.0


### Split

In [26]:
def split(df, test_column, test_size, random_state):
    '''
    Uses train_test_split to split "df" into a training set and a test set.
    "test_column" names the column(s) we are trying to predict; every other
    column of "df" is used as a feature.

    Parameters
    ----------
    df: A pandas.DataFrame
    test_column: A column label or a list of column labels (the target).
    test_size: A float between 0.0 and 1.0, the proportion of the dataset
               to include in the test split.
    random_state: Passed through to train_test_split
                  (e.g. a numpy.random.RandomState instance).

    Returns
    -------
    A 4-tuple of numpy arrays (X_train, X_test, y_train, y_test).
    The two target arrays are flattened to one dimension.
    '''

    features = df.drop(test_column, axis=1)
    target = df[test_column]
    pieces = train_test_split(features, target, test_size=test_size, random_state=random_state)
    # train_test_split returns [X_train, X_test, y_train, y_test].
    X_train, X_test = [np.array(piece) for piece in pieces[:2]]
    y_train, y_test = [np.array(piece).flatten() for piece in pieces[2:]]
    return X_train, X_test, y_train, y_test

In [27]:
# Hold out 20% of the rows for testing; "Delayed" is the target column.
# The random state is seeded with 0 so the split is reproducible.
X_train, X_test, y_train, y_test = split(
    df=local_visi,
    test_column=['Delayed'],
    test_size=0.2,
    random_state=check_random_state(0)
    )

### Train a Decision Tree model

In [33]:
def fit_and_predict(X_train, y_train, X_test, random_state):
    '''
    Fits a decision tree classifier on the training data and predicts the
    labels of the test attributes.

    Parameters
    ----------
    X_train: Training attributes.
    y_train: Truth labels of the training attributes.
    X_test: Attributes to predict labels for.
    random_state: Passed to tree.DecisionTreeClassifier.

    Returns
    -------
    The predictions for "X_test", as returned by the classifier's predict().
    '''

    classifier = tree.DecisionTreeClassifier(random_state=random_state)
    # fit() returns the fitted estimator, so the prediction can be chained.
    return classifier.fit(X_train, y_train).predict(X_test)

In [34]:
# Train on the training split, predict the held-out rows, and report the
# fraction of predictions that match the true labels.
y_pred = fit_and_predict(X_train, y_train, X_test, random_state=check_random_state(0))
accuracy = accuracy_score(y_test, y_pred)
print('The accuracy score is {:0.2f}.'.format(accuracy))

The accuracy score is 0.82.


In [36]:
# Final checks: the predictions are a numpy array with one entry per test row,
# and the accuracy matches the pinned reference value.
assert_is_instance(y_pred, np.ndarray)
assert_equal(len(y_pred), len(y_test))
assert_almost_equal(accuracy, 0.817207853501)