## Pandas Database Programming

In [1]:
from nose.tools import assert_equal
import pandas as pd
import sqlite3 as sl
import numpy as np
import os

In [2]:
# Create the working directory for the database file; `-p` makes this a no-op
# if it already exists.
# NOTE(review): the rest of the notebook hard-codes /Users/Constance/w14_p1 --
# presumably `~` expands to /Users/Constance; confirm.
!mkdir -p ~/w14_p1

In [3]:
# Start from a clean slate: delete any database file left by a previous run.
try:
    os.remove("/Users/Constance/w14_p1/p1.db")
except OSError:
    # Best effort only -- e.g. the file does not exist yet; carry on regardless.
    pass


In [4]:
# Path of the SQLite database file passed to create_connector in the cells below.
db = '/Users/Constance/w14_p1/p1.db'

### Problem 1

In [5]:
def create_connector(database):
    '''
    Opens a connection to a sqlite database.

    Parameters
    ----------
    database: str, the path of the database file; it is handed unchanged
        to sqlite3.connect

    Returns
    -------
    a sqlite3.Connection object
    '''

    connection = sl.connect(database)
    return connection

In [6]:
# Problem 1 check: create_connector must hand back a real sqlite3 connection.
c = create_connector(db)
assert_equal(type(c), sl.Connection)
# a freshly opened connection should have no transaction in progress
assert_equal(c.in_transaction, False)

In [None]:
# Release the connection opened for the Problem 1 check.
c.close()

### Problem 2

In [24]:
def csv_to_sql(csv_name, table_name, con):
    '''
    Loads a csv file and stores its contents as a SQL table.

    The csv is read into a DataFrame and written with the default
    `DataFrame.to_sql` settings, so the DataFrame index is stored as an
    extra leading column named `index`.

    Parameters
    ----------
    csv_name: str, a filepath
    table_name: str, a name for the new table
    con: a database connection object

    Returns
    -------
    None
    '''

    pd.read_csv(csv_name).to_sql(table_name, con)

In [27]:
# get airports.csv into a sql database as the airports table
d = create_connector(db)
csv_to_sql('/Users/Constance/airports.csv', 'airports', d)

# get a cursor object
c = d.cursor()
# check that the number of airports is correct
num_apts = c.execute("SELECT COUNT(*) FROM airports").fetchone()
assert_equal(num_apts[0], 3376)
# check that the first airport (in ascending iata order) is "00M";
# position 1 is read because csv_to_sql stores the DataFrame index as
# column 0 -- presumably iata is the first csv column; TODO confirm
one_apt = c.execute("SELECT * FROM airports ORDER BY iata ASC").fetchone()
assert_equal(one_apt[1], "00M")

In [97]:
# Done with the airports-table checks; close the connection.
d.close()

In [29]:
# Show the raw row returned by the COUNT(*) query (a one-element tuple).
print(num_apts)

(3376,)


### Problem 3

In [92]:
def create_query(table, city_col, state_col, city=None, state=None):
    '''
    Creates a SQL query to filter a table by city and state

    Single quotes inside `city` / `state` are doubled so that values such
    as "O'Fallon" yield a valid SQL string literal. `table`, `city_col`
    and `state_col` are inserted verbatim and must come from trusted code;
    for untrusted input prefer a parameterized query.

    Parameters
    ----------
    table: str, a table name
    city_col: str, the name of the city field
    state_col: str, the name of the state field
    city: str or None, the name of the city to filter on
    state: str or None, the name of the state to filter on

    Returns
    -------
    a string representing a valid sql query that filters `table`
    by `city` and `state`; a filter whose value is None is left out,
    and with both None the query selects the whole table
    '''

    def _quote(value):
        # SQL escapes a quote inside a string literal by doubling it
        return "'" + value.replace("'", "''") + "'"

    # one "<column> = '<value>'" condition per filter that was supplied
    conditions = []
    if city is not None:
        conditions.append('{} = {}'.format(city_col, _quote(city)))
    if state is not None:
        conditions.append('{} = {}'.format(state_col, _quote(state)))

    query = 'select * from ' + table
    if conditions:
        query += ' where ' + ' AND '.join(conditions)
    return query

In [95]:
# Problem 3 checks: exercise create_query for every combination of
# city / state being supplied or left as None.
# test when neither are None
q = create_query('airports', 'city', 'state', 'Champaign', 'IL')
# the lower-cased copy lets the keyword check ignore SQL keyword casing;
# the value checks use the original string so the values keep their case
q_lower = q.lower()
assert("city = 'Champaign'" in q)
assert("state = 'IL'" in q)
assert('select * from airports where' in q_lower)
assert('airports' in q)
# test when both are None
q2 = create_query('airports', 'city', 'state')
q2_lower = q2.lower()
assert_equal(q2_lower, 'select * from airports')
# test when state is None
q3=create_query('airports', 'city', 'state', 'Chicago/Waukegan', None)
q3_lower = q3.lower()
assert_equal(q3_lower, "select * from airports where city = 'chicago/waukegan'")
# test when city is None
q4=create_query('airports', 'city', 'state', None, 'MO')
q4_lower = q4.lower()
assert_equal(q4_lower, "select * from airports where state = 'mo'")


### Problem 4

In [104]:
def get_citystate_apts(city, state, con):
    '''
    Fetches the rows of the airports table that match a city and state.

    The query text comes from `create_query` using the `city` and `state`
    columns of the `airports` table; a filter given as None is not applied.

    Parameters
    ----------
    city: str or None, the name of the city to filter on
    state: str or None, the name of the state to filter on
    con: a database connection object

    Returns
    -------
    a dataframe that is the result of the query created by `create_query`
    '''

    return pd.read_sql(
        create_query('airports', 'city', 'state', city, state),
        con,
    )

In [105]:
# Problem 4 checks: run get_citystate_apts against the airports table
# for each combination of city / state filters.
con = create_connector(db)

# check when only specifying state
q_data = get_citystate_apts(None, 'IL', con)
assert_equal(type(q_data), pd.DataFrame)
assert_equal(len(q_data), 88)
assert_equal(len(q_data.state.unique()), 1)
# NOTE(review): this compares the array returned by unique() with a plain
# string; it looks like it relies on the array holding exactly one element
# (asserted on the line above) -- confirm
assert_equal(q_data.state.unique(), "IL")
# only specifying city
q_data2 = get_citystate_apts('Columbia', None, con)
assert_equal(type(q_data2), pd.DataFrame)
assert_equal(len(q_data2), 5)
assert_equal(len(q_data2.state.unique()), 4)
# specifying neither
q_data3 = get_citystate_apts(None, None, con)
assert_equal(type(q_data3), pd.DataFrame)
assert_equal(len(q_data3), 3376)
assert_equal(len(q_data3.state.unique()), 57)
# specifying both
q_data4 = get_citystate_apts("Chicago", "IL", con)
assert_equal(type(q_data4), pd.DataFrame)
assert_equal(len(q_data4), 3)
assert_equal(len(q_data4.state.unique()), 1)

In [106]:
# Done with the Problem 4 checks; close the connection.
con.close()

### Problem 5

In [117]:
# Median departure delay per origin airport, from the 2001 flight records.
# Rows with a missing value in either column are dropped before grouping.
df1 = (
    pd.read_csv('2001.csv', usecols=('DepDelay', 'Origin'))
    .dropna()
    .groupby('Origin', as_index=False)
    .median()
)
# rename to match the airports table so the two frames can be joined on iata
df1.columns = ['iata', 'medianDepDelay']

# Descriptive columns for every airport.
df2 = pd.read_csv('airports.csv', usecols=('iata', 'airport', 'city', 'state'))

# Right join: keep exactly the airports that have delay data.
df = pd.merge(df2, df1, on='iata', how='right')

In [118]:
# Write the merged frame to the AirportDelays table, replacing any earlier
# version and leaving the DataFrame index out of the table.
# The connection previously bound to `d` has already been closed by the time
# this cell runs in notebook order, so a dedicated connection is opened here
# and closed again once the write is done.
delays_con = create_connector(db)
try:
    table = df.to_sql('AirportDelays', delays_con, index=False, if_exists='replace')
finally:
    delays_con.close()

In [142]:
# Open a fresh connection and cursor for the AirportDelays checks.
d = create_connector(db)
c = d.cursor()

In [151]:
# check that the number of airports is correct
num_apts = c.execute("SELECT COUNT(*) FROM AirportDelays").fetchone()
assert_equal(num_apts[0], 231)

# get the champaign data
cmi_data = c.execute("SELECT * FROM AirportDelays WHERE iata = 'CMI'").fetchall()
# there should only be one record
assert_equal(len(cmi_data), 1)

# check the column names and get indices
# (c.description describes the columns of the most recent execute, i.e. the
# SELECT * above, so this must stay directly after that query)
cols = [x[0] for x in c.description]
iata_col = cols.index('iata')
airpt_col = cols.index('airport')
city_col = cols.index('city')
state_col = cols.index('state')
data_col = cols.index('medianDepDelay')
# sort only after the positions were looked up; sorting changes the order
cols.sort()
assert_equal(['airport', 'city', 'iata', 'medianDepDelay', 'state'], cols)

# check the champaign data
assert_equal('Champaign/Urbana', cmi_data[0][city_col])
assert_equal(-2.0, cmi_data[0][data_col])

# get the Dallas Data
dfw_data = c.execute("SELECT * FROM AirportDelays WHERE iata = 'DFW'").fetchall()
assert_equal('Dallas-Fort Worth International', dfw_data[0][airpt_col])
assert_equal('TX', dfw_data[0][state_col])

# get the Boston Data
bos_data = c.execute("SELECT * FROM AirportDelays WHERE iata = 'BOS'").fetchall()
# NOTE(review): the next assertion checks the Dallas row (dfw_data), not the
# Boston row -- it looks like it belongs with the Dallas checks above; confirm
assert_equal('Dallas-Fort Worth', dfw_data[0][city_col])
assert_equal(0.0, bos_data[0][data_col])

In [152]:
# Done with the AirportDelays checks; close the connection.
d.close()

### Cleanup

In [153]:
# Delete the working directory together with the database file inside it.
!rm -rf /Users/Constance/w14_p1/
# NOTE(review): if the rm above succeeded, the file below is already gone, so
# this is expected to raise OSError and be ignored -- presumably kept as a
# fallback; confirm whether it is still needed.
try:
    os.remove("/Users/Constance/w14_p1/p1.db")
except OSError as e:
    pass