In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Render matplotlib figures inline, as part of the notebook output.
%matplotlib inline

In [42]:
# Keep the demo outputs short: DataFrames with more than 3 rows are
# displayed truncated instead of in full.
pd.set_option("display.max_rows", 3)

# Reading Data

There are three core pandas data structures: **DataFrame** (2-D table), **Series** (a single column), and **Index** (row and column labels)

In contrast to **NumPy**, pandas supports 2-D heterogeneous data (each column can have its own type). NumPy supports arrays of arbitrary dimensions, but the data must be homogeneous.

In [6]:
# Build the flights DataFrame from the CSV file (pandas infers the gzip
# compression from the ".gz" suffix). The fl_date, arr and dep columns are
# parsed as datetimes rather than left as strings.
flights = pd.read_csv(
    "data/ny-flights.csv.gz",
    parse_dates=["fl_date", "arr", "dep"],
)

`df.values` forces the column values into a NumPy array.

* CAREFUL: columns with mixed types are cast to `object` dtype (which is slower than native NumPy types)

In [11]:
# Extract the "dest" column as a raw NumPy array. The column holds strings,
# so the resulting array has dtype=object rather than a native NumPy dtype.
# NOTE: pandas >= 0.24 recommends `.to_numpy()` over `.values` for this.
flights["dest"].values

array(['LAX', 'LAX', 'LAX', ..., 'IAH', 'IAH', 'ORD'], dtype=object)

## df.loc and Indices

`df.loc` lets you select data by label, using the row and column indices:
    `df.loc[row_labels, column_labels]`

In [12]:
# column labels: an Index holding the name of every column in the frame
flights.columns

Index(['fl_date', 'unique_carrier', 'airline_id', 'tail_num', 'fl_num',
       'origin', 'dest', 'dep_time', 'dep_delay', 'arr_time', 'arr_delay',
       'cancelled', 'arr', 'dep'],
      dtype='object')

In [18]:
# row labels: no index column was specified when reading the CSV, so the
# frame carries a RangeIndex counting up from 0
flights.index

RangeIndex(start=0, stop=20817, step=1)

In [26]:
# Label-based selection: pick the rows labelled 25 and 303 and keep only
# the two named columns.
row_labels = [25, 303]
column_labels = ["fl_date", "origin"]
flights.loc[row_labels, column_labels]

Unnamed: 0,fl_date,origin
25,2014-01-01,LGA
303,2014-01-01,JFK


## Boolean Indexing

NumPy and pandas use **|** for elementwise or, and **&** for elementwise and between two boolean arrays. Wrap each comparison in parentheses, because `|` and `&` bind more tightly than `==`.

In [43]:
# Flights on 2014-01-01 departing from either JFK or LGA.
# Each condition is a boolean Series; `&` combines them element by element.
on_jan_first = flights["fl_date"] == "2014-01-01"
from_jfk_or_lga = flights["origin"].isin(["JFK", "LGA"])
flights[on_jan_first & from_jfk_or_lga]

Unnamed: 0,fl_date,unique_carrier,airline_id,tail_num,fl_num,origin,dest,dep_time,dep_delay,arr_time,arr_delay,cancelled,arr,dep
0,2014-01-01,AA,19805,N338AA,1,JFK,LAX,914.0,14.0,1238.0,13.0,0.0,2014-01-01 12:38:00,2014-01-01 09:14:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,2014-01-01,WN,19393,N792SW,1880,LGA,STL,1245.0,0.0,1419.0,-16.0,0.0,2014-01-01 14:19:00,2014-01-01 12:45:00


## Dropping a Column

`df.drop("column_name", axis=1)` returns a new DataFrame without the given column(s); the original DataFrame is left unchanged

In [44]:
# Show a copy of the frame without the listed columns; `flights` itself
# is not modified because the result is not assigned back.
unwanted_columns = ["fl_date", "unique_carrier", "airline_id"]
flights.drop(unwanted_columns, axis="columns")
# axis="columns" is the same as axis=1; use axis=0 (or axis="index") to drop rows

Unnamed: 0,tail_num,fl_num,origin,dest,dep_time,dep_delay,arr_time,arr_delay,cancelled,arr,dep
0,N338AA,1,JFK,LAX,914.0,14.0,1238.0,13.0,0.0,2014-01-01 12:38:00,2014-01-01 09:14:00
...,...,...,...,...,...,...,...,...,...,...,...
20816,N609MQ,3699,BUF,ORD,1208.0,-12.0,1251.0,-19.0,0.0,2014-01-31 12:51:00,2014-01-31 12:08:00
