# Demo: Pandas

## Two important datatypes in Pandas 
* Series (like a vector or array)
* DataFrame (like a 2-D array or Excel spreadsheet)

In [1]:
import pandas as pd

# Population count for each state, keyed by state name.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
# Build a Series from the dict: the keys become the index labels
# and the counts become the values.
population = pd.Series(data=population_dict)
population

ImportError: No module named pandas

In [2]:
# Area of each state, keyed by the same state names as the
# population data so the two Series share index labels.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
area = pd.Series(data=area_dict)
area

NameError: name 'pd' is not defined

In [None]:
# Build a DataFrame from the two Series created above. Each
# keyword becomes a column name and each Series supplies that
# column's values.
states = pd.DataFrame(data=dict(population=population, area=area))
states
# A bare `states` as the last line of a cell is rendered through
# IPython's display() function, which is why it looks nicer than
# print(states). The explicit equivalent is:
#
#   from IPython.display import display
#   display(states)

In [None]:
# Every DataFrame has an index (its row labels) that we can
# inspect or replace. Here the labels are the state names.
states.index

In [None]:
# View the column labels (here 'population' and 'area').
states.columns

In [None]:
# View a specific column by its label; the result is a Series
# that keeps the DataFrame's index.
states['area'] # or states.area

In [None]:
# Comparing a column against a scalar yields a Boolean Series
# with one True/False entry per row.
# Which states have an area above 150,000?
# NOTE(review): the original note said sq. miles, but the figures
# in area_dict look like square kilometres -- confirm the unit.
large_area = states['area'].gt(150_000)
# Which states have a population above 20,000,000?
large_pop = states['population'].gt(20_000_000)
# Combine the two masks element-wise with & (bitwise AND) and use
# the result to keep only the rows where both conditions hold.
states[large_area & large_pop]
# To filter on a single condition instead: states[large_pop]

In [None]:
# Under the hood, the values in a DataFrame are represented as a
# matrix (2-D array). .values returns just that array, without
# the index or column labels.
states.values

## Reading CSV files into __`pandas`__

In [None]:
# Read data from a CSV file into a DataFrame. With the default
# settings the first line of the file is used for the column names.
data = pd.read_csv('data/agg_database_daily.csv')

In [None]:
# head(n) shows the first n rows (n defaults to 5); here we ask for 10.
data.head(10)

In [None]:
# Show the "shape" of the data as a (rows, columns) tuple.
data.shape

## Don't treat first line as header

In [None]:
# header=None tells pandas that the file has no header line: the
# first line is read as data and the columns are numbered 0, 1, 2, ...
data = pd.read_csv('data/agg_database_daily.csv', header=None)

In [None]:
# The first line of the file now shows up as data row 0.
data.head()

In [None]:
# Compare with the earlier shape: the row count should be one higher,
# because the first line is now counted as data instead of a header.
data.shape

## Specify our own headers/column names

In [None]:
# Column labels are not read-only: assigning a list to .columns
# renames every column at once. The list must contain exactly
# one name per column, in column order.
data.columns = [
    'date_key',
    'datacenter',
    'superpod',
    'pod',
    'max_redo_size',
    'max_active_sessions',
    'max_db_cpu_user',
    'max_peak_buffer',
    'avg_db_cpu_system',
    'avg_db_cpu_user',
    'total_db_size_in_tb',
    'used_db_space_in_tb',
    'free_db_space_in_tb',
    'asm_free_db_space_in_tb',
    'asm_used_db_space_in_tb',
    'asm_total_db_size_in_tb',
    'last_modified',
    'asm_used_db_space_perc',
    'oem_cpu_util',
    'oem_read_io_latency',
]

In [None]:
# Check that the new column names are in place.
data.head()

## What if data are missing?

In [None]:
# Inspect one column as loaded so far, i.e. without any custom
# na_values passed to read_csv.
data['max_active_sessions']

## Handling missing values

In [None]:
# Reload the file, this time telling pandas that the literal
# two-character marker \N (backslash, N) stands for a missing
# value, so those cells are parsed as NaN.
data = pd.read_csv(
    'data/agg_database_daily.csv',
    header=None,
    na_values=[r'\N'],
)
# Re-apply our own column names to the freshly loaded DataFrame.
data.columns = [
    'date_key',
    'datacenter',
    'superpod',
    'pod',
    'max_redo_size',
    'max_active_sessions',
    'max_db_cpu_user',
    'max_peak_buffer',
    'avg_db_cpu_system',
    'avg_db_cpu_user',
    'total_db_size_in_tb',
    'used_db_space_in_tb',
    'free_db_space_in_tb',
    'asm_free_db_space_in_tb',
    'asm_used_db_space_in_tb',
    'asm_total_db_size_in_tb',
    'last_modified',
    'asm_used_db_space_perc',
    'oem_cpu_util',
    'oem_read_io_latency',
]

In [None]:
# Notice anything different here?
# Hint: precision
# (Same column as before, but now read with na_values=[r'\N'].)
data['max_active_sessions']

## Dropping missing values

In [None]:
# dropna() returns a new Series without the missing (NaN) entries;
# the result is not assigned, so `data` itself is left unchanged.
data['max_active_sessions'].dropna()

# Same Idea, Different Source: SQLite Data
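* The `sqlite3` module is part of Python's standard library, so it is normally already available. If the import fails in your environment, you can install it with conda:
  * __`~/anaconda3/bin/conda install -c blaze sqlite3`__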

## First we need to create a connection object...

In [None]:
import sqlite3
# Open a connection to the SQLite database stored in this file.
conn = sqlite3.connect("data/flights.db")

## ...next, create a cursor object and call its execute() method

In [None]:
# The cursor is the object used to execute SQL statements on the
# connection and to fetch their results.
cur = conn.cursor()

In [None]:
# Run the query and pull every returned row into a list of tuples.
# execute() returns the cursor itself, so fetchall() can be chained.
results = cur.execute("select * from airlines limit 25;").fetchall()
results

## We'll use __`pandas`__ to view the data more easily...

In [None]:
import pandas as pd
import sqlite3

# Open the database and let pandas run the query for us; the
# result comes back as a DataFrame with named columns.
conn = sqlite3.connect("data/flights.db")
data = pd.read_sql_query(sql="select * from airlines limit 50;", con=conn)
# The query fetches up to 50 rows; display only the first 25 of them.
data.head(n=25)