## Introduction to Numpy

#### Lec 7 - Creating Arrays

In [None]:
# Creating Numpy arrays
import numpy as np

# Converting from a list
#Lets start with a list

my_list1 = [1,2,3,4]

my_array1 = np.array(my_list1)


In [None]:
#Print out array

my_array

In [None]:
# Make another list
my_list2 = [11,22,33,44]

#Make a list of lists
my_lists = [my_list1,my_list2]

#Make multi-dimensional array
my_array2 = np.array(my_lists)

#Show array
my_array2

In [None]:

#Lets get the size of the array
my_array2.shape

In [None]:
#Find out the data tyoe of the array
my_array2.dtype

In [None]:
#Making special case arrays

#Zeros
np.zeros(5)

In [None]:

#Ones
np.ones((5,5))

In [None]:
# An empty array

np.empty(5)
np.empty((3,4))

In [None]:

#Identity array
np.eye(5)

In [None]:
# Using a range

np.arange(5)

#### Lec 8 - Using arrays and scalars

In [None]:
5/2

In [None]:
#Takes care of floats
from __future__ import division

In [None]:
# Create array
arr1 = np.array([[1,2,3],[8,9,10]])

#Show
arr1

In [None]:
#Multiplying Arrays
arr1*arr1

In [None]:

#Subtraction
arr1-arr1

In [None]:

#Arithmetic operations with scalars on array
1 / arr1

In [None]:
#Exponential operation
arr1 ** 3

#### Lec 9 -Indexing Arrays

In [None]:

#Creating sample array
arr = np.arange(0,11)

In [None]:
#Show
arr

In [None]:

#Get a value at an index
arr[8]

In [None]:
#Get values in a range
arr[1:5]

In [None]:
#Get values in a range
arr[0:5]

In [None]:
#Setting a value with index range (Broadcasting)
arr[0:5]=100

#Show
arr

In [None]:
 
# Reset array, we'll see why i had to reset in  a moment
arr = np.arange(0,11)

#Show
arr

In [None]:
#Important notes on Slices
slice_of_arr = arr[0:6]

#Show slice
slice_of_arr

In [None]:

#Change Slice
slice_of_arr[:]=99

#Show Slice again
slice_of_arr

In [None]:

# Now note the changes also occur in our original array!
arr

# Data is not copied, it's a view of the original array! This avoids memory problems!



In [None]:
#To get a copy, need to be explicit
arr_copy = arr.copy()

arr_copy

In [None]:
# Indexing a 2D array

arr_2d = np.array(([5,10,15],[20,25,30],[35,40,45]))

#Show
arr_2d


In [None]:
#Indexing row
arr_2d[1]

In [None]:
# Format is arr_2d[row][col] or arr_2d[row,col]

# Getting individual element value
arr_2d[1][0]


In [None]:

# Getting individual element value
arr_2d[1,0]

In [None]:
# 2D array slicing

#Shape (2,2) from top right corner
arr_2d[:2,1:]


In [None]:

#Shape bottom row
arr_2d[2]


In [None]:
#Shape bottom row
arr_2d[2,:]

In [None]:
# Fancy Indexing

#Set up matrix
arr2d = np.zeros((10,10))

In [None]:

#Length of array
arr_length = arr2d.shape[1]

In [None]:
#Set up array

for i in range(arr_length):
    arr2d[i] = i
    
arr2d

In [None]:

#Fancy indexing allows the following
arr2d[[2,4,6,8]]

In [None]:

#Allows in any order
arr2d[[6,4,2,7]]

#### Lec 10 - Array Transposition

In [None]:
#Create array
arr = np.arange(50).reshape((10,5))

#Show
arr

In [None]:
#Lets transpose
arr.T

In [None]:
# Taking dot product of matrices
np.dot(arr.T,arr)

In [None]:
# For 3D matrix
arr3d = np.arange(50).reshape((5,5,2))

#Show
arr3d

In [None]:
#We can also transpose a 3d matrix

arr3d.transpose((1,0,2))

In [None]:

# If you need to get more specific use swapaxes
arr = np.array([[1,2,3]])

#Show 
arr

In [None]:
arr.swapaxes(0,1)

#### Lec 11 - Universal Array Function

In [None]:
arr = np.arange(11)

arr

In [None]:
#Taking Square Roots
np.sqrt(arr)

In [None]:

#Calcualting exponential (e^)
np.exp(arr)

In [None]:
# Binary Functions require two arrays

#Random array (normal dist)
A = np.random.randn(10)

A


In [None]:
#Random array (normal dist)
B = np.random.randn(10)
B
  

In [None]:
#Addition
np.add(A,B)

In [None]:
#Finding max or min between two arrays
np.maximum(A,B)
     

In [None]:
#For full and extensive list of all universal functions
website = "http://docs.scipy.org/doc/numpy/reference/ufuncs.html#available-ufuncs"
import webbrowser
webbrowser.open(website)

#### Lec 12 - Array Processing

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#Set array for one side of grid
points = np.arange(-5,5,0.01)

In [None]:
#Create the grid
dx,dy=np.meshgrid(points,points)
     

In [None]:

#Show what one side looks like
dx

In [None]:
# Evaluating Function
z = (np.sin(dx) + np.sin(dy))
z

In [None]:
#Plot out the 2d array
plt.imshow(z)

#Plot with a colorbar
plt.colorbar()

#Give the plot a title
plt.title("Plot for sin(x)+sin(y)")
     

In [None]:
#Lets learn how to use the numpy where

#First the slow way to do things

A = np.array([1,2,3,4])

B= np.array([100,200,300,400])

#Now a boolean array
condition = np.array([True,True,False,False])

#Using a list comprehension
answer = [(A_val if cond else B_val) for A_val,B_val,cond in zip(A,B,condition)]

#Show the answer
answer

#Problems include speed issues and multi-dimensional array issues
     

In [None]:
#Now using numpy.where

answer2 = np.where(condition,A,B)

#Show
answer2
     

In [None]:
#Can use np.where  on 2d for manipulation

from numpy.random import randn

arr = randn(5,5)

#Show arr
arr
    

In [None]:
# Where array is less than zero, make that value zero, otherwise leave it as the array value
np.where(arr < 0,0,arr)


In [None]:
#Other Statistical Processing
arr = np.array([[1,2,3],[4,5,6],[7,8,9]])

arr

In [None]:
#SUM
arr.sum()

In [None]:

#Can also do along an axis (we shold expect a 3 diff between the columns)
arr.sum(0)

In [None]:

#Mean
arr.mean()

In [None]:
#Standard Deviation
arr.std()

In [None]:
#Variance
arr.var()

In [None]:

#Also any and all for processing boolean arrays

bool_arr = np.array([True,False,True])

#For any True
bool_arr.any()

In [None]:

# For all True
bool_arr.all()

In [None]:
# Finally sort array

#Create a random array
arr = randn(5)
#show
arr

In [None]:
#Sort it
arr.sort()
#show
arr

In [None]:
#Lets learn about unique
countries = np.array(['France', 'Germany', 'USA', 'Russia','USA','Mexico','Germany'])

np.unique(countries)
     

In [None]:
# in1d test values in one array
np.in1d(['France','USA','Sweden'],countries)

#### Lec 13 - Array Input and Output

In [None]:
#Create an array
arr = np.arange(5)

In [None]:

#Saving array on disk in binary format (file extension .npy)
np.save('my_array',arr)

In [None]:
#Change arr
arr = np.arange(10)
#Show
arr

In [None]:
#Lets see the original saved copy
np.load('my_array.npy')

In [None]:

#Saving multiple arrays into a zip file
np.savez('two_arrays.npz',x=arr,y=arr)

In [None]:
#Now loading multiple arrays
archive_array = np.load('two_arrays.npz')

#Show
archive_array['x']

In [None]:
#Now lets remove them from the memory
rm my_array.npy
rm two_arrays.npz

In [None]:

#Now saving and loading text files

arr = np.array([[1,2,3],[4,5,6]])
np.savetxt('my_test_text.txt',arr,delimiter=',')

In [None]:

arr = np.loadtxt('my_test_text.txt',delimiter = ',')
arr

#### Lec 14 - Series

In [None]:
import pandas as pd
from pandas import Series,DataFrame


In [None]:
#Lets create a Series (array of data and data labels, its index)

obj = Series([3,6,9,12])

#Show
obj

In [None]:
#Lets show the values
obj.values

In [None]:

#Lets show the index
obj.index

In [None]:
#Now lets create a Series with an index

#WW2 casualties 
ww2_cas = Series([8700000,4300000,3000000,2100000,400000],index=['USSR','Germany','China','Japan','USA'])

#Show
ww2_cas
     

In [None]:
#Now we can use index values to select Series values
ww2_cas['USA']

In [None]:
#Can also check with array operations

#Check who had casualties greater than 4 million
ww2_cas[ww2_cas>4000000]

In [None]:
#Can treat Series as ordered dictionary

#Check if USSR is in Series
'USSR' in ww2_cas
     

In [None]:
#Can convert Series into Python dictionary
ww2_dict = ww2_cas.to_dict()

#Show
ww2_dict
    

In [None]:
#Can convert back into a Series
WW2_Series = Series(ww2_dict)

In [None]:
#Show
WW2_Series

In [None]:
#Passing a dictionary the index will have the dict keys in order
countries = ['China','Germany','Japan','USA','USSR','Argentina']


In [None]:
#Lets redefine a Series
obj2 = Series(ww2_dict,index=countries)
     

#Show
obj2
     

In [None]:
#We can use isnull and notnull to find missing data
pd.isnull(obj2)

#obj2.isnull()

In [None]:
#Same for the opposite
pd.notnull(obj2)

#obj2.notnull()
     

In [None]:
#Lets see the ww2 Series again
WW2_Series

In [None]:
#Lets check our Series with Argentine again
obj2

In [None]:
#Now we can add and pandas automatically aligns data by index
WW2_Series + obj2

In [None]:
#We can give Series names
obj2.name = "World War 2 Casualties"
     
#Show
obj2

In [None]:
#We can also name index
obj2.index.name = 'Countries'
     

#Show
obj2


#### Next we'll learn DataFrames!

#### Lec 15 - DataFrames

In [None]:
#Now we'll learn DataFrames

#Let's get some data to play with. How about the NFL?
import webbrowser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
webbrowser.open(website)

In [None]:
#Copy and read to get data
nfl_frame = pd.read_clipboard()

#Show
nfl_frame

In [None]:
# We can grab the oclumn names with .columns
nfl_frame.columns

In [None]:
#Lets see some specific data columns
DataFrame(nfl_frame,columns=['Team','First Season','Total Games'])

In [None]:
#What happens if we ask for a column that doesn't exist?
DataFrame(nfl_frame,columns=['Team','First Season','Total Games','Stadium'])


In [None]:
# Call columns
nfl_frame.columns

In [None]:
#We can retrieve individual columns
nfl_frame.Team

In [None]:
# Or try this method for multiple word columns
nfl_frame['Total Games']

In [None]:
#We can retrieve rows through indexing
nfl_frame.ix[3]

In [None]:
#We can also assign value sto entire columns
nfl_frame['Stadium']="Levi's Stadium" #Careful with the ' here


In [None]:
nfl_frame

In [None]:
#Putting numbers for stadiums
nfl_frame["Stadium"] = np.arange(5)

#Show
nfl_frame

In [None]:
# Call columns
nfl_frame.columns

In [None]:
#Adding a Series to a DataFrame
stadiums = Series(["Levi's Stadium","AT&T Stadium"],index=[4,0])

In [None]:
#Now input into the nfl DataFrame
nfl_frame['Stadium']=stadiums

#Show
nfl_frame

In [None]:
#We can also delete columns
del nfl_frame['Stadium']

nfl_frame

In [None]:
#DataFrames can be constructed many ways. Another way is from a dictionary of equal length lists
data = {'City':['SF','LA','NYC'],
        'Population':[837000,3880000,8400000]}

city_frame = DataFrame(data)

#Show
city_frame

In [None]:
#For full list of ways to create DataFrames from various sources go to teh documentation for pandas:
website = 'http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.html'
webbrowser.open(website)

#### Lec 16 - Index Objects

In [None]:
#Let's learn/review about Index Objects
my_ser = Series([1,2,3,4],index=['A','B','C','D'])

#Get the index
my_index = my_ser.index
     

#Show
my_index

In [None]:
#Can grab index ranges
my_index[2:]
     

In [None]:
#What happens if we try to change an index value?
my_index[0] = 'Z'

#Excellent! Indexes are immutable

#Next we'll learn about Reindexing.

#### Lec 17 -Reindexing

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn

In [None]:
#Lets create a new series
ser1 = Series([1,2,3,4],index=['A','B','C','D'])
     
#Show
ser1

In [None]:
#Call reindex to rearrange the data to a new index
ser2 = ser1.reindex(['A','B','C','D','E','F'])
     
#Show
ser2

In [None]:
# We can alos fill in values for new indexes
ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)

In [None]:
#Using a particular method for filling values
ser3 = Series(['USA','Mexico','Canada'],index=[0,5,10])

#Show
ser3
     

In [None]:
#Can use a forward fill for interploating values vetween indices 
ser3.reindex(range(15),method='ffill')

In [None]:
#Can use a backward fill for interploating values vetween indices 
ser3.reindex(range(15),method='backfill')

In [None]:
#Reindexing rows, columns or both

#Lets make a datafram ewith some random values
dframe = DataFrame(randn(25).reshape((5,5)),index=['A','B','D','E','F'],columns=['col1','col2','col3','col4','col5'])

#Show
dframe

In [None]:
#Notice we forgot 'C' , lets reindex it into dframe
dframe2 = dframe.reindex(['A','B','C','D','E','F'])


In [None]:
#Can also explicitly reindex columns
new_columns = ['col1','col2','col3','col4','col5','col6']

dframe2.reindex(columns=new_columns)

In [None]:
#Reindex quickly using the label-indexing with ix (we'll see this more in the future)

#Show original
dframe
     

In [None]:
dframe.ix[['A','B','C','D','E','F'],new_columns]
     

#### Lec 18 -Drop Entry

In [None]:
#Create a new series to play with
ser1 = Series(np.arange(3),index=['a','b','c'])

#Show
ser1

In [None]:
#Now let's drop an index
ser1.drop('b')

In [None]:
#With a DataFrame we can drop values from either axis
dframe1 = DataFrame(np.arange(9).reshape((3,3)),index=['SF','LA','NY'],columns=['pop','size','year'])

#Show (remember just random values)
dframe1

In [None]:
#Now dropping a row
dframe1.drop('LA')

In [None]:
#Or we could drop a column

#Need to specify that axis is 1, not 0
dframe1.drop('year',axis=1)

#Next we'll learn about selecting entires in a DataFrame!

#### Lec 19 - Selecting Entries

In [None]:
#Lets try some Series indexing
ser1 = Series(np.arange(3),index=['A','B','C'])

#multiply all values by 2, to avoid confusion in future
ser1 = 2*ser1

#Show
ser1 

In [None]:
#Can grab entry by index name
ser1['B']

In [None]:

#Or grab by index 
ser1[1]

In [None]:

#Can also grab by index range
ser1[0:3]

In [None]:
#Or grab range by range of index values
ser1[['A','B','C']]
     

In [None]:
#Or grab by logic
ser1[ser1>3]

In [None]:
#Can also ser using these methods
ser1[ser1>3] = 10

#Show
ser1

In [None]:
#Now let's see sleection in a DataFrame

dframe = DataFrame(np.arange(25).reshape((5,5)),index=['NYC','LA','SF','DC','Chi'],columns=['A','B','C','D','E'])

#Show
dframe

In [None]:
#Select by column name
dframe['B']

In [None]:
#Select by multiple columns
dframe[['B','E']]

In [None]:
#Can also use boolean
dframe[dframe['C']>8]

In [None]:
#Can also just shoe a boolean DataFrame
dframe> 10

In [None]:
#Can alos use ix as previously discussed to label-index
dframe.ix['LA']
     

In [None]:
#Another example
dframe.ix[1]

#Next we'll learn about data alignment!

#### Lec 20 - Data Alignment

In [None]:
#Lets start by making two Series

ser1 = Series([0,1,2],index=['A','B','C'])

#Show
ser1

In [None]:
#Now second Series 2
ser2 = Series([3,4,5,6],index=['A','B','C','D'])

#Show 
ser2 

In [None]:
#So what happens when we add these together
ser1 + ser2

#Note the NaN values are added in automatically

In [None]:

# Now let's try it with DataFrames!
dframe1 = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),index=['NYC','LA'])

#Show
dframe1
     

In [None]:
#Second DataFrame
dframe2 = DataFrame(np.arange(9).reshape(3,3),columns=list('ADC'),index=['NYC','SF','LA'])

#Show
dframe2

In [None]:
#What happens when we add them together?

dframe1 + dframe2


In [None]:
#What if we want to replace the NaN values
# Then we can use .add()

dframe1.add(dframe2,fill_value=0)

Now we can see that the values are filled, however there was no SF,B value so that is still NaN

Lets learn about operations betwen a Series and a DataFrame

In [None]:
#Show
dframe2

In [None]:
#Create a Series from DataFrame's 0 row
ser3 = dframe2.ix[0]

#Show
ser3
     

In [None]:
#Now we can use arithmetic operations
dframe2-ser3

#Next we'll learn about sorting and ranking!

#### Lec 21 - Rank and Sort

In [None]:
#Sorting by index
ser1 = Series(range(3),index=['C','A','B'])

#show
ser1

In [None]:
#Now sort_index
ser1.sort_index()

In [None]:

#Can sort a Series by its values
ser1.order()

In [None]:
#Lets see how ranking works

from numpy.random import randn
ser2 = Series(randn(10))

#Show
ser2

In [None]:
#This will show you the rank used if you sort the series
ser2.rank()

In [None]:
#Lets sort it now
ser2.sort()

#Show
ser2

In [None]:
#After sorting let's check the rank and see iof it makes sense
ser2.rank()

On the left column we see th original index value and on the right we see it's rank!
Next we'll learn about using descriptive statistics on dataframes!

#### Lec 22 - Summary Statistics

In [None]:
#Let's create a dataframe to work with
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
dframe1 = DataFrame(arr,index=['A','B'],columns = ['One','Two','Three'])

#Show
dframe1

In [None]:
#Let's see the sum() method in action
dframe1.sum()

#Notice how it ignores NaN values

In [None]:

#We can also over rows instead of columns
dframe1.sum(axis=1)

In [None]:
#Can also grab min and max values of dataframe
dframe1.min()

In [None]:
#As well as there index
dframe1.idxmin()


In [None]:
#Same deal with max, just replace min for max
dframe1.idxmax()

In [None]:
#Show
dframe1

In [None]:
#Can also do an accumulation sum
dframe1.cumsum()

In [None]:
#A very useful feature is describe, which provides summary statistics
dframe1.describe()

We can also get information on correlation and covariance

For more info on correlation and covariance, check out the videos below!
     

In [None]:
from IPython.display import YouTubeVideo
# For more information about Covariaance and Correlation
# Check out these great videos!
# Video credit: Brandon Foltz.

#CoVariance
YouTubeVideo('xGbpuFNR1ME')

In [None]:
#Correlation
YouTubeVideo('4EXNedimDMs')
  

In [None]:
#Now lets check correlation and covariance on some stock prices!

#Pandas can get info off the web
import pandas.io.data as pdweb

#Set datetime for date input
import datetime

#Get the closing prices

prices = pdweb.get_data_yahoo(['CVX','XOM','BP'], 
                               start=datetime.datetime(2010, 1, 1), 
                               end=datetime.datetime(2013, 1, 1))['Adj Close']
#Show preview
prices.head()
     

In [None]:
#Now lets get the volume trades

volume = pdweb.get_data_yahoo(['CVX','XOM','BP'], 
                               start=datetime.datetime(2010, 1, 1), 
                               end=datetime.datetime(2013, 1, 1))['Volume']

#Show preview
volume.head()

In [None]:
#Lets get the return
rets = prices.pct_change()
     

#Get the correlation of the stocks
corr = rets.corr

In [None]:
#Lets see the prices over time to get a very rough idea of the correlation between the stock prices
prices.plot()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#As expected pretty strong correlations with eachother
sns.heatmap(rets.corr())

#We'll learn much more about seaborn later!

In [None]:

# We can also check for unique values and their counts 

#For example
ser1 = Series(['w','w','x', 'y', 'z' ,'w' ,'w' ,'x' ,'x' ,'y' ,'a' ,'z' ])

#Show
ser1

In [None]:
#Grab the unique values
ser1.unique()
     

In [None]:
#Now get the count of the unique values
ser1.value_counts()
     
#Next we'll learn how to best deal with missing data!

#### Lec 23 - Missing Data.

In [None]:
#Now we'll learn how to deal with missing data, a very common task when analyzing datasets!

data = Series(['one','two', np.nan, 'four'])
     

#Show data
data

In [None]:
#Find the missing values
data.isnull()

In [None]:
#We can simply drop the NAN 
data.dropna()

In [None]:
# In a DataFrame we need to be a little more careful!

dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
  

In [None]:
#Show
dframe

In [None]:
clean_dframe = dframe.dropna()
     

#Show
clean_dframe
     

In [None]:
#Note all rows where an NA occured was a drop of the entire row
     

#We can also specify to only drop rows that are complete missing all data
dframe.dropna(how='all')

In [None]:
#Or we can specify to drop columns with missing data
dframe.dropna(axis=1)

#This should drop all columns out since every column contains at least 1 NAN
 

In [None]:
#We can also threshold teh missing data as well

#For example if we only want rows with at least 3 data points
dframe2 = DataFrame([[1,2,3,np.nan],[2,np.nan,5,6],[np.nan,7,np.nan,9],[1,np.nan,np.nan,np.nan]])

#Show
dframe2
     

In [None]:
#Droppin any rows tht dont have at least 2 data points
dframe2.dropna(thresh=2)
     

In [None]:
#Dropiing rows without at least 3 data points
dframe2.dropna(thresh=3)
     

In [None]:
#We can also fill any NAN
dframe2.fillna(1)

In [None]:
#Can also fill in diff values for diff columns
dframe2.fillna({0:0,1:1,2:2,3:3})
 

In [None]:
#Note that we still have access to the original dframe
dframe2

In [None]:
#If we want to modify the exsisting object, use inplace
dframe2.fillna(0,inplace=True)
 

In [None]:
#Now let's see the dframe
dframe2

#Awesome! Next we'll learn about Index Hierarchy

#### Lec 24 - Index Hierarchy

In [None]:
from numpy.random import randn

In [None]:
#Now we'll learn about Index Hierarchy

#pandas allows you to have multiple index levels, which is very clear with this example:

ser = Series(np.random.randn(6),index=[[1,1,1,2,2,2],['a','b','c','a','b','c']])
 

In [None]:
#Show Series with multiple index levels
ser

In [None]:
# We can check the multiple levels
ser.index

In [None]:
#Now we can sleect specific subsets
ser[1]

In [None]:
# We can also select from an internal index level
ser[:,'a']

In [None]:
# We can also create Data Frames from Series with multiple levels
dframe = ser.unstack()

#Show
dframe

In [None]:
#Can also reverse
dframe.unstack()

In [None]:
# We can also apply multiple level indexing to DataFrames
dframe2 = DataFrame(np.arange(16).reshape(4,4),
                    index=[['a','a','b','b'],[1,2,1,2]],
                    columns=[['NY','NY','LA','SF'],['cold','hot','hot','cold']])
                                                   
dframe2  

In [None]:
# We can also give these index levels names

#Name the index levels
dframe2.index.names = ['INDEX_1','INDEX_2']

#Name the column levels
dframe2.columns.names = ['Cities','Temp']

dframe2

In [None]:
# We can also interchange level orders (note the axis=1 for columns)
dframe2.swaplevel('Cities','Temp',axis=1)

In [None]:
#We can also sort levels
dframe2.sortlevel(1)

#Note the change in sorting, now the Dframe index is sorted by the INDEX_2

In [None]:

#We can also perform operations on particular levels
dframe2.sum(level='Temp',axis=1)

#### Lec 25 - Reading and Writing Text Files

In [None]:
# Can open csv files as a dataframe
dframe = pd.read_csv('lec25.csv')

#Show
dframe

In [None]:
 #If we dont want the header to be the first row
dframe = pd.read_csv('lec25.csv',header=None)

#Show
dframe

In [None]:
# We can also indicate a particular number of rows to be read
pd.read_csv('lec25.csv',header=None,nrows=2)

In [None]:

# Let's see dframe again
dframe

In [None]:
# Now let's see how we can write DataFrames out to text files
dframe.to_csv('mytextdata_out.csv')

#You'll see this file where you're ipython Notebooks are saved (Usually under my documents)
 

In [None]:
#  We can also use other delimiters

#we'll import sys to see the output
import sys 

#Use sys.stdout to see the output directly and not save it
dframe.to_csv(sys.stdout,sep='_')
  

In [None]:
# Just to make sure we understand the delimiter
dframe.to_csv(sys.stdout,sep='?')
 

In [None]:
#We can also choose to write only a specific subset of columns
dframe.to_csv(sys.stdout,columns=[0,1,2])
 

#You should also check out pythons built-in csv reader and writer for more info:
 https://docs.python.org/2/library/csv.html
 

#### Lec 26 - JSON with Python

In [None]:
# Heres an example of what a JSON (JavaScript Object Notation) looks like:
json_obj = """
{   "zoo_animal": "Lion",
    "food": ["Meat", "Veggies", "Honey"],
    "fur": "Golden",
    "clothes": null, 
    "diet": [{"zoo_animal": "Gazelle", "food":"grass", "fur": "Brown"}]
}
"""

In [None]:
#Let import json module
import json

#Lets load json data
data = json.loads(json_obj)
     

In [None]:
#Show
data

In [None]:

#WE can also convert back to JSON
json.dumps(data)

In [None]:
#We can simply open JSON data after loading with a DataFrame
dframe = DataFrame(data['diet'])
     

#Show
dframe

#### Lec 27 - HTML with Python

In [None]:
#Lets grab a url for list of failed banks
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'


"""
IMPORTANT NOTE: NEED TO HAVE beautiful-soup INSTALLED as well as html5lib !!!!

"""

In [None]:
# Grab data from html and put it intop a list of DataFrame objects!
dframe_list = pd.io.html.read_html(url)
     

#Grab the first list item from the data base and set as a DataFrame
dframe = dframe_list[0]
     

#Show
dframe

In [None]:
dframe.columns.values

#### Lec 28 - Excel with Python

#Now we'll learn how to work with excel files
     

"""
IMPORTANT NOTE: NEED TO HAVE xlrd AND openpyxl INSTALLED!!!
"""

In [None]:
# Open the excel file as an object
xlsfile = pd.ExcelFile('Lec_28_test.xlsx')
     

In [None]:
# Parse the first sheet of the excel file and set as DataFrame
dframe = xlsfile.parse('Sheet1')
     

#Show!
dframe

#### Lec 29 - Merge

In [None]:
# Let's make a dframe

dframe1 = DataFrame({'key':['X','Z','Y','Z','X','X'],'data_set_1': np.arange(6)})

#Show
dframe1

In [None]:
#Now lets make another dframe

dframe2 = DataFrame({'key':['Q','Y','Z'],'data_set_2':[1,2,3]})

#Show
dframe2

In [None]:
# Now we can use merge the dataframes, this is a "many-to-one" situation

# Merge will automatically choose overlapping columns to merge on
pd.merge(dframe1,dframe2)

#Note no overlapping 'X's

In [None]:
# We could have also specified which column to merge on
pd.merge(dframe1,dframe2,on='key')
     

In [None]:
# We can choose which DataFrame's keys to use, this will choose left (dframe1)
pd.merge(dframe1,dframe2,on='key',how='left')
     

In [None]:
# Choosing the one on the right (dframe2)
pd.merge(dframe1,dframe2,on='key',how='right')

In [None]:
#Choosing the "outer" method selects the union of both keys
pd.merge(dframe1,dframe2,on='key',how='outer')

In [None]:
#Now we'll learn about a many to many merge

# Nnote that these DataFrames contain more than one instance of the key in BOTH datasets

dframe3 = DataFrame({'key': ['X', 'X', 'X', 'Y', 'Z', 'Z'],
                 'data_set_3': range(6)})
dframe4 = DataFrame({'key': ['Y', 'Y', 'X', 'X', 'Z'],
                 'data_set_4': range(5)})

#Show the merge
pd.merge(dframe3, dframe4)

So what happened? A many to many merge results in the product of the rows. Because there were 3 'X's in dframe3 and 2 'X's in dframe4 there ended up being a total of 6 'X' rows in the result (2*3=6)! Note how dframe3 repeats its 0,1,2 values for 'X' and dframe4 repeats its '2,3' pairs throughout the key set.

In [None]:
# We can also merge with multiple keys!

# Dframe on left
df_left = DataFrame({'key1': ['SF', 'SF', 'LA'],
                  'key2': ['one', 'two', 'one'],
                  'left_data': [10,20,30]})

#Dframe on right
df_right = DataFrame({'key1': ['SF', 'SF', 'LA', 'LA'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'right_data': [40,50,60,70]})

#Merge
pd.merge(df_left, df_right, on=['key1', 'key2'], how='outer')
 

In [None]:
# Now using the above you can check mulitple data sets for multiple key combos, for instance what did the left data set have for LA,one?
# Answer =  60

In [None]:

#Note that the left and right DataFrames have overlapping key names (key1 and key2).
# pandas automatically adds suffixes to them

pd.merge(df_left,df_right,on='key1')

In [None]:
# We can also specify what the suffix becomes
pd.merge(df_left,df_right, on='key1',suffixes=('_lefty','_righty'))


 For more info on merge parameters check out:
url = 'http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.merge.html'

Next we'll learn how to merge on Index!


#### Lec 30 -Merge on Index.
     

In [None]:
# Lets get two dframes

df_left = DataFrame({'key': ['X','Y','Z','X','Y'],
                  'data': range(5)})
df_right = DataFrame({'group_data': [10, 20]}, index=['X', 'Y'])
    

In [None]:

#Show
df_left

In [None]:
#Show
df_right

In [None]:
# We can also get a union by using outer
pd.merge(df_left,df_right,left_on='key',right_index=True,how='outer')
  

In [None]:
#Now merge, we'll use the key for the left Dframe, and the index for the right
pd.merge(df_left,df_right,left_on='key',right_index=True)
 

In [None]:
#Now let's try something a little more complicated, remember hierarchal index?
df_left_hr = DataFrame({'key1': ['SF','SF','SF','LA','LA'],
                   'key2': [10, 20, 30, 20, 30],
                   'data_set': np.arange(5.)})
df_right_hr = DataFrame(np.arange(10).reshape((5, 2)),
                   index=[['LA','LA','SF','SF','SF'],
                          [20, 10, 10, 10, 20]],
                   columns=['col_1', 'col_2'])

     

In [None]:
#Show, this has a index hierarchy
df_right_hr
     

In [None]:
# Now we can merge the left by using keys and the right by its index
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True)
 

In [None]:
# We can alo keep a union by choosing 'outer' method
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True,how='outer')


In [None]:
# WE can also you .join()

# Shown on our first two DataFrames
df_left.join(df_right)

# Next we'll learn about the concatenate function!
     

#### Lec 31 - Concatenate

In [None]:
# First in just Numpy
     

# Create a matrix 
arr1 = np.arange(9).reshape((3,3))

# Show
arr1

In [None]:
# Concatenate along axis 1
np.concatenate([arr1,arr1],axis=1)

In [None]:
# Let's see other axis options
np.concatenate([arr1,arr1],axis=0)
 

In [None]:
# Now let's see how this works in pandas
     

# Lets create two Series with no overlap
ser1 =  Series([0,1,2],index=['T','U','V'])

ser2 = Series([3,4],index=['X','Y'])

#Now let use concat (default is axis=0)
pd.concat([ser1,ser2])

In [None]:
# Now passing along another axis will produce a DataFrame
pd.concat([ser1,ser2],axis=1)
 

In [None]:
# We can specify which specific axes to be used
pd.concat([ser1,ser2],axis=1,join_axes=[['U','V','Y']])
 

In [None]:
# Lets say we wanted to add markers.keys to the concatenation result

# WE can do this with a hierarchical index
pd.concat([ser1,ser2],keys=['cat1','cat2'])
 

In [None]:
# Along the axis=1 then these Keys become column headers
pd.concat([ser1,ser2],axis=1,keys=['cat1','cat2'])
 

In [None]:
#Lastly, everything works similarly in DataFrames

dframe1 = DataFrame(np.random.randn(4,3), columns=['X', 'Y', 'Z'])
dframe2 = DataFrame(np.random.randn(3, 3), columns=['Y', 'Q', 'X'])
 

In [None]:
#Concat on DataFrame
pd.concat([dframe1,dframe2])
 

In [None]:
#If we dont care about the index info and just awnt to make a complete DataFrame, just use ignore_index
pd.concat([dframe1,dframe2],ignore_index=True)
 

For more info in documentation:
url='http://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html'
     
Next up: More on Combining DataFrames with Overlapping Indexes!

#### Lec 32 - Combining DataFrames
  

In [None]:
#Lets make some Series to work with

#First Series
ser1 = Series([2,np.nan,4,np.nan,6,np.nan],
           index=['Q','R','S','T','U','V'])

#Second Series (based off length of ser1)
ser2 = Series(np.arange(len(ser1), dtype=np.float64),
           index=['Q','R','S','T','U','V'])

ser2[-1] = np.nan

In [None]:
ser1

In [None]:
ser2

In [None]:
# Now let's get a series where the value of ser1 is chosen if ser2 is NAN,otherwise let the value be ser1
Series(np.where(pd.isnull(ser1),ser2,ser1),index=ser1.index)
 
#Take a moment to really understand how the above worked
 

In [None]:
#Now we can do the same thing simply by using combine_first with pandas
ser1.combine_first(ser2)

#This combines the Series values, choosing the values of the calling Series first, unless its a NAN
 

In [None]:
#Now lets how this works on a DataFrame!
     

#Lets make some 
dframe_odds = DataFrame({'X': [1., np.nan, 3., np.nan],
                     'Y': [np.nan, 5., np.nan, 7.],
                     'Z': [np.nan, 9., np.nan, 11.]})
dframe_evens = DataFrame({'X': [2., 4., np.nan, 6., 8.],
                     'Y': [np.nan, 10., 12., 14., 16.]})


In [None]:
#Show
dframe_odds

In [None]:
#Show
dframe_evens

In [None]:
#Now lets combine using odds values first, unless theres a NAN, then put the evens values
dframe_odds.combine_first(dframe_evens)
 

#### Lec 33 - Reshaping

In [None]:
#Let's see how stack and unstack work

# Create DataFrame
dframe1 = DataFrame(np.arange(8).reshape((2, 4)),
                 index=pd.Index(['LA', 'SF'], name='city'),
                 columns=pd.Index(['A', 'B', 'C','D'], name='letter'))
#Show
dframe1

In [None]:
# Use stack to pivot the columns into the rows
dframe_st = dframe1.stack()

#Show
dframe_st

In [None]:
#We can always rearrange back into a DataFrame
dframe_st.unstack()

In [None]:
#We can choose which level to unstack by
dframe_st.unstack(0)

In [None]:
# Also by which name to unstack by
dframe_st.unstack('letter')

In [None]:
# Also by which name to unstack by
dframe_st.unstack('city')

In [None]:
# Let's see how stack and unstack handle NAN

#Make two series
ser1 = Series([0, 1, 2], index=['Q', 'X', 'Y'])
ser2 = Series([4, 5, 6], index=['X', 'Y', 'Z'])

#Concat to make a dframe
dframe = pd.concat([ser1, ser2], keys=['Alpha', 'Beta'])

# Unstack resulting DataFrame
dframe.unstack()

In [None]:

# Now stack will filter out NAN by default
dframe.unstack().stack()

In [None]:
# IF we dont want this we can set it to False
dframe.unstack().stack(dropna=False)

#### Lec 34 - Pivoting.

In [None]:
# Lets create some data to play with:

# Note: It is not necessary to understand how this dataset was made to understand this Lecture.

#import pandas testing utility
import pandas.util.testing as tm; tm.N = 3

#Create a unpivoted function
def unpivot(frame):
    N, K = frame.shape
    
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}
    
    # Return the DataFrame
    return DataFrame(data, columns=['date', 'variable', 'value'])

#Set the DataFrame we'll be using
dframe = unpivot(tm.makeTimeDataFrame())

In [None]:
#Show the "stacked" data, note how there are multiple variables and values for the dates
dframe

In [None]:
# Now let's pivot the data

# First two value spassed are teh row and column indexes, then finally an optional fill value
dframe_piv = dframe.pivot('date','variable','value')

#Show
dframe_piv

#### Lec 35 - Duplicates in DataFrames

In [None]:
#Lets get a dataframe with duplicates

dframe = DataFrame({'key1': ['A'] * 2 + ['B'] * 3,
                  'key2': [2, 2, 2, 3, 3]})

#Show
dframe

In [None]:
#We can use duplicated to find duplicates
dframe.duplicated()

In [None]:
# We can also drop duplicates like this:
dframe.drop_duplicates()

In [None]:
#You can filter which duplicates to drop by a single column
dframe.drop_duplicates(['key1'])

In [None]:
#Show original
dframe
     

In [None]:
#By default the first value was taken for the duplicates, we can also take the last value instead
dframe.drop_duplicates(['key1'],take_last=True)

#### Lec 36 - Mapping

In [None]:
# Let's create a dframe to work with (Highest elevation cities in USA)
dframe = DataFrame({'city':['Alma','Brian Head','Fox Park'],
                    'altitude':[3158,3000,2762]})

#Show
dframe

In [None]:
#Now let's say we wanted to add a column for the States, we can do that with a mapping.
state_map={'Alma':'Colorado','Brian Head':'Utah','Fox Park':'Wyoming'}


In [None]:
# Now we can map that data to our current dframe
dframe['state'] = dframe['city'].map(state_map)
     

#Show result
dframe

# Mapping is a great way to do element-wise transfomations and other data cleaning operations!


#### Lec 37 - Replace

In [None]:
# Lets make  Series
ser1 = Series([1,2,3,4,1,2,3,4])
#Show
ser1

In [None]:
# Using replace we can select --> .replace(value to be replaced, new_value)
ser1.replace(1,np.nan)

In [None]:
#Can also input lists
ser1.replace([1,4],[100,400])
     

In [None]:
#Can also input dictionary
ser1.replace({4:np.nan})
     

#### Lec 38 - Rename Index

In [None]:

# Making a DataFrame
dframe= DataFrame(np.arange(12).reshape((3, 4)),
                 index=['NY', 'LA', 'SF'],
                 columns=['A', 'B', 'C', 'D'])

#Show
dframe

In [None]:
# Just like a Series, axis indexes can also use map

#Let's use map to lowercase the city initials
dframe.index.map(str.lower)

In [None]:
# If you want to assign this to the actual index, you can use index
dframe.index = dframe.index.map(str.lower)
#Show
dframe

In [None]:
# Use rename if you want to create a transformed version withour modifying the original!

#str.title will capitalize the first letter, lowercasing the columns
dframe.rename(index=str.title, columns=str.lower)

In [None]:
# We can also use rename to insert dictionaries providing new values for indexes or columns!
dframe.rename(index={'ny': 'NEW YORK'},
            columns={'A': 'ALPHA'})

In [None]:
# If you would like to actually edit the data set in place, set inplace=True
dframe.rename(index={'ny': 'NEW YORK'}, inplace=True)
dframe

#### Lec 39 - Binning

In [None]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]
     

In [None]:
# We can seperate these years by decade
decade_bins = [1960,1970,1980,1990,2000,2010,2020]

In [None]:
#Now we'll use cut to get somethign called a Category object
decade_cat = pd.cut(years,decade_bins)

In [None]:
#Show
decade_cat

In [None]:
# We can check the categories using .categories
decade_cat.categories

In [None]:
# Then we can check the value counts in each category
pd.value_counts(decade_cat)

In [None]:
# We can also pass data values to the cut.

#For instance, if we just wanted to make two bins, evenly spaced based on max and min year, with a 1 year precision
pd.cut(years,2,precision=1)

In [None]:
# Thats about it for binning basics
# One last thing to note, jus tlike in standard math notation, when setting up bins:
# () means open, while [] means closed/inclusive
     

#### Lec 40 - Outliers

In [None]:

# Let's see how we would find outliers in a dataset

# First we'll seed the numpy generator
np.random.seed(12345)

#Next we'll create the dataframe
dframe = DataFrame(np.random.randn(1000,4))

In [None]:
#Show preview
dframe.head()

In [None]:
# Lets describe the data
dframe.describe()
     

In [None]:

# Lets select the first column
col = dframe[0]

In [None]:
# NOw we can check which values in the column are greater than 3, for instance.
col[np.abs(col)>3]

In [None]:
# So we now know in column[0], rows 523 and 900 have values with abs > 3

#How about all the columns?

# We can use the "any" method
dframe[(np.abs(dframe)>3).any(1)]

In [None]:
# WE could also possibly cap the data at 3

dframe[np.abs(dframe)>3] = np.sign(dframe) *3

In [None]:
dframe.describe()
     

#### Lec 41 - Permutation

In [None]:
# WE can randomly reorder (permutate) a Series, or the rows in a DataFrame

#Let's take a look
dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))

#Create an array with a random perumation of 0,1,2,3
blender = np.random.permutation(4)

blender

In [None]:
dframe

In [None]:
# Now permutate the dframe based on the blender
dframe.take(blender)

In [None]:
# Now what if we want permuations WITH replacement
     

# Let imagine a box with 3 marbles in it: labeled 1, 2, and 3
box = np.array([1,2,3])

# Now lets create a random permuation WITH replacement using randint
shaker = np.random.randint(0, len(box), size=10)


In [None]:
# Let's check teh box "shaker"
shaker

In [None]:
#Now lets grab form the box
hand_grabs = box.take(shaker)

#show
hand_grabs

Congratulations! We're all done with this Section. Up next: Working with Data Part 3 !!!

#### Lec 42 - GroupBy on DataFrames

In [None]:
#Let's make a dframe
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})

#Show
dframe

In [None]:
#Now let's see how to use groupby

#Lets grab the dataset1 column and group it by the k1 key
group1 = dframe['dataset1'].groupby(dframe['k1'])

#Show the groupby object
group1

In [None]:
#Now we can perform operations on this particular group
group1.mean()

In [None]:

# We can use group keys that are series as well

#For example:

#We'll make some arrays for use as keys
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

#Now using the data from dataset1, group the means by city and month
dframe['dataset1'].groupby([cities,month]).mean()

In [None]:
# let's see the original dframe again.
dframe

In [None]:

# WE can also pass column names as group keys
dframe.groupby('k1').mean()

In [None]:
# Or multiple column names
dframe.groupby(['k1','k2']).mean()

In [None]:

# Another useful groupby method is getting the group sizes
dframe.groupby(['k1']).size()

In [None]:
# We can also iterate over groups

#For example:
for name,group in dframe.groupby('k1'):
    print "This is the %s group" %name
    print group
    print '\n'

In [None]:
# We can also iterate with multiple keys
for (k1,k2) , group in dframe.groupby(['k1','k2']):
    print "Key1 = %s Key2 = %s" %(k1,k2)
    print group
    print '\n'

In [None]:
# A possibly useful tactic is creating a dictionary of the data pieces 
group_dict = dict(list(dframe.groupby('k1')))

#Show the group with X
group_dict['X']

In [None]:
# We could have also chosen to do this with axis = 1

# Let's creat a dictionary for dtypes of objects!
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))

#show
group_dict_axis1

In [None]:
# Next we'll learn how to use groupby with columns

# For example if we only wanted to group the dataset2 column with both sets of keys
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

#Next we'll have a quick lesson on grouping with dictionaries and series!


#### Lec 43 - Groupby on Dict and Series

In [None]:

# Let's make a Dframe

animals = DataFrame(np.arange(16).reshape(4, 4),
                   columns=['W', 'X', 'Y', 'Z'],
                   index=['Dog', 'Cat', 'Bird', 'Mouse'])

#Now lets add some NAN values
animals.ix[1:2, ['W', 'Y']] = np.nan 

#Show
animals

In [None]:
# Now let's say I had a dictionary with ebhavior values in it
behavior_map = {'W': 'good', 'X': 'bad', 'Y': 'good','Z': 'bad'}


In [None]:
# Now we can groupby using that mapping
animal_col = animals.groupby(behavior_map, axis=1)

# Show the sum accroding to the groupby with the mapping
animal_col.sum()

# For example [dog][good] = [dog][Y]+[dog][W]


In [None]:
# Now let's try it with a Series
behav_series = Series(behavior_map)

#Show
behav_series

In [None]:
#Now let's groupby the Series

animals.groupby(behav_series, axis=1).count()
 

In [None]:
# We can also groupby with functions!

#Show our dframe again
animals

In [None]:
# Lets assume we wanted to group by the length of the animal names, we can pass the len function into groupby!

# Show
animals.groupby(len).sum()

#Note the index is now number of letters in the animal name
 

In [None]:
# We can also mix functions with arrays,dicts, and Series for groupby methods

# Set a list for keys
keys = ['A', 'B', 'A', 'B']

# Now groupby length of name and the keys to show max values
animals.groupby([len, keys]).max()
 

In [None]:

# We can also use groupby with hierarchaly index levels

#Create a hierarchal column index
hier_col = pd.MultiIndex.from_arrays([['NY','NY','NY','SF','SF'],[1,2,3,1,2]],names=['City','sub_value'])

# Create a dframe with hierarchal index
dframe_hr = DataFrame(np.arange(25).reshape(5,5),columns=hier_col)

#Multiply values by 100 for clarity
dframe_hr = dframe_hr*100

#Show
dframe_hr

#### Lec 44 - Aggregation

In [None]:
# Data Agrregation consists of operations that result in a scalar (e.g. mean(),sum(),count(), etc)

#Let's get a csv data set to play with
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/'


# Save thewinquality.csv file in the same folder as your ipython notebooks, note the delimiter used ;
dframe_wine = pd.read_csv('winequality_red.csv',sep=';')
     

# Let's get a preview
dframe_wine.head()

In [None]:
# How about we find out the average alcohol content for the wine
dframe_wine['alcohol'].mean()


In [None]:
# That was an example of an aggregate, how about we make our own?
def max_to_min(arr):
    return arr.max() - arr.min()

# Let's group the wines by "quality"
wino = dframe_wine.groupby('quality')

# Show
wino.describe()

In [None]:
# We can now apply our own aggregate function, this function takes the max value of the col and subtracts the min value of the col
wino.agg(max_to_min)

In [None]:
# We can also pass string methods through aggregate
wino.agg('mean')

In [None]:
# Let's go back to the original dframe
dframe_wine.head()

In [None]:
# Let's adda  quality to alcohol content ratio
dframe_wine['qual/alc ratio'] = dframe_wine['quality']/dframe_wine['alcohol']
     
# Show
dframe_wine.head()

In [None]:
# WE can also use pivot tables instead of groupby

# Pivot table of quality
dframe_wine.pivot_table(index=['quality'])

In [None]:
%matplotlib inline
dframe_wine.plot(kind='scatter',x='quality',y='alcohol')


We can see that the data is probably better fit for a box plot for a more concise view of the data See if you can figure how to get a boxplot using the pandas documentation and what you have learned so far

Don't worry if you can't quite figure it out just yet, the next section will cover all sorts of data visualizations!

#### Lec 45 - Splitting, Applying and Combining

In [None]:
# Let's grab the wine data again
dframe_wine = pd.read_csv('winequality_red.csv',sep=';')

#Preview
dframe_wine.head()

What if we wanted to know the highest alcohol content for each quality range?

We can use groupby mechanics to split-apply-combine

In [None]:
# Create a function that assigns a rank to each wine based on alcohol content, with 1 being the highest alcohol content
def ranker(df):
    df['alc_content_rank'] = np.arange(len(df)) + 1
    return df
     

In [None]:
# Now sort the dframe by alcohol in ascending order
dframe_wine.sort('alcohol',ascending=False,inplace=True)

# Now we'll group by quality and apply our ranking function
dframe_wine = dframe_wine.groupby('quality').apply(ranker)
     

In [None]:
#Preview
dframe_wine.head()
     

In [None]:
# Now finally we can just call for the dframe where the alc_content_rank == 1

# Get the numebr of quality counts
num_of_qual = dframe_wine['quality'].value_counts()

#Show
num_of_qual

In [None]:
# Now we'll show the combined info for teh wines that had the highest alcohol content for their respective rank!
dframe_wine[dframe_wine.alc_content_rank == 1].head(len(num_of_qual))
 
# Awesome! Ask yourself if there are any trends you would like to find in this data?
# Is there a relationship between wine ranking and alcohol content?
 

#### Lec 46 - Cross-Tabulation

In [None]:
# Let's create a quick data set
from StringIO import StringIO

data ="""\
Sample   Animal   Intelligence
1        Dog     Smart
2 Dog Smart
3 Cat Dumb
4 Cat Dumb
5 Dog Dumb
6 Cat Smart"""

#Store as dframe
dframe = pd.read_table(StringIO(data),sep='\s+')
     

In [None]:
# Show
dframe
     

In [None]:
# Now we can create a cross-tabulation table, which is basically just a frequency table
pd.crosstab(dframe.Animal,dframe.Intelligence,margins=True)


#### Lec 47 - Installing Seaborn

To install file the directions at the following link, you should be able to use a simple pip install. Remember to install the dependencies!

http://stanford.edu/~mwaskom/software/seaborn/installing.html

#### Lec 48 - Histograms

First of all, source of information for what a histogram actually is: http://en.wikipedia.org/wiki/Histogram

In [None]:
# The normal imports
import numpy as np
from numpy.random import randn
import pandas as pd

# Import the stats librayr from numpy
from scipy import stats

# These are the plotting modules adn libraries we'll use:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Command so that plots appear in the iPython Notebook
%matplotlib inline

In [None]:
#Create a random normal-dist dataset
dataset1 = randn(100)

#Plot a histogram of the dataset, note bins=10 by default
plt.hist(dataset1)

In [None]:
# Lets make another dataset
dataset2 = randn(80)

#Plot
plt.hist(dataset2,color='indianred')

In [None]:
# We can use normed to plot on same plot

# Set normed=True for the plots to be normalized in order to comapre data sets with different number of observations
# Set alpha=0.5 for transperancy

plt.hist(dataset1,normed=True,color='indianred',alpha=0.5,bins=20)
plt.hist(dataset2,normed=True,alpha=0.5,bins=20)

In [None]:
# Make two more random normal dist data sets
data1 = randn(1000)
data2 = randn(1000)

#Can represent joint distributions using joint plots
sns.jointplot(data1,data2)

In [None]:
# Can also use hex bins for a more concise picture
sns.jointplot(data1,data2,kind='hex')

#### Lec 49 - Kernel Density Estimation Plots

In [None]:
# Let's start off with a carpet/rug plot
# A rug plot simpot puts ticks wherever a value occured

#Create dataset
dataset = randn(25)
#Create rugplot
sns.rugplot(dataset)
#Set y-axis limit
plt.ylim(0,1)

In [None]:
# Plot a histogram on top of 
plt.hist(dataset,alpha=0.3)
sns.rugplot(dataset)

The histogram sets up 10 bins and then just count how many ticks appeared in each bin, setting the height of each bar

The kernel density plot will represent each tick mark with a gaussian basis function. Let's see how we would do this manually

In [None]:
# Create another rugplot
sns.rugplot(dataset);

# Set up the x-axis for the plot
x_min = dataset.min() - 2
x_max = dataset.max() + 2

# 100 equally spaced points from x_min to x_max
x_axis = np.linspace(x_min,x_max,100)

# Set up the bandwidth, for info on this:
url = 'http://en.wikipedia.org/wiki/Kernel_density_estimation#Practical_estimation_of_the_bandwidth'

bandwidth = ((4*dataset.std()**5)/(3*len(dataset)))**.2


# Create an empty kernel list
kernel_list = []

# Plot each basis function
for data_point in dataset:
    
    # Create a kernel for each point and append to list
    kernel = stats.norm(data_point,bandwidth).pdf(x_axis)
    kernel_list.append(kernel)
    
    #Scale for plotting
    kernel = kernel / kernel.max()
    kernel = kernel * .4
    plt.plot(x_axis,kernel,color = 'grey',alpha=0.5)

plt.ylim(0,1)

In [None]:
# To get the kde plot we can sum these basis functions.



# Plot the sum of the basis function
sum_of_kde = np.sum(kernels,axis=0)

# Plot figure
fig = plt.plot(x_axis,sum_of_kde,color='indianred')

# Add the initial rugplot
sns.rugplot(dataset,c = 'indianred')

# Get rid of y-tick marks
plt.yticks([])

# Set title
plt.suptitle("Sum of the Basis Functions")

In [None]:
# Now we can see how to do it in one step with seaborn! Awesome!
sns.kdeplot(dataset)

In [None]:
# We can adjust the bandwidth of the sns kde to make the kde plot more or less sensitive to high frequency

# Rugplot
sns.rugplot(dataset,color='black')

# Plot various bandwidths
for bw in np.arange(0.5,2,0.25):
    sns.kdeplot(dataset,bw=bw,lw=1.8,label=bw)

In [None]:
# We can also choose different kernels

kernel_options = ["biw", "cos", "epa", "gau", "tri", "triw"]

# More info on types
url = 'http://en.wikipedia.org/wiki/Kernel_(statistics)'

# Use label to set legend
for kern in kernel_options:
    sns.kdeplot(dataset,kernel=kern,label=kern)

In [None]:
# We can also shade if desired
for kern in kernel_options:
    sns.kdeplot(dataset,kernel=kern,label=kern,shade=True,alpha=0.5)
 

In [None]:
# For vertical axis, use the vertical keyword
sns.kdeplot(dataset,vertical=True)
     

In [None]:
# Finally we can also use kde plot to create a cumulative distribution function (CDF) of the data

# URL for info on CDF
url = 'http://en.wikipedia.org/wiki/Cumulative_distribution_function'

sns.kdeplot(dataset,cumulative=True)
     

### Multivariate Density Estimation using kdeplot

We can also use kdeplot for multidimensional data. Lets see how it works!

In [None]:
# Let's create a new dataset

# Mean center of data
mean = [0,0]

# Diagonal covariance
cov = [[1,0],[0,100]]

# Create dataset using numpy
dataset2 = np.random.multivariate_normal(mean,cov,1000)

# Bring back our old friend pandas
dframe = pd.DataFrame(dataset2,columns=['X','Y'])

# Plot our dataframe
sns.kdeplot(dframe)

In [None]:
# We could have also passed two vectors seperately, and shade
sns.kdeplot(dframe.X,dframe.Y,shade=True)

In [None]:
# Can specify a particualr bandwidth
ns.kdeplot(dframe,bw=1)

In [None]:
# Or just use silverman again
sns.kdeplot(dframe,bw='silverman')

In [None]:
# We can also create a kde joint plot, simliar to the hexbin plots we saw before

sns.jointplot('X','Y',dframe,kind='kde')

#### Lec 50 - Combining Plot Styles

In [None]:
# Now we'l learn how to combine plot styles
     

# Create datset
dataset = randn(100)

# Use distplot for combining plots, by default a kde over a histogram is shown
sns.distplot(dataset,bins=25)

In [None]:
# hist, rug, and kde are all input arguments to turn those plots on or off
sns.distplot(dataset,rug=True,hist=False)
  

In [None]:
# TO control specific plots in distplot, use [plot]_kws argument with dictionaries.

#Here's an example

sns.distplot(dataset,bins=25,
             kde_kws={'color':'indianred','label':'KDE PLOT'},
             hist_kws={'color':'blue','label':"HISTOGRAM"})
     

In [None]:
# WE can also use pandas data objects for this

from pandas import Series

# Create Series form dataset
ser1 = Series(dataset,name='My_DATA')
     

In [None]:
# Plot Series
sns.distplot(ser1,bins=25)
     

#### Lec 51 - Box and Violin Plots

In [None]:
# Now we'll learn about box and violin plots
url = 'http://en.wikipedia.org/wiki/Box_plot#mediaviewer/File:Boxplot_vs_PDF.svg'

# Let's create two distributions
data1 = randn(100)
data2 = randn(100) + 2 # Off set the mean

In [None]:
# Now we can create a box plot
sns.boxplot([data1,data2])

In [None]:
# Notice how the previous plot had outlier points, we can include those with the "whiskers"
sns.boxplot([data1,data2],whis=np.inf)

In [None]:
# WE can also set horizontal by setting vertical to false
sns.boxplot([data1,data2],whis=np.inf, vert = False)

In [None]:
# While box plots are great, they can sometimes not give the full picture

# Violin/Viola plots can combine the simplicity of a box plot with the information of a kde plot
     

In [None]:
# Let's create an example where a box plot doesn't give the whole picture

# Normal Distribution
data1 = stats.norm(0,5).rvs(100)

# Two gamma distributions concatenated together (Second one is inverted)
data2 = np.concatenate([stats.gamma(5).rvs(50)-1,
                        -1*stats.gamma(5).rvs(50)])

# Box plot them
sns.boxplot([data1,data2],whis=np.inf)

In [None]:
# From the above plots, you may think that the distributions are fairly similar
# But lets check out what a violin plot reveals
sns.violinplot([data1,data2])

In [None]:
# Wow, quite revealing!
     

# We can also change the bandwidth of the kernel used for the density fit of the violin plots if desired
sns.violinplot(data2,bw=0.01)


In [None]:
# Much like a rug plot, we can also include the individual points, or sticks
sns.violinplot(data1,inner="stick")

#### Lec 52 - Regression Plots

In [None]:
# Now we'll learn how ot visualize multiple regression with lmplot()

# Luckily, Seaborn comes with an example dataset to use as a pandas DataFrame
tips = sns.load_dataset("tips")
     

# Preview
tips.head()


In [None]:
# Let's use lmplot() to plot the total bill versus tips
sns.lmplot("total_bill","tip",tips)


 First we can see a scatter plot of all the points, tip vs total_bill
 
 Then we see a linear regression line, which is an estimateed linear fit model to the data
  

In [None]:
# WE can also specify teh confidence interval to use for the linear fit

sns.lmplot("total_bill","tip",tips,ci=75) # 68% ci 
 

In [None]:
# Just like before, we can use dictionaries to edit individual parts of the plot

sns.lmplot("total_bill", "tip", tips,
           scatter_kws={"marker": "o", "color": "indianred"},
           line_kws={"linewidth": 1, "color": "blue"});
     

In [None]:
# WE can also check out higher-order trends
sns.lmplot("total_bill", "tip", tips,order=4,
           scatter_kws={"marker": "o", "color": "indianred"},
           line_kws={"linewidth": 1, "color": "blue"})
     

In [None]:
# We can also not fit a regression if desired
sns.lmplot("total_bill", "tip", tips,fit_reg=False)


In [None]:
# lmplot() also works on discrete variables, such as the percentage of the tip

# Create a new column for tip percentage
tips["tip_pect"]=100*(tips['tip']/tips['total_bill'])

#plot
sns.lmplot("size", "tip_pect", tips);
   

In [None]:
# We can also add jitter to this

#Info link
url = "http://en.wikipedia.org/wiki/Jitter"

#plot
sns.lmplot("size", "tip_pect", tips,x_jitter=.1);
  

In [None]:
# We can also estimate the tendency of each bin (size of party in this case)
sns.lmplot("size", "tip_pect", tips, x_estimator=np.mean);


Interesting, looks like there is more variance for party sizes of 1 then 2-4


In [None]:
# We can use the hue facet to automatically define subsets along a column

# Plot, note the markers argument
sns.lmplot("total_bill", "tip_pect", tips, hue="sex",markers=["x","o"])
 

In [None]:
# Does day make a difference?
sns.lmplot("total_bill", "tip_pect", tips, hue="day")
     

In [None]:
# Finally it should be noted that Seabron supports LOESS model fitting
url = 'http://en.wikipedia.org/wiki/Local_regression'

sns.lmplot("total_bill", "tip_pect", tips, lowess=True, line_kws={"color": 'black'});
  

In [None]:
# The lmplot() we've been using is actually using a lower-level function, regplot()

sns.regplot("total_bill","tip_pect",tips)


In [None]:
# reg_plot can be added to existing axes without modifying anything in the figure

# Create figure with 2 subplots
fig, (axis1,axis2) = plt.subplots(1,2,sharey =True)

sns.regplot("total_bill","tip_pect",tips,ax=axis1)
sns.violinplot(tips['tip_pect'],tips['size'],color='Reds_r',ax=axis2)


#### Lec 53 - Heatmaps and Clustered Matrices

In [None]:
# Again seaborn comes with a great dataset to play and learn with
flight_dframe = sns.load_dataset('flights')
 

In [None]:
#Preview
flight_dframe.head()


In [None]:
# Let's pivot this dataframe do its easier to manage
flight_dframe = flight_dframe.pivot("month","year","passengers")

#Show
flight_dframe

In [None]:
# This dataset is now in a clear format to be dispalyed as a heatmap
sns.heatmap(flight_dframe)

In [None]:
# We also have the option to annotate each cell
sns.heatmap(flight_dframe,annot=True,fmt='d')


In [None]:
# seaborn will automatically try to pick the best color scheme for your dataset, whether is be diverging or converging colormap
     

# We can choose our own 'center' for our colormap
sns.heatmap(flight_dframe,center=flight_dframe.loc['January',1955])
 

In [None]:
# heatmap() can be used on an axes for a subplot to create more informative figures
f, (axis1,axis2) = plt.subplots(2,1)

yearly_flights = flight_dframe.sum()

# Since yearly_flights is a weird format, we'll have to grab the values we want with a Series, then put them in a dframe

years = pd.Series(yearly_flights.index.values)
years = pd.DataFrame(years)

flights = pd.Series(yearly_flights.values) 
flights = pd.DataFrame(flights)

# Make the dframe and name columns
year_dframe = pd.concat((years,flights),axis=1)
year_dframe.columns = ['Year','Flights']



# Create the bar plot on top
sns.barplot('Year',y='Flights',data=year_dframe, ax = axis1)

# Create the heatmap on bottom
sns.heatmap(flight_dframe,cmap='Blues',ax=axis2,cbar_kws={"orientation": "horizontal"})
     

In [None]:
# Finally we'll learn about using a clustermap

# Clustermap will reformat the heatmap so similar rows are next to each other
sns.clustermap(flight_dframe)

In [None]:
# Let's uncluster the columns
sns.clustermap(flight_dframe,col_cluster=False)
 

In [None]:
# Since the number of flights increase every year, we should set a standard scale
sns.clustermap(flight_dframe,standard_scale=1) # standardize by columns (year)
 

In [None]:
# Or scale the rows
sns.clustermap(flight_dframe,standard_scale=0)
 

In [None]:
# Finally we can also normalize the rows by their Z-score.
# This subtracts the mean and devides by the STD of each column, then teh rows have amean of 0 and a variance of 1
sns.clustermap(flight_dframe,z_score=1)


Above we can see which values are greater than the mean and which are below very clearly
     

CONGRATULATIONS!! We've developed quite a toolbox to hammer out some great data anaysis projects!

Up next: Projects to apply what we've learned to real datasets!
   

### Introduction to SQL with Python

In this notebook we'll go over a brief introduction to the structure of the Sakila Database and setting up SQL in your Python Environment.

DISCLAIMER:

There are many ways to browse through a SQL database, throughout this Appendix we are only going to be focusing on learning about SQL queries using a combination of SQLite,Python,pandas, and SQLAlchemy. Please note that this is a pretty specific way of operating with a SQL Database, and may or may not fit other general needs. The primary goal of this section is to teach you how to use SQL queries to grab information and set it as a pandas DataFrame. We will not be going over more general topics of relational databases, MySQL, or using a SQL console directly.

To fully understand the content of this Appendix, I suggest you complete the course up to at least Lecture 28, although I really recommend completing up to Lecture 46 to get the most out of this Appendix!

Great, let's begin!
Step 1: Download SQL Alchemy
To start this appendix, download SQLAlchemy. You can do this by either downloading it here

Or - by typing pip install sqlalchemy in your command line.

Or - by typing conda install sqlalchemy if you are using the Anaconda installation of Python. (recommended)

Step 2: Download SQLite Broswer
Next up we will download a sql browser. We will be using SQLite Browser because it is lightweight and free to use. There are many alternatives you can use, check out a list of 10 free ones here

Download SQLite Browser here: http://sqlitebrowser.org/

Step 3: Download the sakila Database
You can download the fully constructed database here

Or - you can download the .sql file to construct the database yourself: http://dev.mysql.com/doc/index-other.html Then use SQLite Browser to construct the database by running the .sql

Either way, make sure to save it in the same directory as your iPython notebooks, or remember the file path for later so we can tell pandas exactly where to look for it.

All done! Now let's look at the database before diving into how to work with it in Python.
Check out the database either by opening it up using SQLite Browser or by checking out the diagram at this link: Diagram

I've posted it below as well inside this notebook, but fair warning, the picture is huge!

In [None]:
# Note: The picture is really big, I suggest you check out the link directly!

from IPython.display import Image
Image(url='http://www.dbquanti.eu/css/images/database.png')

Now that we have seen an overview of what the database looks like, let's go ahead and learn how to communicate with it with Python and pandas.

Python comes with SQLite3, which provides a lightweight disk-based database that doesn't require a seperate server process. It's useful to prototyp with SQLite and then port the code to a larger database system, like MySQL. Python comes with a pretty awesome module to connect to a SQL database with SQLite. The module is SQLite3, let's go ahead and import it (and pandas as well).

In [1]:
# imports!
import sqlite3
import pandas as pd

To use the module, you must first create a Connection object that represents the database. If the database name already exists SQLite3 will automatically connect to it, if it does not exsist, SQLite3 will automatically create.

For experienced users: You can also supply the special name :memory: to create a database in RAM.

Let's make the connection!

In [2]:
# Connect to the database (again, downloaded from here: https://www.dropbox.com/s/t049qmjzycrakro/sakila.db?dl=0
con = sqlite3.connect("sakila.db")

Now we can run a basic SQL query, pass it with pandas, and display the output as a DataFrame! Don't worry if you don't understand the query completely yet, this is just a usage example for connecting to the database, other lectures will dive deeper into SQL queries.

In [5]:
# Set SQL query as a comment
sql_query = ''' SELECT * FROM customer '''

# Use pandas to pass sql query using connection form SQLite3
df = pd.read_sql(sql_query, con)

# Show the resulting DataFrame
df

DatabaseError: Execution failed on sql ' SELECT * FROM customer ': no such table: customer

Congratulations! You just passed a SQL Query using pandas and Python! You're amazing! Subsequent lectures will go further into how to query with SQL, but if you already know SQL, you're good to go!

#### SQL SELECT,DISTINCT,WHERE,AND & OR

SQL SELECT Statement

The SELECT statement is used to select data from a database. The result is then stored in a result table, sometimes called the result-set.

Syntax for SQL SELECT
SELECT column_name FROM table_name

We could also select multiple columns:

SELECT column_name1,column_name2
FROM table_name

Or we could select everything in a table using *

SELECT * FROM table_name

To see how this and multiple other queries work, we'll connect to the database and make a function that automatically takes in our query and returns a DataFrame.

In [None]:
# Imports
import sqlite3
import pandas as pd
con = sqlite3.connect("sakila.db")

# Set function as our sql_to_pandas

def sql_to_df(sql_query):

    # Use pandas to pass sql query using connection form SQLite3
    df = pd.read_sql(sql_query, con)

    # Show the resulting DataFrame
    return df

#### Selecting Multiple Columns

In [None]:
# Select multiple columns example
query = ''' SELECT first_name,last_name
            FROM customer; '''

# Grab from first two columns
sql_to_df(query).head()

#### Selecting Everything from table with *

In [None]:
# Select multiple columns example
query = ''' SELECT *
            FROM customer; '''

# Grab 
sql_to_df(query).head()

#### Syntax for the SQL DISTINCT Statement

In a table, a column may contain duplicate values; and sometimes you only want to list the distinct (unique) values. The DISTINCT keyword can be used to return only distinct (unique) values.

SELECT DISTINCT column_name

FROM table_name;

In [None]:
# Select distinct country_ids from the city table.
query = ''' SELECT DISTINCT(country_id)
            FROM city'''

sql_to_df(query).head()

#### Syntax for the SQL WHERE
The WHERE clause is used to filter records, the WHERE clause is used to extract only the records that fulfill the specific parameter.

SELECT column_name

FROM table_name

WHERE column_name ( math operator) desired_value;

In [None]:
# Select all customer info from the 1st store.
query = ''' SELECT *
            FROM customer
            WHERE store_id = 1'''

sql_to_df(query).head()

Note, there are a variety of logical operators you can use for a SQL request.

Operator	Description
    %        Equal
    
<>	        Not equal. Note: In some versions of SQL this operator             may be written !=

>	        Greater than

<	        Less than

>=	        Greater than or equal

<=	        Less than or equal

SQL requires single quotes around text values, while numeric fields are not enclosed in quotes, for example a text value for the above where statement:

In [None]:
# Select all customer info from Mary.
query = ''' SELECT *
            FROM customer
            WHERE first_name = 'MARY'  '''

sql_to_df(query).head()

#### Syntax for AND
The AND operator is used to filter records based on more than one condition.

The AND operator displays a record if both the first condition AND the second condition are true.

In [None]:
# Select all films from 2006 that are rated R.

query = ''' SELECT *
            FROM film
            WHERE release_year = 2006
            AND rating = 'R' '''

sql_to_df(query).head()

#### Syntax for OR
The OR operator displays a record if either the first condition OR the second condition is true.

In [None]:

# Select all films from R or PG.

query = ''' SELECT *
            FROM film
            WHERE rating = 'PG'
            OR rating = 'R' '''

sql_to_df(query).head()

#### SQL WILDCARDS, ORDER BY, GROUP BY and Aggregate Functions
In this section, we will go over Wildcard statements, as well as ORDER BY and GROUP BY statements.

We will start by importing and connceting to our SQL database, then creating the function to convert SQL queries to a pandas DataFrame.

In [None]:
# Imports
import sqlite3
import pandas as pd
con = sqlite3.connect("sakila.db")

# Set function as our sql_to_pandas

def sql_to_df(sql_query):

    # Use pandas to pass sql query using connection form SQLite3
    df = pd.read_sql(sql_query, con)

    # Show the resulting DataFrame
    return df

Before we begin with Wildcards, ORDER BY, and GROUP BY. Let's take a look at aggregate functions.

- AVG() - Returns the average value.
- COUNT() - Returns the number of rows.
- FIRST() - Returns the first value.
- LAST() - Returns the last value.
- MAX() - Returns the largest value.
- MIN() - Returns the smallest value.
- SUM() - Returns the sum.

You can call any of these aggregate functions on a column to get the resulting values back. For example:



In [None]:
# Count the number of customers
query = ''' SELECT COUNT(customer_id)
            FROM customer; '''

# Grab 
sql_to_df(query).head()

Go ahead and experiment with the other aggregate functions. The usual syntax is:

SELECT column_name, aggregate_function(column_name)
FROM table_name
WHERE column_name

SQL Wildcards
A wildcard character can be used to substitute for any other characters in a string. In SQL, wildcard characters are used with the SQL LIKE operator. The LIKE operator is used in a WHERE clause to search for a specified pattern in a column.

There are several wildcard operators:

Wildcard	Description

%	A substitute for zero or more characters

_	A substitute for a single character

[character_list]	Sets and ranges of characters to match
Let's see them in action now!

In [None]:
# First the % wildcard

# Select any customers whose name start with an M
query = ''' SELECT *
            FROM customer
            WHERE first_name LIKE 'M%' ; '''

# Grab 
sql_to_df(query).head()

In [None]:
# Next the _ wildcard

# Select any customers whose last name ends with ing
query = ''' SELECT *
            FROM customer
            WHERE last_name LIKE '_ING' ; '''

# Grab 
sql_to_df(query).head()

Now we will move on to the [Character_list] wildcard.

**IMPORTANT NOTE!**

Using [charlist] with SQLite is a little different than with other SQL formats, such as MySQL.

In MySQL you would use:

WHERE value LIKE '[charlist]%'

In SQLite you use:

WHERE value GLOB '[charlist]*'

In [None]:
# Finally the [character_list] wildcard

# Select any customers whose first name begins with an A or a B
query = ''' SELECT *
            FROM customer
            WHERE first_name GLOB '[AB]*' ; '''

# Grab 
sql_to_df(query).head()

#### SQL ORDER BY
The ORDER BY keyword is used to sort the result-set by one or more columns. The ORDER BY keyword sorts the records in ascending order by default. To sort the records in a descending order, you can use the DESC keyword. The syntax is:

SELECT column_name
FROM table_name
ORDER BY column_name ASC|DESC

Let's see it in action:

In [None]:
# Select all customers and order results by last name
query = ''' SELECT *
            FROM customer
            ORDER BY last_name ; '''

# Grab 
sql_to_df(query).head()

In [None]:
# Select all customers and order results by last name, DESCENDING
query = ''' SELECT *
            FROM customer
            ORDER BY last_name DESC; '''

# Grab 
sql_to_df(query).head()

#### SQL GROUP BY
The GROUP BY statement is used with the aggregate functions to group the results by one or more columns. The syntax is:

SELECT column_name, aggregate_function(column_name)
FROM table_name
WHERE column_name operator value
GROUP BY column_name;

Let's see how it works.

In [None]:
# Count the number of customers per store

query = ''' SELECT store_id , COUNT(customer_id)
            FROM customer
            GROUP BY store_id; '''

# Grab 
sql_to_df(query).head()

#### Web Scraping

**Web Scraping in Python**

In this appendix lecture we'll go over how to scrape information from the web using Python.

We'll go to a website, decide what information we want, see where and how it is stored, then scrape it and set it as a pandas DataFrame!
Some things you should consider before web scraping a website:
1.) You should check a site's terms and conditions before you scrape them.

2.) Space out your requests so you don't overload the site's server, doing this could get you blocked.

3.) Scrapers break after time - web pages change their layout all the time, you'll more than likely have to rewrite your code.

4.) Web pages are usually inconsistent, more than likely you'll have to clean up the data after scraping it.

5.) Every web page and situation is different, you'll have to spend time configuring your scraper.

To learn more about HTML I suggest theses two resources:
W3School

Codecademy

There are three modules we'll need in addition to python are:
1.) BeautifulSoup, which you can download by typing: pip install beautifulsoup4 or conda install beautifulsoup4 (for the Anaconda distrbution of Python) in your command prompt.

2.) lxml , which you can download by typing: pip install lxml or conda install lxml (for the Anaconda distrbution of Python) in your command prompt.

3.) requests, which you can download by typing: pip install requests or conda install requests (for the Anaconda distrbution of Python) in your command prompt.

We'll start with our imports:

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import Series,DataFrame


For our quick web scraping tutorial, we'll look at some legislative reports from the University of California Web Page. Feel free to experiment with other webpages, but remember to be cautious and respectful in what you scrape and how often you do it. Always check the legality of a web scraping job.

Let's go ahead and set the url.

In [7]:
url = 'http://www.ucop.edu/operating-budget/budgets-and-reports/legislative-reports/2013-14-legislative-session.html'


Now let's go ahead and set up requests to grab content form the url, and set it as a Beautiful Soup object

In [8]:
# Request content from web page
result = requests.get(url)
c = result.content

# Set as Beautiful Soup Object
soup = BeautifulSoup(c)

In [9]:
# Go to the section of interest
summary = soup.find("div",{'class':'list-land','id':'content'})

# Find the tables in the HTML
tables = summary.find_all('table')

Now we need to use Beautiful Soup to find the table entries. A 'td' tag defines a standard cell in an HTML table. The 'tr' tag defines a row in an HTML table.

We'll parse through our tables object and try to find each cell using the findALL('td') method.

There are tons of options to use with findALL in beautiful soup. You can read about them here http://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all.

In [11]:
# Set up empty data list
data = []

# Set rows as first indexed object in tables with a row
rows = tables[0].findAll('tr')

# now grab every HTML cell in every row
for tr in rows:
    cols = tr.findAll('td')
    # Check to see if text is in the row
    for td in cols:
        text = td.find(text=True) 
        print(text),
        data.append(text)

IndexError: list index out of range

In [None]:
data

In [None]:
# Set up empty lists
reports = []
date = []

# Se tindex counter
index = 0

# Go find the pdf cells
for item in data:
    if 'pdf' in item:
        # Add the date and reports
        date.append(data[index-1])
        
        # Get rid of \xa0
        reports.append(item.replace(u'\xa0', u' '))
                    
    index += 1

You'll notice a line to take care of '\xa0 ' This is due to a unicode error that occurs if you don't do this. Web pages can be messy and inconsistent and it is very likely you'll have to do some research to take care of problems like these.

Here's the link I used to solve this particular issue: https://stackoverflow.com/questions/10993612/how-to-remove-xa0-from-string-in-python

Now all that is left is to organize our data into a pandas DataFrame!

In [None]:
# Set up Dates and Reports as Series
date = Series(date)
reports = Series(reports)


In [None]:
# Concatenate into a DataFrame
legislative_df = pd.concat([date,reports],axis=1)

In [None]:
# Set up the columns
legislative_df.columns = ['Date','Reports']
# Show the finished DataFrame
legislative_df

There are other less intense options for web scraping:

Check out these two companies:

https://import.io/

https://www.kimonolabs.com/

Good Job!