## Introduction to Pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# these are for navigating your system & finding the data
import os
from pathlib import Path

In [3]:
# where are we?
# Path.cwd() is the current working directory of the running kernel;
# the data paths defined below are built relative to it.
home = Path.cwd()
home

PosixPath('/Users/austinlasseter/atelier/generalassembly/python-2-dc/07-pandas-intro/code')

In [4]:
# one level above: .parent drops the last path component
home.parent

PosixPath('/Users/austinlasseter/atelier/generalassembly/python-2-dc/07-pandas-intro')

In [5]:
# Locate the data folder: go up one level from the working directory
# and step into 'data'. The `/` operator joins path components.
data_dir = home.parent / 'data'
data_dir

PosixPath('/Users/austinlasseter/atelier/generalassembly/python-2-dc/07-pandas-intro/data')

In [6]:
# what's in it?
# os.listdir returns the name of every entry in the folder, in arbitrary order,
# including non-data entries such as '.DS_Store'.
os.listdir(data_dir)

['old-faithful.csv',
 'collegeadmissions.csv',
 'u.item',
 'yelp.json',
 'msleep.csv',
 'beer.txt',
 '.DS_Store',
 'myresults.csv',
 'Production.ProductSubcategory.csv',
 'drinks.csv',
 'apply functions in pandas.ipynb',
 'imdb_1000.csv',
 'imdb_ids.txt',
 'oracle_10k.csv',
 'drinks_updated.csv',
 'airlines.csv',
 'u.data',
 'ozone.csv',
 'vti.csv',
 'user.tbl',
 'ufo.csv',
 'u.user_original',
 'rossmann-stores.csv',
 'Sales.SalesOrderHeader.csv',
 'titanic.csv',
 'wine.csv',
 'student_comments.csv',
 'haystack.csv',
 'drones.csv',
 'movie_ratings.tsv',
 'mtcars.csv',
 'u.user',
 'bikeshare.csv',
 'hitters.csv',
 'features.csv',
 'NBA_players_2015.csv',
 'Sales.SalesOrderDetail.csv',
 'Production.Product.csv',
 'bb_results.csv',
 'chipotle.tsv',
 'bank-additional.csv',
 'vehicles_train.csv',
 'vehicles_test.csv',
 'stores.csv']

In [7]:
# Full path to a single data file, built from the data folder located earlier.
filepath = data_dir / 'Production.Product.csv'
filepath

PosixPath('/Users/austinlasseter/atelier/generalassembly/python-2-dc/07-pandas-intro/data/Production.Product.csv')

In [14]:
# use pandas to read in the datafile
# The file has a .csv extension but is read here as tab-delimited (sep='\t').
prod = pd.read_csv(filepath, sep='\t')
# sample(3) shows three randomly chosen rows, so the preview differs between runs.
prod.sample(3)

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
114,436,Thin-Jam Lock Nut 12,LJ-9080,0,0,,1000,750,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{5D3E589F-4584-406B-B9CC-3C8C060CB9A5},2014-02-08 10:01:36.827000000
266,762,"Road-650 Red, 44",BK-R50R-44,1,1,Red,100,75,486.7066,782.99,...,R,L,U,2.0,30.0,2011-05-31 00:00:00,2013-05-29 00:00:00,,{F247AAAE-12E3-4048-A37B-CCE4A8999E81},2014-02-08 10:01:36.827000000
227,723,"LL Road Frame - Black, 60",FR-R38B-60,1,1,Black,500,375,204.6251,337.22,...,R,L,U,14.0,9.0,2011-05-31 00:00:00,,,{AA95E2A5-E7C4-4B74-B1EA-B52EE3B51537},2014-02-08 10:01:36.827000000


In [15]:
# what is prod? read_csv returned a pandas DataFrame (a table of rows and columns)
type(prod)

pandas.core.frame.DataFrame

In [16]:
# how big is it? .shape is a (number of rows, number of columns) tuple
prod.shape

(504, 25)

In [17]:
# what are the columns? .columns is an Index holding the column names
prod.columns

Index(['ProductID', 'Name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag',
       'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost',
       'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WeightUnitMeasureCode',
       'Weight', 'DaysToManufacture', 'ProductLine', 'Class', 'Style',
       'ProductSubcategoryID', 'ProductModelID', 'SellStartDate',
       'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [19]:
# Number of rows, then the index object itself, printed one per line.
print(len(prod), prod.index, sep='\n')

504
RangeIndex(start=0, stop=504, step=1)


### How to call individual columns

In [20]:
# Recap of the available column names before selecting individual columns.
prod.columns

Index(['ProductID', 'Name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag',
       'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost',
       'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WeightUnitMeasureCode',
       'Weight', 'DaysToManufacture', 'ProductLine', 'Class', 'Style',
       'ProductSubcategoryID', 'ProductModelID', 'SellStartDate',
       'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [21]:
# what is the data type in each column?
# 'object' means text/mixed values; note that the date columns
# (SellStartDate, SellEndDate, ModifiedDate) are read as object, not as datetimes.
prod.dtypes

ProductID                  int64
Name                      object
ProductNumber             object
MakeFlag                   int64
FinishedGoodsFlag          int64
Color                     object
SafetyStockLevel           int64
ReorderPoint               int64
StandardCost             float64
ListPrice                float64
Size                      object
SizeUnitMeasureCode       object
WeightUnitMeasureCode     object
Weight                   float64
DaysToManufacture          int64
ProductLine               object
Class                     object
Style                     object
ProductSubcategoryID     float64
ProductModelID           float64
SellStartDate             object
SellEndDate               object
DiscontinuedDate         float64
rowguid                   object
ModifiedDate              object
dtype: object

In [24]:
# How to select an individual column
# 3 ways to do this.
# First way. use brackets with the column name as a string.
# The result is a Series (one column of values plus the row index).
prod['Color'].sample(5)

293       Red
461    Yellow
123       NaN
485    Silver
470      Blue
Name: Color, dtype: object

In [26]:
# another way: attribute (dot) access. Also returns a Series.
# NOTE: dot access only works when the column name is a valid Python identifier
# and does not clash with an existing DataFrame attribute or method.
prod.Color.sample(5)

311       NaN
215      Blue
330     Black
318     Black
384    Silver
Name: Color, dtype: object

In [30]:
# Third way (most typing, best results)
# Double brackets pass a *list* of column names, so the result is a
# one-column DataFrame rather than a Series.
prod[['Color']].sample(5)

Unnamed: 0,Color
318,Black
6,Black
463,Blue
196,
495,Black


In [33]:
# The third method allows me to select multiple columns!
# The inner brackets build an ordinary Python list of column names ...
list_of_columns = ['Color', 'Style', 'Class']
# ... and the outer brackets ask the DataFrame for exactly those columns.
prod[list_of_columns].sample(5)

Unnamed: 0,Color,Style,Class
82,,,
394,Blue,U,H
365,Black,U,
165,,,
159,,,


In [35]:
# Row filtering: build a True/False mask (one value per row),
# then keep only the rows where the mask is True.
condition = prod['Color'].eq('Blue')
prod.loc[condition]

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
215,711,"Sport-100 Helmet, Blue",HL-U509-B,0,1,Blue,4,3,13.0863,34.99,...,S,,,31.0,33.0,2011-05-31 00:00:00,,,{FD7C0858-4179-48C2-865B-ABD5DFC7BC1D},2014-02-08 10:01:36.827000000
368,864,"Classic Vest, S",VE-C304-S,0,1,Blue,4,3,23.749,63.5,...,S,,U,25.0,1.0,2013-05-30 00:00:00,,,{EB423EF3-409D-46FE-B35B-D69970820314},2014-02-08 10:01:36.827000000
369,865,"Classic Vest, M",VE-C304-M,0,1,Blue,4,3,23.749,63.5,...,S,,U,25.0,1.0,2013-05-30 00:00:00,,,{2E52F96E-64A1-4069-911C-E3FD6E094A1E},2014-02-08 10:01:36.827000000
370,866,"Classic Vest, L",VE-C304-L,0,1,Blue,4,3,23.749,63.5,...,S,,U,25.0,1.0,2013-05-30 00:00:00,,,{3211F5A8-B6C4-48BD-9AA4-D69CB40D97DD},2014-02-08 10:01:36.827000000
394,890,"HL Touring Frame - Blue, 46",FR-T98U-46,1,1,Blue,500,375,601.7437,1003.91,...,T,H,U,16.0,7.0,2013-05-30 00:00:00,,,{8BBD3437-A58B-41A0-9503-FC14B23E7678},2014-02-08 10:01:36.827000000
395,891,"HL Touring Frame - Blue, 50",FR-T98U-50,1,1,Blue,500,375,601.7437,1003.91,...,T,H,U,16.0,7.0,2013-05-30 00:00:00,,,{C4244F0C-ABCE-451B-A895-83C0E6D1F448},2014-02-08 10:01:36.827000000
396,892,"HL Touring Frame - Blue, 54",FR-T98U-54,1,1,Blue,500,375,601.7437,1003.91,...,T,H,U,16.0,7.0,2013-05-30 00:00:00,,,{E9AAE947-6BC3-4909-8937-3E1CDCEC8A8F},2014-02-08 10:01:36.827000000
397,893,"HL Touring Frame - Blue, 60",FR-T98U-60,1,1,Blue,500,375,601.7437,1003.91,...,T,H,U,16.0,7.0,2013-05-30 00:00:00,,,{B01951A4-A581-4D10-9DC2-515DA180F1B8},2014-02-08 10:01:36.827000000
399,895,"LL Touring Frame - Blue, 50",FR-T67U-50,1,1,Blue,500,375,199.8519,333.42,...,T,L,U,16.0,10.0,2013-05-30 00:00:00,,,{B78ECCCA-FA88-4071-9110-410585127E46},2014-02-08 10:01:36.827000000
400,896,"LL Touring Frame - Blue, 54",FR-T67U-54,1,1,Blue,500,375,199.8519,333.42,...,T,L,U,16.0,10.0,2013-05-30 00:00:00,,,{0FF799C9-DD11-4B81-AAF7-65410017405B},2014-02-08 10:01:36.827000000


In [36]:
# Try 2 things:
# call a subset of all columns
# create your own simple filter
# slack out the results as a screenshot.

In [48]:
# what exactly is the filter doing?
# A comparison returns one True/False per row; that boolean Series is the filter.
# (Run prod['Color'].head(10) on its own to see the raw values being compared.)
# Note: comparing against the *string* 'NaN' does not find missing values --
# every row comes back False here. Use .isnull() / .notnull() for that.
prod['Color'].head(10) == 'NaN'

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: Color, dtype: bool

In [52]:
# Null values
# isnull() marks missing entries with True; notnull() is its inverse.
# Only the last expression of a cell is displayed, so the next two lines
# are evaluated but their results are not shown.
prod['Color'].isnull()
prod['Color'].notnull().head()
# turn that into a filter: keep only the rows whose Color is present
prod[ prod['Color'].notnull() ].head()

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
5,317,LL Crankarm,CA-5965,0,0,Black,500,375,0.0,0.0,...,,L,,,,2008-04-30 00:00:00,,,{3C9D10B7-A6B2-4774-9963-C19DCEE72FEA},2014-02-08 10:01:36.827000000
6,318,ML Crankarm,CA-6738,0,0,Black,500,375,0.0,0.0,...,,M,,,,2008-04-30 00:00:00,,,{EABB9A92-FA07-4EAB-8955-F0517B4A4CA7},2014-02-08 10:01:36.827000000
7,319,HL Crankarm,CA-7457,0,0,Black,500,375,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{7D3FD384-4F29-484B-86FA-4206E276FE58},2014-02-08 10:01:36.827000000
8,320,Chainring Bolts,CB-2903,0,0,Silver,1000,750,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{7BE38E48-B7D6-4486-888E-F53C26735101},2014-02-08 10:01:36.827000000
9,321,Chainring Nut,CN-6137,0,0,Silver,1000,750,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{3314B1D7-EF69-4431-B6DD-DC75268BD5DF},2014-02-08 10:01:36.827000000


In [57]:
# Another filter: rows flagged as finished goods (FinishedGoodsFlag equal to 1).
# First the mask on its own (evaluated, not displayed) ...
prod['FinishedGoodsFlag'].eq(1)
# ... then the mask applied to the frame, previewing the first five matches.
prod.loc[prod['FinishedGoodsFlag'].eq(1)].head()

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
209,680,"HL Road Frame - Black, 58",FR-R92B-58,1,1,Black,500,375,1059.31,1431.5,...,R,H,U,14.0,6.0,2008-04-30 00:00:00,,,{43DD68D6-14A4-461F-9069-55309D90EA7E},2014-02-08 10:01:36.827000000
210,706,"HL Road Frame - Red, 58",FR-R92R-58,1,1,Red,500,375,1059.31,1431.5,...,R,H,U,14.0,6.0,2008-04-30 00:00:00,,,{9540FF17-2712-4C90-A3D1-8CE5568B2462},2014-02-08 10:01:36.827000000
211,707,"Sport-100 Helmet, Red",HL-U509-R,0,1,Red,4,3,13.0863,34.99,...,S,,,31.0,33.0,2011-05-31 00:00:00,,,{2E1EF41A-C08A-4FF6-8ADA-BDE58B64A712},2014-02-08 10:01:36.827000000
212,708,"Sport-100 Helmet, Black",HL-U509,0,1,Black,4,3,13.0863,34.99,...,S,,,31.0,33.0,2011-05-31 00:00:00,,,{A25A44FB-C2DE-4268-958F-110B8D7621E2},2014-02-08 10:01:36.827000000
213,709,"Mountain Bike Socks, M",SO-B909-M,0,1,White,4,3,3.3963,9.5,...,M,,U,23.0,18.0,2011-05-31 00:00:00,2012-05-29 00:00:00,,{18F95F47-1540-4E02-8F1F-CC1BCB6828D0},2014-02-08 10:01:36.827000000


In [59]:
# Filter rows and pick columns in a single .loc[rows, columns] call:
# rows where FinishedGoodsFlag is 1, restricted to three columns.
prod.loc[prod['FinishedGoodsFlag'] == 1, ['FinishedGoodsFlag', 'Color', 'ListPrice']].head(5)

Unnamed: 0,FinishedGoodsFlag,Color,ListPrice
209,1,Black,1431.5
210,1,Red,1431.5
211,1,Red,34.99
212,1,Black,34.99
213,1,White,9.5


### Calculate some simple summary stats on your dataframe

In [62]:
# describe all columns
# By default describe() summarises only the numeric columns:
# count, mean, std, min, the 25%/50%/75% quantiles, and max.
prod.describe()

Unnamed: 0,ProductID,MakeFlag,FinishedGoodsFlag,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,Weight,DaysToManufacture,ProductSubcategoryID,ProductModelID,DiscontinuedDate
count,504.0,504.0,504.0,504.0,504.0,504.0,504.0,205.0,504.0,295.0,295.0,0.0
mean,673.039683,0.474206,0.585317,535.150794,401.363095,258.602961,438.66625,74.06922,1.103175,12.294915,37.444068,
std,229.373142,0.49983,0.493157,374.112954,280.584715,461.632808,773.602843,182.166588,1.492616,9.860135,34.025442,
min,1.0,0.0,0.0,4.0,3.0,0.0,0.0,2.12,0.0,1.0,1.0,
25%,447.75,0.0,0.0,100.0,75.0,0.0,0.0,2.88,0.0,2.0,11.0,
50%,747.5,0.0,1.0,500.0,375.0,23.3722,49.99,17.9,1.0,12.0,26.0,
75%,873.25,1.0,1.0,1000.0,750.0,317.075825,564.99,27.35,1.0,17.0,48.5,
max,999.0,1.0,1.0,1000.0,750.0,2171.2942,3578.27,1050.0,4.0,37.0,128.0,


In [64]:
# describe a single column: the same eight statistics, returned as a Series
prod['ListPrice'].describe()

count     504.000000
mean      438.666250
std       773.602843
min         0.000000
25%         0.000000
50%        49.990000
75%       564.990000
max      3578.270000
Name: ListPrice, dtype: float64

In [66]:
# Describe a subset of columns (double brackets select several columns at once)
# 'count' is the number of non-missing values, so it can differ between columns.
prod[['ListPrice', 'Weight', 'StandardCost']].describe()

Unnamed: 0,ListPrice,Weight,StandardCost
count,504.0,205.0,504.0
mean,438.66625,74.06922,258.602961
std,773.602843,182.166588,461.632808
min,0.0,2.12,0.0
25%,0.0,2.88,0.0
50%,49.99,17.9,23.3722
75%,564.99,27.35,317.075825
max,3578.27,1050.0,2171.2942


In [72]:
# Describe a subset of these 8 statistics (for the entire dataframe)
# Each describe() statistic is also available as its own method.
# numeric_only=True restricts the calculation to numeric columns: `prod` also
# holds text columns (Name, Color, ...), and pandas >= 2.0 raises a TypeError
# on them instead of silently skipping them as older versions did.
# Only the last expression of a cell is displayed; the earlier lines are
# evaluated and discarded, so move one to the end to see its result.
prod.mean(numeric_only=True)
prod.median(numeric_only=True)
prod.min(numeric_only=True)
prod.max(numeric_only=True)
prod.count()  # number of non-missing values; works for every dtype
prod.quantile(.25, numeric_only=True)

ProductID               447.75
MakeFlag                  0.00
FinishedGoodsFlag         0.00
SafetyStockLevel        100.00
ReorderPoint             75.00
StandardCost              0.00
ListPrice                 0.00
Weight                    2.88
DaysToManufacture         0.00
ProductSubcategoryID      2.00
ProductModelID           11.00
DiscontinuedDate           NaN
Name: 0.25, dtype: float64

In [74]:
# Describe a subset of these 8 statistics (for a single column)
# On a single column each method returns one number.
# Only the last expression of the cell is displayed.
prod['ListPrice'].mean()
prod['ListPrice'].median()
prod['ListPrice'].min()
prod['ListPrice'].max()
prod['ListPrice'].count()
# the 0.5 quantile is the median
prod['ListPrice'].quantile(.5)

49.99

In [75]:
# Describe a subset of these 8 statistics (for a subset of columns)
# On several columns each method returns one value per column.
# Only the last expression of the cell is displayed.
prod[['ListPrice', 'Weight']].mean()
prod[['ListPrice', 'Weight']].median()
prod[['ListPrice', 'Weight']].min()
prod[['ListPrice', 'Weight']].max()
prod[['ListPrice', 'Weight']].count()
prod[['ListPrice', 'Weight']].quantile(.5)

ListPrice    49.99
Weight       17.90
Name: 0.5, dtype: float64

## Renaming some columns

In [76]:
# Column names before renaming, for comparison with the cells that follow.
prod.columns

Index(['ProductID', 'Name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag',
       'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost',
       'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WeightUnitMeasureCode',
       'Weight', 'DaysToManufacture', 'ProductLine', 'Class', 'Style',
       'ProductSubcategoryID', 'ProductModelID', 'SellStartDate',
       'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [79]:
# Rename a single column with an {old name: new name} mapping.
# rename() returns a new DataFrame, so the result is assigned back to `prod`
# instead of using inplace=True (which pandas discourages and which gives
# no performance benefit).
prod = prod.rename(columns={'WeightUnitMeasureCode': 'WUMC'})
print(prod.columns)

Index(['ProductID', 'Name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag',
       'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost',
       'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WUMC', 'Weight',
       'DaysToManufacture', 'ProductLine', 'Class', 'Style',
       'ProductSubcategoryID', 'ProductModelID', 'SellStartDate',
       'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate'],
      dtype='object')


In [None]:
# Save the dataframe as a CSV file; an existing file with the same name is overwritten.
# 'filename.csv' is a placeholder and is written relative to the current working directory.
# index=False leaves out the row index, which would otherwise be written as an
# extra unnamed first column and come back as 'Unnamed: 0' when the file is re-read.
prod.to_csv('filename.csv', index=False)

In [80]:
# do this with multiple columns: one {old name: new name} entry per column.
# The renamed frame is assigned back to `prod` rather than using inplace=True.
prod = prod.rename(columns={'ProductID': 'ID', 'Name': 'ProdName', 'ProductNumber': 'Number'})
prod.head()

Unnamed: 0,ID,ProdName,Number,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
0,1,Adjustable Race,AR-5381,0,0,,1000,750,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{694215B7-08F7-4C0D-ACB1-D734BA44C0C8},2014-02-08 10:01:36.827000000
1,2,Bearing Ball,BA-8327,0,0,,1000,750,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{58AE3C20-4F3A-4749-A7D4-D568806CC537},2014-02-08 10:01:36.827000000
2,3,BB Ball Bearing,BE-2349,1,0,,800,600,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{9C21AED2-5BFA-4F18-BCB8-F11638DC2E4E},2014-02-08 10:01:36.827000000
3,4,Headset Ball Bearings,BE-2908,0,0,,800,600,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{ECFED6CB-51FF-49B5-B06C-7D8AC834DB8B},2014-02-08 10:01:36.827000000
4,316,Blade,BL-2036,1,0,,800,600,0.0,0.0,...,,,,,,2008-04-30 00:00:00,,,{E73E9750-603B-4131-89F5-3DD15ED5FF80},2014-02-08 10:01:36.827000000


## A few common operations with Pandas

In [84]:
# Show all values of a categorical variable, with how often each occurs
# (only the last line's result is displayed)
prod['Color'].value_counts() # default: missing values are left out
prod['Color'].value_counts(dropna=False) # dropna=False: missing values get their own row

NaN             248
Black            93
Silver           43
Red              38
Yellow           36
Blue             26
Multi             8
Silver/Black      7
White             4
Grey              1
Name: Color, dtype: int64

In [88]:
# display frequencies sorted by the index
# value_counts() orders by count; sort_index() reorders by the category label instead.
prod['Style'].value_counts().sort_index()

M       7
U     176
W      28
Name: Style, dtype: int64

In [94]:
# What if I just want a list of the colors?
# value_counts() stores the categories in its index (most frequent first);
# missing values are excluded by default.
prod['Color'].value_counts().index

Index(['Black', 'Silver', 'Red', 'Yellow', 'Blue', 'Multi', 'Silver/Black',
       'White', 'Grey'],
      dtype='object')

In [95]:
# What if I just want the freqs?
# .values returns the counts as a NumPy array, in the same order as the index.
prod['Color'].value_counts().values

array([93, 43, 38, 36, 26,  8,  7,  4,  1])

In [97]:
# How many UNIQUE colors are there?
# Two equivalent ways; both leave out missing values.
# Only the second result is displayed.
len(prod['Color'].value_counts().index)
prod['Color'].nunique()

9

In [100]:
# List the unique colors
# value_counts(dropna=False).index is ordered by frequency;
# unique() keeps order of first appearance and includes NaN.
# Only the unique() result is displayed.
prod['Color'].value_counts(dropna=False).index
prod['Color'].unique()

array([nan, 'Black', 'Silver', 'Red', 'White', 'Blue', 'Multi', 'Yellow',
       'Grey', 'Silver/Black'], dtype=object)