# Demo - More! Pandas

In [1]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [2]:
# view raw values
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [3]:
# view index
data.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
# we can index, just like a standard Python list
data[1]

0.5

In [5]:
# Because index is a RangeIndex, we can do
# normal slicing from offset 1 to offset 2
# 
data[1:3]

1    0.50
2    0.75
dtype: float64

In [6]:
# create a series with non-integer indices
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [7]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [8]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
# similar to dict indexing
data['c']

0.75

In [12]:
# if you are particularly perverse...
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [13]:
data[3]

0.75

# Implicit and Explicit Indexing

In [21]:
data = pd.Series(['a', 'b', 'c'], index=['Python', 'C++', 'Ruby'])
data

Python    a
C++       b
Ruby      c
dtype: object

In [22]:
# index is no longer a range
# indices are like keys in a dictionary
data.index

Index(['Python', 'C++', 'Ruby'], dtype='object')

In [23]:
data[1] # index by numeric index, not offset!

'b'

In [24]:
# but slicing still works as before
data[1:3] # offset 1...offset 2

C++     b
Ruby    c
dtype: object

# __`loc`__ and __`iloc`__
* .loc is a __*label*__-based indexing method
* .iloc is an __*integer*__-based indexing method

In [27]:
data

Python    a
C++       b
Ruby      c
dtype: object

In [25]:
data.loc['Python'] # 1 here is a label, not an offset

'a'

In [26]:
data.loc['Python':'C++'] # 1 and 3 are labels, not integer offsets

Python    a
C++       b
dtype: object

In [32]:
data.iloc[1] # 1 is an offset, not a label

'b'

In [34]:
data.iloc[1:3] # 1..3 is a Python slice based on offsets

C++     b
Ruby    c
dtype: object

# Sales Data

In [36]:
dat = pd.read_csv("data/WA_Fn-UseC_-Sales-Win-Loss.csv")

In [37]:
dat.columns

Index(['Opportunity Number', 'Supplies Subgroup', 'Supplies Group', 'Region',
       'Route To Market', 'Elapsed Days In Sales Stage', 'Opportunity Result',
       'Sales Stage Change Count', 'Total Days Identified Through Closing',
       'Total Days Identified Through Qualified', 'Opportunity Amount USD',
       'Client Size By Revenue', 'Client Size By Employee Count',
       'Revenue From Client Past Two Years', 'Competitor Type',
       'Ratio Days Identified To Total Days',
       'Ratio Days Validated To Total Days',
       'Ratio Days Qualified To Total Days', 'Deal Size Category'],
      dtype='object')

In [38]:
dat['Opportunity Result']

0         Won
1        Loss
2         Won
3        Loss
4        Loss
5        Loss
6         Won
7        Loss
8        Loss
9        Loss
10       Loss
11       Loss
12       Loss
13       Loss
14       Loss
15        Won
16       Loss
17       Loss
18       Loss
19       Loss
20       Loss
21       Loss
22       Loss
23       Loss
24        Won
25       Loss
26       Loss
27       Loss
28       Loss
29       Loss
         ... 
77995    Loss
77996     Won
77997     Won
77998    Loss
77999    Loss
78000    Loss
78001     Won
78002    Loss
78003     Won
78004     Won
78005    Loss
78006     Won
78007    Loss
78008     Won
78009    Loss
78010    Loss
78011     Won
78012     Won
78013     Won
78014     Won
78015    Loss
78016     Won
78017     Won
78018     Won
78019     Won
78020    Loss
78021     Won
78022    Loss
78023    Loss
78024    Loss
Name: Opportunity Result, Length: 78025, dtype: object

# Counting Values

In [39]:
dat['Opportunity Result'].value_counts()

Loss    60398
Won     17627
Name: Opportunity Result, dtype: int64

In [40]:
dat['Supplies Group'].value_counts()

Car Accessories           49810
Performance & Non-auto    27325
Tires & Wheels              609
Car Electronics             281
Name: Supplies Group, dtype: int64

In [41]:
dat['Elapsed Days In Sales Stage'].value_counts()

16     5010
44     2388
62     1738
7      1629
23     1455
37     1412
45     1238
24     1233
35     1226
18     1220
89     1184
28     1173
26     1135
54     1124
27     1115
63     1107
49     1098
9      1058
91     1051
74     1020
81      976
64      976
41      952
65      945
0       934
47      922
73      912
17      905
84      899
90      870
       ... 
121       6
102       6
103       5
118       5
128       5
115       5
116       4
122       4
112       4
129       4
113       3
104       3
114       3
123       2
106       2
126       2
105       2
125       2
130       2
210       2
124       1
135       1
131       1
132       1
134       1
148       1
108       1
137       1
138       1
127       1
Name: Elapsed Days In Sales Stage, Length: 138, dtype: int64

# Top Five Values

In [42]:
dat['Supplies Subgroup'].value_counts()[:5]

Motorcycle Parts           15174
Exterior Accessories       13876
Garage & Car Care           9733
Shelters & RV               9606
Batteries & Accessories     9192
Name: Supplies Subgroup, dtype: int64

# Extracting Columns

In [43]:
dat.head()

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
0,1641984,Exterior Accessories,Car Accessories,Northwest,Fields Sales,76,Won,13,104,101,0,5,5,0,Unknown,0.69636,0.113985,0.154215,1
1,1658010,Exterior Accessories,Car Accessories,Pacific,Reseller,63,Loss,2,163,163,0,3,5,0,Unknown,0.0,1.0,0.0,1
2,1674737,Motorcycle Parts,Performance & Non-auto,Pacific,Reseller,24,Won,7,82,82,7750,1,1,0,Unknown,1.0,0.0,0.0,1
3,1675224,Shelters & RV,Performance & Non-auto,,Reseller,16,Loss,5,124,124,0,1,1,0,Known,1.0,0.0,0.0,1
4,1689785,Exterior Accessories,Car Accessories,Pacific,Reseller,69,Loss,11,91,13,69756,1,1,0,Unknown,0.0,0.141125,0.0,4


In [44]:
region_results = dat[["Region", "Opportunity Result"]]

In [45]:
region_results.shape

(78025, 2)

In [46]:
region_results.head()

Unnamed: 0,Region,Opportunity Result
0,Northwest,Won
1,Pacific,Loss
2,Pacific,Won
3,,Loss
4,Pacific,Loss


# Creating a DataFrame from dicts

In [47]:
presidents = pd.DataFrame([
    { 'name': 'Barack Obama', 'elect': 2008, 'born': 1961 },
    { 'name': 'George W. Bush', 'elect': 2000, 'born': 1946 },
    { 'name': 'Bill Clinton', 'elect': 1992, 'born': 1946 },
    { 'name': 'George H.W. Bush', 'elect': 1988, 'born': 1924 },
])
presidents

Unnamed: 0,born,elect,name
0,1961,2008,Barack Obama
1,1946,2000,George W. Bush
2,1946,1992,Bill Clinton
3,1924,1988,George H.W. Bush


# Setting the Index of a DataFrame

In [48]:
presidents.columns

Index(['born', 'elect', 'name'], dtype='object')

In [49]:
presidents.set_index('name', inplace=True)

In [50]:
presidents

Unnamed: 0_level_0,born,elect
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,1961,2008
George W. Bush,1946,2000
Bill Clinton,1946,1992
George H.W. Bush,1924,1988


In [52]:
presidents['born'].idxmax() # who is the youngest president?

'Barack Obama'

In [53]:
presidents['born']['Bill Clinton']

1946

In [54]:
presidents.loc['Bill Clinton']

born     1946
elect    1992
Name: Bill Clinton, dtype: int64

In [55]:
presidents.loc['Bill Clinton']['born']

1946

In [56]:
# note that we are looking at presidents dataframe here
presidents['born']
# pd.DataFrame(presidents['born'])

name
Barack Obama        1961
George W. Bush      1946
Bill Clinton        1946
George H.W. Bush    1924
Name: born, dtype: int64

In [57]:
presidents['born'][2]

1946

In [58]:
presidents.iloc[2]

born     1946
elect    1992
Name: Bill Clinton, dtype: int64

In [59]:
presidents.iloc[2]['born']

1946

In [60]:
presidents['born']['Bill Clinton']

1946

In [61]:
presidents.loc['Bill Clinton']['born']

1946

In [62]:
presidents.loc['Bill Clinton']['elect']

1992

# Merging Two DataFrames

In [63]:
presidents_dads = pd.DataFrame([
    { 'son': 'Barack Obama', 'father': 'Barack Obama, Sr.' },
    { 'son': 'George W. Bush', 'father': 'George H.W. Bush' },
    { 'son': 'George H.W. Bush', 'father': 'Prescott Bush' },
])

presidents_dads

Unnamed: 0,father,son
0,"Barack Obama, Sr.",Barack Obama
1,George H.W. Bush,George W. Bush
2,Prescott Bush,George H.W. Bush


In [64]:
presidents

Unnamed: 0_level_0,born,elect
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,1961,2008
George W. Bush,1946,2000
Bill Clinton,1946,1992
George H.W. Bush,1924,1988


In [65]:
# in order to merge, we're going to need 'name' as a column,
# but right now it's the index, so let's add it as a column too
presidents['name'] = presidents.index

In [66]:
presidents

Unnamed: 0_level_0,born,elect,name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Barack Obama,1961,2008,Barack Obama
George W. Bush,1946,2000,George W. Bush
Bill Clinton,1946,1992,Bill Clinton
George H.W. Bush,1924,1988,George H.W. Bush


In [67]:
pd.merge(presidents, presidents_dads, 
         left_on='name', right_on='son')

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,born,elect,name,father,son
0,1961,2008,Barack Obama,"Barack Obama, Sr.",Barack Obama
1,1946,2000,George W. Bush,George H.W. Bush,George W. Bush
2,1924,1988,George H.W. Bush,Prescott Bush,George H.W. Bush


In [68]:
pd.merge(presidents, presidents_dads, left_on='name',
         right_on='son').drop('son', axis=1)

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,born,elect,name,father
0,1961,2008,Barack Obama,"Barack Obama, Sr."
1,1946,2000,George W. Bush,George H.W. Bush
2,1924,1988,George H.W. Bush,Prescott Bush


In [69]:
pd.merge(presidents, presidents_dads, left_on='name',
         right_on='son', how='left').drop('son', axis=1)

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,born,elect,name,father
0,1961,2008,Barack Obama,"Barack Obama, Sr."
1,1946,2000,George W. Bush,George H.W. Bush
2,1946,1992,Bill Clinton,
3,1924,1988,George H.W. Bush,Prescott Bush


In [70]:
final = pd.merge(presidents, presidents_dads, left_on='name',
         right_on='son', how='left').drop('son', axis=1).fillna('MISSING')

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


In [71]:
final

Unnamed: 0,born,elect,name,father
0,1961,2008,Barack Obama,"Barack Obama, Sr."
1,1946,2000,George W. Bush,George H.W. Bush
2,1946,1992,Bill Clinton,MISSING
3,1924,1988,George H.W. Bush,Prescott Bush


In [72]:
final.set_index('name', inplace=True)
final

Unnamed: 0_level_0,born,elect,father
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Barack Obama,1961,2008,"Barack Obama, Sr."
George W. Bush,1946,2000,George H.W. Bush
Bill Clinton,1946,1992,MISSING
George H.W. Bush,1924,1988,Prescott Bush
