# Demo - Pandas

In [1]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [2]:
# view raw values
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [3]:
# view index
data.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
# [] with a single integer looks up the *label* 1; with the default RangeIndex
# labels and positions coincide, so this reads just like indexing a Python list
data[1]

0.5

In [5]:
# Slicing with [] is positional, like slicing a Python list:
# the start offset (1) is included and the stop offset (3) is excluded,
# so this returns the elements at offsets 1 and 2
data[1:3]

1    0.50
2    0.75
dtype: float64

In [6]:
# create a series with non-integer indices
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [7]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
# similar to dict indexing
data['c']

0.75

In [10]:
# index labels can be arbitrary integers: they need not start at 0,
# be sorted, or be contiguous
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [11]:
# 5 is matched against the index labels, not used as an offset
# (the Series has only four elements, so there is no offset 5)
data[5]

0.5

# Implicit and Explicit Indexing

In [14]:
# Integer labels that are neither contiguous nor unique: the label 3 appears twice
data = pd.Series(
    ['a', 'b', 'd', 'c'],
    index=[1, 3, 3, 5],
)
data

1    a
3    b
3    d
5    c
dtype: object

In [15]:
# index is no longer a range
# indices are like keys in a dictionary
data.index

Int64Index([1, 3, 3, 5], dtype='int64')

In [17]:
data[3] # 3 is looked up as a label, not an offset; the label occurs twice, so a Series is returned

3    b
3    d
dtype: object

In [18]:
# but slicing with [] is still positional, even though data[3] above was a label lookup
data[1:3] # offsets 1 and 2 (the stop offset 3 is excluded)

3    b
3    d
dtype: object

# __`loc`__ and __`iloc`__
* .loc is a __*label*__-based indexing method
* .iloc is an __*integer position*__-based indexing method (offsets 0..n-1, regardless of what the index labels are)

In [19]:
data

1    a
3    b
3    d
5    c
dtype: object

In [20]:
data.loc[1] # 1 here is a label, not an offset

'a'

In [21]:
data.loc[1:3] # 1 and 3 are labels, not integer offsets; unlike a Python slice, the end label 3 is included

1    a
3    b
3    d
dtype: object

In [22]:
data.iloc[1] # 1 is an offset, not a label

'b'

In [23]:
data.iloc[1:3] # offsets 1 and 2; as with any Python slice, the stop offset 3 is excluded

3    b
3    d
dtype: object

# Python Dicts as Series

In [24]:
# State populations keyed by state name.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# pandas versions differ on whether a dict's keys are sorted or kept in insertion
# order when building a Series. Sorting by label explicitly makes the row order
# deterministic, so the later label slice population['California':'Illinois']
# selects the alphabetical range California..Illinois on every version.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [25]:
population['California']

38332521

In [26]:
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

# Pandas DataFrame

In [27]:
# State areas keyed by state name.
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
# Sort by label so the row order is alphabetical regardless of whether the
# installed pandas sorts dict keys or keeps their insertion order.
area = pd.Series(area_dict).sort_index()
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [28]:
# Combine the two Series into one DataFrame with one column per Series,
# both labelled by state name.
# The dict keys are listed alphabetically and the rows are sorted by label so the
# layout (columns: area, population; rows: alphabetical) is the same whether the
# installed pandas sorts dict keys or keeps their insertion order.
states = pd.DataFrame({'area': area,
                       'population': population}).sort_index()
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [29]:
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [30]:
states.columns

Index(['area', 'population'], dtype='object')

In [31]:
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [32]:
states.values

array([[  423967, 38332521],
       [  170312, 19552860],
       [  149995, 12882135],
       [  141297, 19651127],
       [  695662, 26448193]])

In [33]:
# get descriptive statistics
states.describe()

Unnamed: 0,area,population
count,5.0,5.0
mean,316246.6,23373370.0
std,242437.411951,9640386.0
min,141297.0,12882140.0
25%,149995.0,19552860.0
50%,170312.0,19651130.0
75%,423967.0,26448190.0
max,695662.0,38332520.0


# Sales Data

In [34]:
# Load the sales win/loss CSV into a DataFrame; the path is relative, so it is
# resolved against the kernel's current working directory
dat = pd.read_csv("data/WA_Fn-UseC_-Sales-Win-Loss.csv")

In [35]:
dat.columns

Index(['Opportunity Number', 'Supplies Subgroup', 'Supplies Group', 'Region',
       'Route To Market', 'Elapsed Days In Sales Stage', 'Opportunity Result',
       'Sales Stage Change Count', 'Total Days Identified Through Closing',
       'Total Days Identified Through Qualified', 'Opportunity Amount USD',
       'Client Size By Revenue', 'Client Size By Employee Count',
       'Revenue From Client Past Two Years', 'Competitor Type',
       'Ratio Days Identified To Total Days',
       'Ratio Days Validated To Total Days',
       'Ratio Days Qualified To Total Days', 'Deal Size Category'],
      dtype='object')

In [36]:
dat['Opportunity Result']

0         Won
1        Loss
2         Won
3        Loss
4        Loss
5        Loss
6         Won
7        Loss
8        Loss
9        Loss
10       Loss
11       Loss
12       Loss
13       Loss
14       Loss
15        Won
16       Loss
17       Loss
18       Loss
19       Loss
20       Loss
21       Loss
22       Loss
23       Loss
24        Won
25       Loss
26       Loss
27       Loss
28       Loss
29       Loss
         ... 
77995    Loss
77996     Won
77997     Won
77998    Loss
77999    Loss
78000    Loss
78001     Won
78002    Loss
78003     Won
78004     Won
78005    Loss
78006     Won
78007    Loss
78008     Won
78009    Loss
78010    Loss
78011     Won
78012     Won
78013     Won
78014     Won
78015    Loss
78016     Won
78017     Won
78018     Won
78019     Won
78020    Loss
78021     Won
78022    Loss
78023    Loss
78024    Loss
Name: Opportunity Result, Length: 78025, dtype: object

# Counting Values

In [41]:
dat['Opportunity Result'].value_counts()

Loss    60398
Won     17627
Name: Opportunity Result, dtype: int64

In [42]:
dat['Supplies Group'].value_counts()

Car Accessories           49810
Performance & Non-auto    27325
Tires & Wheels              609
Car Electronics             281
Name: Supplies Group, dtype: int64

In [43]:
dat['Elapsed Days In Sales Stage'].value_counts()

16     5010
44     2388
62     1738
7      1629
23     1455
37     1412
45     1238
24     1233
35     1226
18     1220
89     1184
28     1173
26     1135
54     1124
27     1115
63     1107
49     1098
9      1058
91     1051
74     1020
81      976
64      976
41      952
65      945
0       934
47      922
73      912
17      905
84      899
90      870
       ... 
121       6
102       6
103       5
118       5
128       5
115       5
116       4
122       4
112       4
129       4
113       3
104       3
114       3
123       2
106       2
126       2
105       2
125       2
130       2
210       2
124       1
135       1
131       1
132       1
134       1
148       1
108       1
137       1
138       1
127       1
Name: Elapsed Days In Sales Stage, Length: 138, dtype: int64

# Top Five Values

In [44]:
(
    dat['Supplies Subgroup']
    .value_counts()  # counts come back ordered from most to least frequent
    .head(5)         # so the first five rows are the five most common subgroups
)

Motorcycle Parts           15174
Exterior Accessories       13876
Garage & Car Care           9733
Shelters & RV               9606
Batteries & Accessories     9192
Name: Supplies Subgroup, dtype: int64

# Extracting Columns

In [45]:
dat.head()

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
0,1641984,Exterior Accessories,Car Accessories,Northwest,Fields Sales,76,Won,13,104,101,0,5,5,0,Unknown,0.69636,0.113985,0.154215,1
1,1658010,Exterior Accessories,Car Accessories,Pacific,Reseller,63,Loss,2,163,163,0,3,5,0,Unknown,0.0,1.0,0.0,1
2,1674737,Motorcycle Parts,Performance & Non-auto,Pacific,Reseller,24,Won,7,82,82,7750,1,1,0,Unknown,1.0,0.0,0.0,1
3,1675224,Shelters & RV,Performance & Non-auto,,Reseller,16,Loss,5,124,124,0,1,1,0,Known,1.0,0.0,0.0,1
4,1689785,Exterior Accessories,Car Accessories,Pacific,Reseller,69,Loss,11,91,13,69756,1,1,0,Unknown,0.0,0.141125,0.0,4


In [46]:
# Indexing with a *list* of column names returns a DataFrame holding just those
# columns (a single name, as in dat['Region'], would return a Series instead)
region_results = dat[["Region", "Opportunity Result"]]

In [47]:
region_results.shape

(78025, 2)

In [48]:
region_results.head()

Unnamed: 0,Region,Opportunity Result
0,Northwest,Won
1,Pacific,Loss
2,Pacific,Won
3,,Loss
4,Pacific,Loss


# Setting the Index
* Often the index of a DataFrame carries no meaning of its own (e.g., the default 0..n-1 numeric index or a sequence number); in that case a more informative column can be promoted to serve as the index

In [49]:
# Build a DataFrame from a list of records (one dict per row).
# The column order is pinned explicitly: depending on the pandas version, columns
# inferred from dict keys come out either sorted or in insertion order.
# NOTE(review): George Washington was born in 1732; the 1724 below looks like a
# data-entry error. It is left unchanged so the displayed outputs stay in sync.
presidents = pd.DataFrame(
    [
        {'name': 'James Madison', 'elect': 1808, 'born': 1751},
        {'name': 'Thomas Jefferson', 'elect': 1800, 'born': 1743},
        {'name': 'John Adams', 'elect': 1796, 'born': 1735},
        {'name': 'George Washington', 'elect': 1788, 'born': 1724},
    ],
    columns=['born', 'elect', 'name'],
)
presidents

Unnamed: 0,born,elect,name
0,1751,1808,James Madison
1,1743,1800,Thomas Jefferson
2,1735,1796,John Adams
3,1724,1788,George Washington


In [50]:
# The default 0..n-1 numeric index carries no information here,
# so use the 'name' column as the row labels instead.
# set_index returns a new DataFrame; rebinding the result replaces inplace=True,
# which mutates the existing object and cannot be used in a method chain.
presidents = presidents.set_index('name')

In [51]:
presidents

Unnamed: 0_level_0,born,elect
name,Unnamed: 1_level_1,Unnamed: 2_level_1
James Madison,1751,1808
Thomas Jefferson,1743,1800
John Adams,1735,1796
George Washington,1724,1788
