# Basic Data Processing with Pandas

In [1]:
import pandas as pd

In [10]:
# Build a Series from a plain list of strings. pandas assigns a default
# integer index (0, 1, 2) and stores the values with dtype "object".
animals = ["Tiger", "Bear", "Moose"]
pd.Series(data=animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [11]:
# A list of integers gets a numeric dtype (int64 in the output shown)
# instead of the generic "object" dtype used for strings.
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [12]:
# A missing entry in a list of strings is kept as None; the dtype stays
# "object".
animals = ["Tiger", "Bear", None]
pd.Series(data=animals)

0    Tiger
1     Bear
2     None
dtype: object

In [13]:
# A missing entry in a list of numbers is converted to NaN, which makes the
# whole Series float64 (1 and 2 are shown as 1.0 and 2.0 in the output).
numbers = [1, 2, None]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [14]:
# NaN is a floating-point "not a number" marker, not the same thing as None,
# so an equality test against None evaluates to False.
import numpy as np
np.nan == None

False

In [15]:
# Use np.isnan() to detect NaN; the equality test in the previous cell
# returned False.
np.isnan(np.nan)

True

In [23]:
# Passing a dict gives labeled data: the keys become the index labels and
# the values become the Series values.
sports = {
    "Archery": "Bhutan",
    "Golf": "Scotland",
    "Sumo": "Japan",
    "Taekwondo": "South Korea",
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [24]:
# The .index attribute holds the labels; for this Series they are the keys
# of the sports dict.
s.index

Index([u'Archery', u'Golf', u'Sumo', u'Taekwondo'], dtype='object')

In [26]:
# Labels can also be supplied explicitly through index=, one label per value.
s = pd.Series(
    data=["Tiger", "Bear", "Moose"],
    index=["India", "America", "Canada"],
)
s

India      Tiger
America     Bear
Canada     Moose
dtype: object

## Querying a Series

In [28]:
# Rebuild the sports Series for the lookup examples below (s was reassigned
# to the animals Series in the previous section).
sports = {
    "Archery": "Bhutan",
    "Golf": "Scotland",
    "Sumo": "Japan",
    "Taekwondo": "South Korea",
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [31]:
# Positional lookup: .iloc counts from 0, so 3 is the fourth (last) entry.
s.iloc[3]

'South Korea'

In [32]:
# Label lookup: .loc selects the entry whose index label is 'Golf'.
s.loc['Golf']

'Scotland'

In [33]:
# Plain [] with a label returns the same value as .loc['Golf'] above.
s['Golf']

'Scotland'

In [34]:
# A small float Series used by the two summing examples that follow.
s = pd.Series(data=[100.00, 120.00, 101.00, 3.00])
s

0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [35]:
# Sum the Series with an explicit Python loop. This is the slow approach:
# every element is handled by one interpreter-level iteration.
total = 0
for item in s:
    total = total + item
print(total)

324.0


In [37]:
# Sum the Series with numpy instead of a Python loop (fast): np.sum handles
# the whole Series in a single call.
# numpy was already imported in an earlier cell; the import is repeated here.
import numpy as np

total = np.sum(s)
print(total)

324.0


In [41]:
# Larger dataset for the speed tests: 10,000 random integers in [0, 1000).
# No seed is set, so the values differ from run to run.
s = pd.Series(np.random.randint(low=0, high=1000, size=10000))

# Preview the first five entries.
s.head(5)

0    750
1     23
2    678
3    492
4    637
dtype: int64

In [42]:
# Confirm the Series holds the 10,000 values requested above.
len(s)

10000

In [43]:
%%timeit -n 100
# Baseline timing: sum the Series with a pure-Python loop.
summary = 0
for item in s:
    summary += item

100 loops, best of 3: 2.11 ms per loop


In [44]:
%%timeit -n 100
# Same sum computed with np.sum; compare the timing with the loop above.
summary = np.sum(s)

100 loops, best of 3: 35.6 µs per loop


In [45]:
# Broadcasting: the scalar 2 is added to every element in a single
# operation, with no explicit loop.
s += 2
s.head()

0    752
1     25
2    680
3    494
4    639
dtype: int64

In [46]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))

for label, value in s.iteritems():
    s.loc[label] = value + 2

10 loops, best of 3: 907 ms per loop


In [47]:
%%timeit -n 10
# Fast way to add 2 to every element: one broadcast operation on the whole
# Series. The Series is rebuilt inside the timed code, as in the loop version.
s = pd.Series(np.random.randint(0, 1000, 10000))
s += 2

10 loops, best of 3: 476 µs per loop


In [48]:
# Assigning to a label that does not exist yet appends a new entry.
# Labels and values may mix types; the result shown has dtype "object".
s = pd.Series(data=[1, 2, 3])
s.loc["Animal"] = "Bears"
s

0             1
1             2
2             3
Animal    Bears
dtype: object

## The DataFrame Data Structure

In [49]:
import pandas as pd

In [53]:
# Each purchase is a Series. Passing a list of them to DataFrame makes each
# Series one row, and the Series labels become the columns.
purchase_1 = pd.Series({"Name": "Chris", "Item": "Dog Food", "Cost": 22.50})
purchase_2 = pd.Series({"Name": "Kevyn", "Item": "Kitten Litter", "Cost": 2.50})
purchase_3 = pd.Series({"Name": "Vinod", "Item": "Bird Seed", "Cost": 5.00})

# Row labels do not have to be unique: 'Store 1' is used for two rows.
df = pd.DataFrame(
    [purchase_1, purchase_2, purchase_3],
    index=["Store 1", "Store 1", "Store 2"],
)
df.head()

Unnamed: 0,Cost,Item,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitten Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [54]:
# 'Store 2' labels exactly one row, so .loc returns that row as a Series.
df.loc['Store 2']

Cost            5
Item    Bird Seed
Name        Vinod
Name: Store 2, dtype: object

In [55]:
# 'Store 1' labels two rows, so .loc returns a DataFrame containing both.
df.loc['Store 1']

Unnamed: 0,Cost,Item,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitten Litter,Kevyn


In [62]:
# Select one column by name; the result is a Series that keeps the row labels.
df['Item']

Store 1         Dog Food
Store 1    Kitten Litter
Store 2        Bird Seed
Name: Item, dtype: object

In [64]:
# .loc[rows, columns]: ':' keeps every row, and the list selects the columns
# in the order given.
df.loc[:, ['Name', 'Cost']]

Unnamed: 0,Name,Cost
Store 1,Chris,22.5
Store 1,Kevyn,2.5
Store 2,Vinod,5.0


In [66]:
# drop() returns a new DataFrame without the 'Store 1' rows; df itself is
# left unchanged, as the next cell shows.
df.drop('Store 1')

Unnamed: 0,Cost,Item,Name
Store 2,5.0,Bird Seed,Vinod


In [67]:
# df still contains the 'Store 1' rows: the drop() call above did not modify it.
df

Unnamed: 0,Cost,Item,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitten Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [69]:
# To keep the result of drop(), bind it to a name. Working on a copy leaves
# the original df intact for the later examples.
df_copy = df.copy().drop("Store 1")
df_copy

Unnamed: 0,Cost,Item,Name
Store 2,5.0,Bird Seed,Vinod


In [70]:
# Assigning a scalar to a new column name adds that column to df, with the
# value (None here) repeated for every row.
df['Location'] = None
df

Unnamed: 0,Cost,Item,Name,Location
Store 1,22.5,Dog Food,Chris,
Store 1,2.5,Kitten Litter,Kevyn,
Store 2,5.0,Bird Seed,Vinod,


In [71]:
# del removes the 'Name' column in place. This acts on df_copy, not on df.
del df_copy['Name']
df_copy

Unnamed: 0,Cost,Item
Store 2,5.0,Bird Seed


In [72]:
# Apply a 20% discount: multiply the whole Cost column by 0.8 in one
# vectorized step. The update is in place, so re-running this cell applies
# the discount again.
df['Cost'] *= 0.8
df

Unnamed: 0,Cost,Item,Name,Location
Store 1,18.0,Dog Food,Chris,
Store 1,2.0,Kitten Litter,Kevyn,
Store 2,4.0,Bird Seed,Vinod,
