![alt text](null.png "Title")

In [None]:
import pandas as pd
import numpy as np

In [2]:
# Fundamentals: a Series built from a bare list gets a default integer index.

obj = pd.Series(data=[-4, 7, 3, 5])
obj

0   -4
1    7
2    3
3    5
dtype: int64

In [4]:
# Raw values of obj as a NumPy array (see the output below).
obj.values

array([-4,  7,  3,  5])

In [5]:
# Index of obj; no labels were supplied, so it is a default RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Same kind of Series, this time with explicit string labels as the index.
obj2 = pd.Series(data=[-3, 7, -5, 3], index=list('dbac'))
obj2

d   -3
b    7
a   -5
c    3
dtype: int64

In [7]:
# Index of obj2: the string labels passed at construction.
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [8]:
# Look up a single value by its index label.
obj2.loc['a']

-5

In [9]:
# Label lookup again, for the first entry.
obj2.loc['d']

-3

In [10]:
# A list of labels selects several entries, in the order requested.
obj2.loc[['c', 'a', 'd']]

c    3
a   -5
d   -3
dtype: int64

In [11]:
# Boolean mask: keep only the strictly positive entries (labels are preserved).
obj2.loc[obj2 > 0]

b    7
c    3
dtype: int64

In [12]:
# Element-wise arithmetic keeps the index attached to each value.
obj2.mul(2)

d    -6
b    14
a   -10
c     6
dtype: int64

In [13]:
# A Series maps index labels to values, so a plain dict converts directly:
# the keys become the index and the dict values become the data.

sdata = dict(ohio=35000, texas=14000, oregon=16000, utah=34990)
obj3 = pd.Series(data=sdata)
obj3

ohio      35000
texas     14000
oregon    16000
utah      34990
dtype: int64

In [15]:
# Build from the same dict but with an explicit index: a label missing from
# sdata ('cali') comes out as NaN and a repeated label ('ohio') repeats its value.
# Note: this rebinds obj3, replacing the Series created in the previous cell.
states = ['cali', 'ohio', 'ohio', 'texas']
obj3 = pd.Series(data=sdata, index=states)
obj3

cali         NaN
ohio     35000.0
ohio     35000.0
texas    14000.0
dtype: float64

In [16]:
# Detecting missing data: True wherever the value is NaN.

pd.isna(obj3)

cali      True
ohio     False
ohio     False
texas    False
dtype: bool

In [18]:
# The complement: True wherever a value is present.
pd.notna(obj3)

cali     False
ohio      True
ohio      True
texas     True
dtype: bool

In [19]:
# The same missing-value check is available as a method on the Series.
obj3.isna()

cali      True
ohio     False
ohio     False
texas    False
dtype: bool

In [21]:
# DataFrame basics: a dict of equal-length lists becomes one column per key.

data = {
    'state': ['ohio'] * 3 + ['nevada'] * 2,
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,ohio,2000,1.5
1,ohio,2001,1.7
2,ohio,2002,3.6
3,nevada,2001,2.4
4,nevada,2002,2.9


In [22]:
# The columns argument fixes the column order of the resulting frame.
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,ohio,1.5
1,2001,ohio,1.7
2,2002,ohio,3.6
3,2001,nevada,2.4
4,2002,nevada,2.9


In [23]:
# Custom row labels plus an extra 'debt' column: 'debt' is not a key of data,
# so the column is created empty (all missing, as the output shows).
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,
three,2002,ohio,3.6,
four,2001,nevada,2.4,
five,2002,nevada,2.9,


In [24]:
# Column labels of frame2, in the order requested at construction.
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [25]:
# Dict-style access returns one column as a Series that keeps the row labels.
frame2['state']

one        ohio
two        ohio
three      ohio
four     nevada
five     nevada
Name: state, dtype: object

In [26]:
# Attribute-style access to the 'year' column (same result type as frame2['year']).
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [32]:
# .loc selects a row by its index label; the row comes back as a Series.
frame2.loc['one']

year     2000
state    ohio
pop       1.5
debt      NaN
Name: one, dtype: object

In [35]:
# Fill the 'debt' column in place with 0..4; the array length matches the 5 rows.
frame2['debt'] = np.arange(5)

In [36]:
# Show frame2 after the 'debt' column assignment.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,0
two,2001,ohio,1.7,1
three,2002,ohio,3.6,2
four,2001,nevada,2.4,3
five,2002,nevada,2.9,4


In [37]:
# 2-D array of the frame's data; the columns have mixed types, so dtype=object.
frame2.values

array([[2000, 'ohio', 1.5, 0],
       [2001, 'ohio', 1.7, 1],
       [2002, 'ohio', 3.6, 2],
       [2001, 'nevada', 2.4, 3],
       [2002, 'nevada', 2.9, 4]], dtype=object)

In [38]:
# Dropping entries from an axis: start from a small labelled Series.

obj = pd.Series(data=np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [39]:
# drop returns a new Series without the given label; obj itself is not modified.
new_obj = obj.drop(labels='c')

In [40]:
# Show the Series that remains after dropping 'c'.
new_obj

a    0
b    1
d    3
e    4
dtype: int64

In [41]:
# Several labels can be dropped at once by passing a list.
obj.drop(labels=['d', 'c'])

a    0
b    1
e    4
dtype: int64

In [45]:
# 4x4 grid of 0..15 with state names on the rows and letters on the columns.
# Note: this rebinds `data`, which earlier held a plain dict.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['utah', 'oregon', 'cali', 'ny'],
    columns=['a', 'b', 'c', 'd'],
)
data

Unnamed: 0,a,b,c,d
utah,0,1,2,3
oregon,4,5,6,7
cali,8,9,10,11
ny,12,13,14,15


In [46]:
# Drop rows by label.
data.drop(index=['utah', 'oregon'])

Unnamed: 0,a,b,c,d
cali,8,9,10,11
ny,12,13,14,15


In [47]:
# Drop a single column by label.
data.drop(columns='a')

Unnamed: 0,b,c,d
utah,1,2,3
oregon,5,6,7
cali,9,10,11
ny,13,14,15


In [48]:
# Drop several columns at once.
data.drop(columns=['b', 'c'])

Unnamed: 0,a,d
utah,0,3
oregon,4,7
cali,8,11
ny,12,15


In [50]:
# Adding two differently indexed DataFrames: labels are aligned first, and any
# cell present in only one frame comes out as NaN (row 3 and column e here).
# The fill_value variant that avoids those NaNs is shown in the next cell.

df1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.arange(20).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])
df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [51]:
# fill_value=0 stands in for a cell that is missing from one of the frames
# before adding, so row 3 and column e are populated instead of NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [55]:
# Applying element-wise array functions: start from a 4x3 frame of normal draws.
# NOTE(review): no random seed is set in the visible cells, so these numbers
# will differ from the recorded output on every run.

frame = pd.DataFrame(
    np.random.standard_normal((4, 3)),
    index=['utah', 'oregon', 'cali', 'texas'],
    columns=['b', 'd', 'e'],
)
frame

Unnamed: 0,b,d,e
utah,-0.305941,-0.352865,0.826753
oregon,-0.995458,0.953315,-0.002231
cali,0.396779,1.063587,-0.320867
texas,-0.921026,-0.880694,0.48923


In [56]:
# A NumPy ufunc applied to a DataFrame works element-wise and keeps the labels.
np.abs(frame)

Unnamed: 0,b,d,e
utah,0.305941,0.352865,0.826753
oregon,0.995458,0.953315,0.002231
cali,0.396779,1.063587,0.320867
texas,0.921026,0.880694,0.48923


In [58]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()


# With the default axis, apply hands each column to f (one result per column).
frame.apply(f)

b    1.392237
d    1.944281
e    1.147620
dtype: float64

In [59]:
# axis=1 hands each row to f instead, giving one result per row label.
frame.apply(f, axis=1)

utah      1.179618
oregon    1.948773
cali      1.384454
texas     1.410256
dtype: float64

In [63]:
# Sorting a DataFrame by its index: build a frame whose row labels are out of order.

frame = pd.DataFrame(
    data=np.arange(8).reshape((2, 4)),
    columns=['b', 'd', 'e', 'a'],
    index=['two', 'one'],
)
frame

Unnamed: 0,b,d,e,a
two,0,1,2,3
one,4,5,6,7


In [64]:
# Returns the rows ordered by index label ('one' before 'two'); columns are untouched.
frame.sort_index()

Unnamed: 0,b,d,e,a
one,4,5,6,7
two,0,1,2,3


# Data Wrangling

In [68]:
# Two small frames sharing a 'key' column, used as inputs for the merge examples.
df1 = pd.DataFrame(dict(key=['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        data1=range(7)))
df2 = pd.DataFrame(dict(key=['a', 'b', 'd'],
                        data2=range(3)))
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [69]:
# Show the right-hand frame of the upcoming merges.
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [70]:
# With no key given, merge joins on the column name the frames share ('key');
# only keys present in both frames ('a' and 'b') appear in the result.
df1.merge(df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [71]:
# Same join with the key column named explicitly.
df1.merge(df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [72]:
# When the key columns are named differently in each frame, name them separately;
# both key columns are kept in the result.
df3 = pd.DataFrame(dict(lkey=['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        data3=range(7)))
df4 = pd.DataFrame(dict(rkey=['a', 'b', 'd'],
                        data4=range(3)))
df3.merge(df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data3,rkey,data4
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [73]:
# Outer join: keys found in only one frame ('c', 'd') are kept, with NaN on the other side.
df1.merge(df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [74]:
# Many-to-many case: keys repeat on both sides, so each left row pairs with every
# matching right row. how='left' also keeps left keys with no match ('c').
# Note: this rebinds df1 and df2 from the earlier merge examples.
df1 = pd.DataFrame(dict(key=['b', 'b', 'a', 'c', 'a', 'b'],
                        data1=range(6)))
df2 = pd.DataFrame(dict(key=['a', 'b', 'a', 'b', 'd'],
                        data2=range(5)))
df1.merge(df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [76]:
# Inner join on the shared 'key' column: only keys present in both frames survive.
df1.merge(df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [77]:
# Merging on two key columns at once: rows match only when both keys agree.
left = pd.DataFrame(dict(key1=['foo', 'foo', 'bar'],
                         key2=['one', 'two', 'one'],
                         lval=[1, 2, 3]))
right = pd.DataFrame(dict(key1=['foo', 'foo', 'bar', 'bar'],
                          key2=['one', 'one', 'one', 'two'],
                          rval=[4, 5, 6, 7]))
left.merge(right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [81]:
# Merging datasets on index: here the lookup values live in right1's index
# rather than in a column.

left1 = pd.DataFrame(dict(key=['a', 'b', 'a', 'a', 'b', 'c'],
                          value=range(6)))
right1 = pd.DataFrame(dict(group_val=[3.5, 7]),
                      index=['a', 'b'])
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [82]:
# Show the index-keyed lookup frame.
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [86]:
# Match left1's 'key' column against right1's index; 'c' has no match and is dropped.
left1.merge(right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [87]:
# Concatenating along axes: start with a plain 3x4 NumPy array.

arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [91]:
# axis=0 stacks the second copy underneath the first, giving 6 rows x 4 columns.
np.concatenate([arr, arr], axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [92]:
# Three Series with non-overlapping labels; concat glues them end to end.
s1 = pd.Series(data=[0, 1], index=list('ab'))
s2 = pd.Series(data=[2, 3, 4], index=list('cde'))
s3 = pd.Series(data=[5, 6], index=list('fg'))
pd.concat(objs=[s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [94]:
# axis=1 puts each Series in its own column; labels a Series lacks show as NaN.
pd.concat([s1, s2, s3], axis=1, sort=False)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [96]:
# s4 shares labels 'a' and 'b' with s1 (values scaled by 5) and adds s3's labels.
s4 = pd.concat([s1.mul(5), s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [100]:
# Without a join argument the result keeps every label from both inputs (a, b, f, g).
pd.concat([s1, s4], axis=1, sort=False)

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,5
g,,6


In [101]:
# join='inner' keeps only the labels common to both inputs (a and b).
pd.concat([s1, s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,5


In [102]:
# Discretization and binning: assign each age to one of the intervals whose
# edges are listed in bins (intervals are open on the left, closed on the right).

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [104]:
# The interval bins that pd.cut produced, as an IntervalIndex.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [107]:
# Integer code of the bin each age fell into (its position in cats.categories).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [108]:
# Count how many ages landed in each bin, most frequent first.
# The top-level pd.value_counts() is deprecated in recent pandas; wrapping the
# Categorical in a Series and calling .value_counts() gives the same counts.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [109]:
# Give the bins readable names instead of interval labels (one name per bin, in order).
group_names = ['young', 'youngadult', 'adult', 'senior']
pd.cut(x=ages, bins=bins, labels=group_names)

[young, young, young, youngadult, young, ..., youngadult, senior, adult, adult, youngadult]
Length: 12
Categories (4, object): [young < youngadult < adult < senior]

In [111]:
# Quantile-based binning: qcut with 4 picks the edges so each bin holds a quarter
# of the sample. Note: this rebinds `data` and `cats` from earlier cells.
# NOTE(review): no random seed is set in the visible cells, so the bin edges
# will differ from the recorded output on every run.
data = np.random.standard_normal(1000)
cats = pd.qcut(x=data, q=4)
cats

[(-0.587, 0.0351], (0.0351, 0.698], (-0.587, 0.0351], (0.0351, 0.698], (0.698, 2.881], ..., (0.0351, 0.698], (-3.131, -0.587], (-3.131, -0.587], (-3.131, -0.587], (-0.587, 0.0351]]
Length: 1000
Categories (4, interval[float64]): [(-3.131, -0.587] < (-0.587, 0.0351] < (0.0351, 0.698] < (0.698, 2.881]]

In [112]:
# Confirm that each quantile bin holds the same number of points.
# The top-level pd.value_counts() is deprecated in recent pandas; wrapping the
# Categorical in a Series and calling .value_counts() gives the same counts.
pd.Series(cats).value_counts()

(0.698, 2.881]      250
(0.0351, 0.698]     250
(-0.587, 0.0351]    250
(-3.131, -0.587]    250
dtype: int64

In [113]:
# Dummy (indicator) encoding of a string column: one output column per distinct
# value, marking the rows where that value occurs.

df = pd.DataFrame(dict(key=['b', 'b', 'a', 'c', 'a', 'b'],
                       data1=range(6)))
pd.get_dummies(data=df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [115]:
# joining actual data with dummy columns

# prefix='key' names the indicator columns key_a, key_b, key_c.
dummies = pd.get_dummies(df['key'], prefix='key')
# df[['data1']] (double brackets) keeps data1 as a one-column DataFrame so that
# the indicator columns can be joined onto it row by row.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [116]:
# Grouped summaries with groupby: two key columns plus two columns of normal draws.
# Note: this rebinds `df` from the dummy-encoding example.
# NOTE(review): no random seed is set in the visible cells, so data1/data2
# will differ from the recorded output on every run.

df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.standard_normal(5),
    'data2': np.random.standard_normal(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.023729,2.09102
1,a,two,-0.002829,-0.028943
2,b,one,0.859461,-0.184776
3,b,two,0.438793,0.161739
4,a,one,1.387854,0.083728


In [117]:
# Group the data1 column by the values in key1; the aggregation itself
# is requested in the next cell.
grouped = df['data1'].groupby(df['key1'])

In [118]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a    0.469585
b    0.649127
Name: data1, dtype: float64

In [119]:
# Group by the (key1, key2) pair; the resulting Series has a two-level index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [120]:
# Show the per-(key1, key2) means.
means

key1  key2
a     one     0.705792
      two    -0.002829
b     one     0.859461
      two     0.438793
Name: data1, dtype: float64

In [121]:
# Pivot the inner index level (key2) into columns, leaving key1 on the rows.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.705792,-0.002829
b,0.859461,0.438793


# Reiterating time series fundamentals

In [122]:
# 9x4 array of standard-normal draws, shown rounded to 6 decimals
# (rounding affects only the display; `a` keeps full precision).
# NOTE(review): no random seed is set in the visible cells, so these numbers
# will differ from the recorded output on every run.
a = np.random.standard_normal(size=(9, 4))
np.round(a, 6)

array([[-0.198338,  1.046148, -1.33085 , -0.088516],
       [ 1.229988,  0.242111, -0.213885, -0.442962],
       [ 0.975946,  1.055299, -0.322172,  0.900755],
       [-1.52225 ,  0.849127,  0.520433, -1.205591],
       [ 0.875398, -1.295689,  0.002739, -0.439649],
       [-0.76426 , -1.058431,  0.641963, -0.959755],
       [ 0.103998, -0.876935,  1.017916, -0.267087],
       [ 0.284545,  0.294717,  1.075   ,  1.746698],
       [-0.707951,  0.045374, -0.702073, -0.320826]])

In [124]:
# Wrap the array in a DataFrame; rows and columns get default integer labels.
df = pd.DataFrame(a)
df

Unnamed: 0,0,1,2,3
0,-0.198338,1.046148,-1.33085,-0.088516
1,1.229988,0.242111,-0.213885,-0.442962
2,0.975946,1.055299,-0.322172,0.900755
3,-1.52225,0.849127,0.520433,-1.205591
4,0.875398,-1.295689,0.002739,-0.439649
5,-0.76426,-1.058431,0.641963,-0.959755
6,0.103998,-0.876935,1.017916,-0.267087
7,0.284545,0.294717,1.075,1.746698
8,-0.707951,0.045374,-0.702073,-0.320826


In [132]:
# Replace the default integer column labels with readable names (modifies df in place).
df.columns = 'No1 No2 No3 No4'.split()
df

Unnamed: 0,No1,No2,No3,No4
0,-0.198338,1.046148,-1.33085,-0.088516
1,1.229988,0.242111,-0.213885,-0.442962
2,0.975946,1.055299,-0.322172,0.900755
3,-1.52225,0.849127,0.520433,-1.205591
4,0.875398,-1.295689,0.002739,-0.439649
5,-0.76426,-1.058431,0.641963,-0.959755
6,0.103998,-0.876935,1.017916,-0.267087
7,0.284545,0.294717,1.075,1.746698
8,-0.707951,0.045374,-0.702073,-0.320826


In [134]:
# Single value at row label 3 of column 'No1'. One .loc call with both labels
# replaces the chained df['No1'][3] lookup.
# NOTE(review): assumes df still has its default integer index at this point
# (the date index is only assigned in a later cell) — confirm.
df.loc[3, 'No1']

-1.5222502941194003

In [135]:
# Create a datetime index using date_range: nine consecutive month-end dates
# starting with the first month-end on or after 2015-01-01.
# The frequency is given as an offset object rather than the string alias 'M',
# because that alias is deprecated in recent pandas (renamed 'ME'); the offset
# object behaves the same on old and new versions.

dates = pd.date_range('2015-1-1', periods=9, freq=pd.offsets.MonthEnd())
dates

DatetimeIndex(['2015-01-31', '2015-02-28', '2015-03-31', '2015-04-30',
               '2015-05-31', '2015-06-30', '2015-07-31', '2015-08-31',
               '2015-09-30'],
              dtype='datetime64[ns]', freq='M')

In [136]:
# Replace the default integer row labels with the nine month-end dates
# (modifies df in place; the lengths match, 9 rows and 9 dates).
df.index = dates
df

Unnamed: 0,No1,No2,No3,No4
2015-01-31,-0.198338,1.046148,-1.33085,-0.088516
2015-02-28,1.229988,0.242111,-0.213885,-0.442962
2015-03-31,0.975946,1.055299,-0.322172,0.900755
2015-04-30,-1.52225,0.849127,0.520433,-1.205591
2015-05-31,0.875398,-1.295689,0.002739,-0.439649
2015-06-30,-0.76426,-1.058431,0.641963,-0.959755
2015-07-31,0.103998,-0.876935,1.017916,-0.267087
2015-08-31,0.284545,0.294717,1.075,1.746698
2015-09-30,-0.707951,0.045374,-0.702073,-0.320826


In [137]:
# Back to a plain NumPy array (labels are discarded), rounded for display.
df.values.round(6)

array([[-0.198338,  1.046148, -1.33085 , -0.088516],
       [ 1.229988,  0.242111, -0.213885, -0.442962],
       [ 0.975946,  1.055299, -0.322172,  0.900755],
       [-1.52225 ,  0.849127,  0.520433, -1.205591],
       [ 0.875398, -1.295689,  0.002739, -0.439649],
       [-0.76426 , -1.058431,  0.641963, -0.959755],
       [ 0.103998, -0.876935,  1.017916, -0.267087],
       [ 0.284545,  0.294717,  1.075   ,  1.746698],
       [-0.707951,  0.045374, -0.702073, -0.320826]])

In [138]:
# Column-wise totals.
df.sum()

No1    0.277076
No2    0.301720
No3    0.689071
No4   -1.076935
dtype: float64

In [139]:
# Column-wise means.
df.mean()

No1    0.030786
No2    0.033524
No3    0.076563
No4   -0.119659
dtype: float64

In [140]:
# Running total down each column; the last row equals df.sum().
df.cumsum()

Unnamed: 0,No1,No2,No3,No4
2015-01-31,-0.198338,1.046148,-1.33085,-0.088516
2015-02-28,1.03165,1.288259,-1.544735,-0.531479
2015-03-31,2.007596,2.343558,-1.866907,0.369277
2015-04-30,0.485345,3.192685,-1.346475,-0.836315
2015-05-31,1.360743,1.896996,-1.343735,-1.275964
2015-06-30,0.596483,0.838564,-0.701772,-2.235719
2015-07-31,0.700481,-0.038371,0.316144,-2.502806
2015-08-31,0.985027,0.256346,1.391144,-0.756108
2015-09-30,0.277076,0.30172,0.689071,-1.076935


In [141]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,No1,No2,No3,No4
count,9.0,9.0,9.0,9.0
mean,0.030786,0.033524,0.076563,-0.119659
std,0.91902,0.910674,0.808981,0.913758
min,-1.52225,-1.295689,-1.33085,-1.205591
25%,-0.707951,-0.876935,-0.322172,-0.442962
50%,0.103998,0.242111,0.002739,-0.320826
75%,0.875398,0.849127,0.641963,-0.088516
max,1.229988,1.055299,1.075,1.746698


In [142]:
# Tag each of the nine monthly rows with its quarter: three rows each for Q1, Q2, Q3
# (adds a column to df in place).
df['quarter'] = [label for label in ('Q1', 'Q2', 'Q3') for _ in range(3)]
df

Unnamed: 0,No1,No2,No3,No4,quarter
2015-01-31,-0.198338,1.046148,-1.33085,-0.088516,Q1
2015-02-28,1.229988,0.242111,-0.213885,-0.442962,Q1
2015-03-31,0.975946,1.055299,-0.322172,0.900755,Q1
2015-04-30,-1.52225,0.849127,0.520433,-1.205591,Q2
2015-05-31,0.875398,-1.295689,0.002739,-0.439649,Q2
2015-06-30,-0.76426,-1.058431,0.641963,-0.959755,Q2
2015-07-31,0.103998,-0.876935,1.017916,-0.267087,Q3
2015-08-31,0.284545,0.294717,1.075,1.746698,Q3
2015-09-30,-0.707951,0.045374,-0.702073,-0.320826,Q3


In [143]:
# Average every numeric column within each quarter.
groups = df.groupby('quarter')
groups.mean()

Unnamed: 0_level_0,No1,No2,No3,No4
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q1,0.669199,0.781186,-0.622302,0.123092
Q2,-0.470371,-0.501665,0.388378,-0.868332
Q3,-0.106469,-0.178948,0.463614,0.386262
