In [3]:
import pandas as pd
import numpy as np

In [4]:
# 5.1 Introduction to pandas Data Structures

In [5]:
### Series

In [6]:
obj = pd.Series([4, 7, -5, 3])

In [7]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

And so on...

In [8]:
obj.values

array([ 4,  7, -5,  3])

In [9]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
obj2 = pd.Series([4,7,-5,3], index=['d', 'b', 'a', 'c'])

In [11]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [12]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [13]:
obj2['a']

-5

In [14]:
obj2['d']

4

In [15]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [16]:
obj2[obj2 >0]

d    4
b    7
c    3
dtype: int64

In [17]:
obj2 *2

d     8
b    14
a   -10
c     6
dtype: int64

In [18]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [19]:
'b' in obj2

True

In [20]:
'e' in obj2

False

In [21]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [22]:
obj3 = pd.Series(sdata)

In [23]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [24]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [25]:
obj4 = pd.Series(sdata, index=states)

In [26]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [27]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [28]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [29]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [30]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [31]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [32]:
obj3 + obj4 

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [33]:
obj4.name = 'population'

In [34]:
obj4.index.name = 'state'

In [35]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [36]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [37]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [38]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [39]:
 data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002, 2003],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [40]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [42]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [43]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
       ....:                       index=['one', 'two', 'three', 'four',
       ....:                              'five', 'six'])

In [44]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [45]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [46]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [47]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [48]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [49]:
frame2['debt'] = 16.5

In [50]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [51]:
frame2['debt'] = np.arange(6.)

In [52]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [53]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [54]:
frame2['debt'] = val

In [55]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [56]:
frame2['eastern'] = frame2.state == 'Ohio'

In [57]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [58]:
del frame2['eastern']

In [59]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [60]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [61]:
frame3 = pd.DataFrame(pop)

In [62]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [63]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [64]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [65]:
pdata = {'Ohio': frame3['Ohio'][:-1], 'Nevada': frame3['Nevada'][:2]}

In [66]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [67]:
frame3.index.name = 'year'; frame3.columns.name = 'state'

In [68]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [69]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [70]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [71]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [72]:
index = obj.index

In [73]:
index

Index(['a', 'b', 'c'], dtype='object')

In [74]:
index[1:]

Index(['b', 'c'], dtype='object')

index[1] = 'd'

In [75]:
labels = pd.Index(np.arange(3))

In [76]:
labels

Index([0, 1, 2], dtype='int64')

In [77]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [78]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [79]:
'Ohio' in frame3.columns

True

In [80]:
2003 in frame3.index

False

In [81]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [82]:
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [83]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [84]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [85]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [86]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])


In [87]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [88]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [89]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
....:                index=['a', 'c', 'd'],
....:                columns=['Ohio', 'Texas', 'California'])

In [90]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [91]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [92]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [93]:
states = ['Texas', 'Utah', 'California']

In [94]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [95]:
frame.loc[['a', 'b', 'c', 'd'], states]

KeyError: "['b'] not in index"

In [96]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [97]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [98]:
new_obj = obj.drop('c')

In [99]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [100]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [101]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [102]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [103]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [104]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [105]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [106]:
obj.drop('c', inplace=True)

In [107]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [108]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [109]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [110]:
obj['b']

1.0

In [111]:
obj[1]

  obj[1]


1.0

In [112]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [113]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [114]:
obj[[1, 3]]

  obj[[1, 3]]


b    1.0
d    3.0
dtype: float64

In [115]:
obj[obj <2]

a    0.0
b    1.0
dtype: float64

In [116]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [117]:
obj['b':'c']=5

In [118]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [120]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
       .....:                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
       .....:                     columns=['one', 'two', 'three', 'four'])

In [121]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [122]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [124]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [126]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [127]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [128]:
data <5 

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [129]:
data[data <5] =0

In [130]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [131]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [132]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [133]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [134]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [135]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [136]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [137]:
ser = pd.Series(np.arange(3.))

In [138]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [139]:
ser[-1]

KeyError: -1

In [140]:
ser2 = pd.Series(np.arange(3. ), index=['a', 'b', 'c'])

In [141]:
ser2[-1]

  ser2[-1]


2.0

In [142]:
ser[:1]

0    0.0
dtype: float64

In [143]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [144]:
ser.iloc[:1]

0    0.0
dtype: float64

In [145]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])

In [146]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
       .....:                index=['a', 'c', 'e', 'f', 'g'])

In [147]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [148]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [149]:
s1 +s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [150]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
       .....:                    index=['Ohio', 'Texas', 'Colorado'])

In [151]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
       .....:                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [152]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [153]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [154]:
df1+ df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [155]:
df1 = pd.DataFrame({'A': [1, 2]})

In [156]:
df2 = pd.DataFrame({'B': [3, 4]})

In [157]:
df1

Unnamed: 0,A
0,1
1,2


In [158]:
df2

Unnamed: 0,B
0,3
1,4


In [159]:
df1 -df2

Unnamed: 0,A,B
0,,
1,,


In [160]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
       .....:                    columns=list('abcd'))

In [161]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
       .....:                    columns=list('abcde'))

In [162]:
df2.loc[1, 'b']= np.nan

In [163]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [164]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [165]:
df2.loc[1, 'b']=np.nan

In [166]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [167]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [168]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [169]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [171]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [172]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [173]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [174]:
arr=np.arange(12.).reshape((3, 4))

In [175]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [176]:
arr[0]

array([0., 1., 2., 3.])

In [177]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [185]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
       .....:                      columns=list('bde'),
       .....:                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [186]:
series = frame.iloc[0]

In [187]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [188]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [189]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [190]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])

In [191]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [193]:
series3 = frame['d']

In [194]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [195]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [196]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [197]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
       .....:                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [198]:
frame

Unnamed: 0,b,d,e
Utah,0.741556,0.922232,-1.01929
Ohio,-0.119483,0.111091,0.359538
Texas,0.127565,0.233578,0.259418
Oregon,-0.656907,0.749551,-1.053115


In [199]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.741556,0.922232,1.01929
Ohio,0.119483,0.111091,0.359538
Texas,0.127565,0.233578,0.259418
Oregon,0.656907,0.749551,1.053115


In [200]:
f= lambda x: x.max() - x.min()

In [201]:
frame.apply(f)

b    1.398463
d    0.811141
e    1.412654
dtype: float64

In [202]:
frame.apply(f, axis='columns')

Utah      1.941522
Ohio      0.479021
Texas     0.131853
Oregon    1.802667
dtype: float64

In [204]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [205]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.656907,0.111091,-1.053115
max,0.741556,0.922232,0.359538


In [206]:
format = lambda x: '%.2f' % x

In [207]:
frame.applymap(format)

  frame.applymap(format)


Unnamed: 0,b,d,e
Utah,0.74,0.92,-1.02
Ohio,-0.12,0.11,0.36
Texas,0.13,0.23,0.26
Oregon,-0.66,0.75,-1.05


In [208]:
frame['e'].map(format)

Utah      -1.02
Ohio       0.36
Texas      0.26
Oregon    -1.05
Name: e, dtype: object

In [210]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [211]:
obj.sort_index()


a    1
b    2
c    3
d    0
dtype: int64

In [212]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
       .....:                      index=['three', 'one'],
       .....:                      columns=['d', 'a', 'b', 'c'])

In [213]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [214]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [215]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [216]:
obj = pd.Series([4,7,-3,2])

In [217]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [218]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [219]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [231]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1], })
#frame = pd.DataFrame({'a': [0, 1, 0, 1], 'b': [4, 7, -3, 2]})
# I switched the order to prove Lydia's point that the book is pasted at times incorrectly!
# the b column defaults FIRST column unless I put A column first as above.

In [232]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [233]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [234]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [238]:
frame.sort_values(by=['a', 'b'])
#this IS odd, why does it NOT SORT by 'a' column first when I specify?

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [239]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [240]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [241]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [242]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [243]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
       .....:                       'c': [-2, 5, 8, -2.5]})

In [244]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [245]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [246]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [247]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [248]:
obj.index.is_unique

False

In [249]:
obj['a']

a    0
a    1
dtype: int64

In [251]:
obj['c']

4

In [252]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [253]:
df

Unnamed: 0,0,1,2
a,-0.654576,-0.900821,-0.54333
a,-0.749328,-0.582595,-1.421691
b,1.474251,1.276629,-1.469145
b,-1.477647,-0.511586,-0.664665


In [254]:
df.loc['b']

Unnamed: 0,0,1,2
b,1.474251,1.276629,-1.469145
b,-1.477647,-0.511586,-0.664665


In [255]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])

In [256]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [257]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [258]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [259]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [260]:
df.idxmax()

one    b
two    d
dtype: object

In [261]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [262]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [263]:
obj = pd.Series(['a', 'a', 'b', 'c']*4)

In [264]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [274]:
conda install pandas-datareader

Channels:
 - defaults
 - conda-forge
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 24.1.2
    latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /Users/nathan/Library/jupyterlab-desktop/jlab_server

  added / updated specs:
    - pandas-datareader


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.3.11  |       hca03da5_0         128 KB
    certifi-2024.2.2           |  py312hca03da5_0         161 KB
    conda-24.3.0               |  py312hca03da5_0         1.2 MB
    libxslt-1.1.39             |       h223e5b9_0         220 KB  conda-forge
    lxml-5.1.0                 |  py312h9bf3b9e_0         1.2 MB  conda-forge
    pandas-datareader-0.10.0   |     pyhd3eb1b0_0          71 KB
    -------------

In [276]:
import pandas_datareader.data as web

In [279]:
import datetime as dt

In [280]:
pip install "yfinance[optional]"

Collecting yfinance[optional]
  Downloading yfinance-0.2.37-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance[optional])
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting appdirs>=1.4.4 (from yfinance[optional])
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting frozendict>=2.3.4 (from yfinance[optional])
  Using cached frozendict-2.4.1-cp312-cp312-macosx_10_9_universal2.whl
Collecting peewee>=3.16.2 (from yfinance[optional])
  Using cached peewee-3.17.1-py3-none-any.whl
Collecting html5lib>=1.1 (from yfinance[optional])
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.2/112.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Downloading yfinance-0

In [281]:
import yfinance as yf

In [282]:
tickers = yf.Tickers('msft ibm aapl goog')

In [284]:
tickers.tickers['MSFT'].info

{'address1': 'One Microsoft Way',
 'city': 'Redmond',
 'state': 'WA',
 'zip': '98052-6399',
 'country': 'United States',
 'phone': '425 882 8080',
 'website': 'https://www.microsoft.com',
 'industry': 'Software - Infrastructure',
 'industryKey': 'software-infrastructure',
 'industryDisp': 'Software - Infrastructure',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Microsoft Corporation develops and supports software, services, devices and solutions worldwide. The Productivity and Business Processes segment offers office, exchange, SharePoint, Microsoft Teams, office 365 Security and Compliance, Microsoft viva, and Microsoft 365 copilot; and office consumer services, such as Microsoft 365 consumer subscriptions, Office licensed on-premises, and other office services. This segment also provides LinkedIn; and dynamics business solutions, including Dynamics 365, a set of intelligent, cloud-based applications across ERP, CRM, power 

In [285]:
tickers.tickers['AAPL'].history(period="1mo")

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-03-04 00:00:00-05:00,176.149994,176.899994,173.789993,175.100006,81510100,0.0,0.0
2024-03-05 00:00:00-05:00,170.759995,172.039993,169.619995,170.119995,95132400,0.0,0.0
2024-03-06 00:00:00-05:00,171.059998,171.240005,168.679993,169.119995,68587700,0.0,0.0
2024-03-07 00:00:00-05:00,169.149994,170.729996,168.490005,169.0,71765100,0.0,0.0
2024-03-08 00:00:00-05:00,169.0,173.699997,168.940002,170.729996,76114600,0.0,0.0
2024-03-11 00:00:00-04:00,172.940002,174.380005,172.050003,172.75,60139500,0.0,0.0
2024-03-12 00:00:00-04:00,173.149994,174.029999,171.009995,173.229996,59825400,0.0,0.0
2024-03-13 00:00:00-04:00,172.770004,173.190002,170.759995,171.130005,52488700,0.0,0.0
2024-03-14 00:00:00-04:00,172.910004,174.309998,172.050003,173.0,72913500,0.0,0.0
2024-03-15 00:00:00-04:00,171.169998,172.619995,170.289993,172.619995,121664700,0.0,0.0


In [286]:
tickers.tickers['GOOG'].actions

Unnamed: 0_level_0,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-03-27 00:00:00-04:00,0.0,2.002
2015-04-27 00:00:00-04:00,0.0,1.002746
2022-07-18 00:00:00-04:00,0.0,20.0


In [291]:
tickers.tickers['MSFT'].info
tickers.tickers['AAPL'].history(period="1mo")


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-03-04 00:00:00-05:00,176.149994,176.899994,173.789993,175.100006,81510100,0.0,0.0
2024-03-05 00:00:00-05:00,170.759995,172.039993,169.619995,170.119995,95132400,0.0,0.0
2024-03-06 00:00:00-05:00,171.059998,171.240005,168.679993,169.119995,68587700,0.0,0.0
2024-03-07 00:00:00-05:00,169.149994,170.729996,168.490005,169.0,71765100,0.0,0.0
2024-03-08 00:00:00-05:00,169.0,173.699997,168.940002,170.729996,76114600,0.0,0.0
2024-03-11 00:00:00-04:00,172.940002,174.380005,172.050003,172.75,60139500,0.0,0.0
2024-03-12 00:00:00-04:00,173.149994,174.029999,171.009995,173.229996,59825400,0.0,0.0
2024-03-13 00:00:00-04:00,172.770004,173.190002,170.759995,171.130005,52488700,0.0,0.0
2024-03-14 00:00:00-04:00,172.910004,174.309998,172.050003,173.0,72913500,0.0,0.0
2024-03-15 00:00:00-04:00,171.169998,172.619995,170.289993,172.619995,121664700,0.0,0.0


In [300]:
all_data = 
for ticker in {[ticker: web.get_data_yahoo(ticker),
                tickers.tickers['MSFT'].info,
                tickers.tickers['AAPL'].history(period="1mo"),
                tickers.tickers['GOOG'].actions]}

SyntaxError: invalid syntax (240807227.py, line 1)

In [310]:
import yfinance as yf
yf.download(['MSFT', 'AAPL', 'GOOG'])

[*********************100%%**********************]  3 of 3 completed


Price,Adj Close,Adj Close,Adj Close,Close,Close,Close,High,High,High,Low,Low,Low,Open,Open,Open,Volume,Volume,Volume
Ticker,AAPL,GOOG,MSFT,AAPL,GOOG,MSFT,AAPL,GOOG,MSFT,AAPL,GOOG,MSFT,AAPL,GOOG,MSFT,AAPL,GOOG,MSFT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1980-12-12,0.099192,,,0.128348,,,0.128906,,,0.128348,,,0.128348,,,469033600,,
1980-12-15,0.094017,,,0.121652,,,0.122210,,,0.121652,,,0.122210,,,175884800,,
1980-12-16,0.087117,,,0.112723,,,0.113281,,,0.112723,,,0.113281,,,105728000,,
1980-12-17,0.089273,,,0.115513,,,0.116071,,,0.115513,,,0.115513,,,86441600,,
1980-12-18,0.091861,,,0.118862,,,0.119420,,,0.118862,,,0.118862,,,73449600,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-26,169.710007,151.699997,421.649994,169.710007,151.699997,421.649994,171.419998,153.199997,425.989990,169.580002,151.029999,421.350006,170.000000,151.240005,425.609985,57388400,19312700.0,16725600.0
2024-03-27,173.309998,151.940002,421.429993,173.309998,151.940002,421.429993,173.600006,152.690002,424.450012,170.110001,150.130005,419.010010,170.410004,152.145004,424.440002,60273300,16622000.0,16705000.0
2024-03-28,171.479996,152.259995,420.720001,171.479996,152.259995,420.720001,172.229996,152.669998,421.869995,170.509995,151.330002,419.119995,171.750000,152.000000,420.959991,65672700,21105600.0,21871200.0
2024-04-01,170.029999,156.500000,424.570007,170.029999,156.500000,424.570007,171.250000,157.000000,427.890015,169.479996,151.649994,422.220001,171.190002,151.830002,423.950012,46240500,24469800.0,16316000.0


In [311]:
all_data = {ticker: web.get_data_yahoo(ticker)
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

TypeError: string indices must be integers, not 'str'