In [12]:
import pandas as pd
import numpy as np

# 5.1 Introduction to pandas Data Structures

## Series

- A one-dimensional array-like object
- Contains a sequence of values of the same type
- And an associated array of data labels, called the index

In [2]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
# only the array
obj.array

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [4]:
# or the index
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# can also specify index explicitly
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [7]:
# dictionary-like accessing of values
obj2["a"]

np.int64(-5)

In [8]:
# assignment through access
obj2["d"] = 6
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [9]:
# access multiple elements
obj2[["c", "a", "d"]]

c    3
a   -5
d    6
dtype: int64

In [10]:
# or using a boolean mask
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [11]:
# numpy-like operations
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [13]:
# note how index-value links are preserved
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [22]:
# A series is like a fixed-size, ordered dict
"b" in obj2

True

In [23]:
"e" in obj2

False

In [15]:
# getting a Series from a dict, keys become the index
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [16]:
# turn back into dict
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [17]:
# Index order depends on dictionary key insertion order
# By passing a custom index, the order can be defined
# - Missing keys in the dict will contain NaN value
# - Keys in the dictionary that are not in the index are ignored
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [18]:
# check for missing values
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [19]:
# check for non-missing values
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [20]:
# also exists as methods of the Series object
obj4.isna()
obj4.notna()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [21]:
# series objects align automatically by index label in arithmetic operations
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [22]:
# series and indexes have a name attribute
obj4.name = "population"
obj4.index.name = "state"
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [23]:
# index can be changed in place
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame

- a rectangular table of data
- contains an ordered, named collection of columns
- each column can have a different type
- sort of like a dictionary of series, that all share the same index

In [24]:
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [26]:
# First 5
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [27]:
# Last 5
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [29]:
# arrange the columns in a specific order
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [42]:
# passing a column that is not in the data will make it appear with missing values
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [43]:
# a column can be retrieved as a series by dict-like notation
frame2["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [44]:
# or as an attribute (only possible when the column name is a valid Python identifier that does not clash with a DataFrame method)
frame2.state

# note how the series has the same index as the dataframe!

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [45]:
# rows can be retrieved by position or name with the iloc and loc attributes
frame2.loc[2]   # by name

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

In [46]:
frame2.iloc[2]  # by position

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

In [47]:
# columns can be modified by assignment
frame2["debt"] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [48]:
# assignment is also possible with an array
frame2["debt"] = np.arange(6.)
frame2

# when assigning a list or array, the lengths must match

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [49]:
# when assigning a series, its labels will be aligned automatically
val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
frame2["debt"] = val
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,-1.2
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.5
5,2003,Nevada,3.2,-1.7


In [50]:
# assigning a column that doesn't exist will create a new column
frame2["eastern"] = frame2.state == "Ohio"
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,-1.2,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,-1.5,False
5,2003,Nevada,3.2,-1.7,False


In [51]:
# del can be used to delete columns
del frame2["eastern"]
print(frame2.columns)

frame2

Index(['year', 'state', 'pop', 'debt'], dtype='object')


Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,-1.2
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.5
5,2003,Nevada,3.2,-1.7


In [52]:
# if the column does not exist, a KeyError is raised;
# it is caught and printed here so the notebook still survives Restart & Run All
try:
    del frame2["easter"]    # no column with this name -> KeyError
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'easter'

In [55]:
# DataFrames can also be created using nested dicts
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9}
}

# keys in inner dicts become the index
frame3 = pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [54]:
# dataframes can be transposed
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [56]:
# when passing an explicit index, it overrides the index built from the inner dict keys (rows are aligned to the given labels)
pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [57]:
# dictionaries of series can be used as well
pdata = {
    "Ohio": frame3["Ohio"][:-1],
    "Nevada": frame3["Nevada"][:2]
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [58]:
# the df can have its index and columns named, but not itself
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [59]:
# there is a convenient method to convert the df to a numpy array
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [60]:
# when the df contains several datatypes, the object dtype is used
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, -1.2],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, -1.5],
       [2003, 'Nevada', 3.2, -1.7]], dtype=object)

## Index Objects

In [None]:
# pandas' Index objects are responsible for holding the axis labels
# Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [69]:
index[1:]

Index(['b', 'c'], dtype='object')

In [70]:
# they are immutable: item assignment raises a TypeError
# (caught and printed so the notebook still survives Restart & Run All)
try:
    index[1] = "d"
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index does not support mutable operations

In [None]:
# columns attribute is also an Index object
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [72]:
# they behave like fixed-size sets
"Ohio" in frame3.columns

True

In [73]:
# but they can have duplicate labels
pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [86]:
# typical index operations

index2 = pd.Index(np.arange(3))
index = index2.append([index2 + 3])  # concatenate a copy shifted by 3 -> 0..5
print(index)
print(index2)

# each expression below yields a new object; `index` itself is left untouched
demo_results = (
    index.difference(index2),       # labels only in index
    index.intersection(index2),     # labels in both
    index.union(index2),            # labels in either
    index.isin([2, 3]),             # boolean membership mask
    index.delete(1),                # remove by position
    index.drop([4, 5]),             # remove by label
    index.insert(1, 5),             # insert label 5 at position 1
    index.is_monotonic_increasing,  # each label >= the previous one?
    index.is_unique,                # no duplicate labels?
    index.unique(),                 # distinct labels
)
for demo_result in demo_results:
    print(demo_result)

Index([0, 1, 2, 3, 4, 5], dtype='int64')
Index([0, 1, 2], dtype='int64')
Index([3, 4, 5], dtype='int64')
Index([0, 1, 2], dtype='int64')
Index([0, 1, 2, 3, 4, 5], dtype='int64')
[False False  True  True False False]
Index([0, 2, 3, 4, 5], dtype='int64')
Index([0, 1, 2, 3], dtype='int64')
Index([0, 5, 1, 2, 3, 4, 5], dtype='int64')
True
True
Index([0, 1, 2, 3, 4, 5], dtype='int64')


# Essential Functionality

In [None]:
# Re-indexing: rearranges the data according to a new index
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [None]:
# To better deal with missing data, the reindex method can take a method parameter
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3.reindex(range(6), method="ffill")  # forward-fill: propagate last valid observation forward

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [None]:
# for a dataframe, the columns or rows can be re-indexed independently
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=["a", "c", "d"],
                     columns=["Ohio", "Texas", "California"])
frame.reindex(["a", "b", "c", "d"])  # reindex rows: default is rows

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [93]:
# note how columns are dropped
frame.reindex(columns=["Texas", "Utah", "California"])  # reindex columns

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [94]:
# Dropping entries
frame.drop(["a", "d"])  # drop rows by label

Unnamed: 0,Ohio,Texas,California
c,3,4,5


In [95]:
frame.drop("Texas", axis=1)  # drop columns by label

Unnamed: 0,Ohio,California
a,0,2
c,3,5
d,6,8


In [None]:
# Series indexing
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])

# by index label
obj["c"]

# slicing
obj[2:4]

# by integer position: use iloc, because plain obj[1] on a
# non-integer index relies on a deprecated positional fallback
obj.iloc[1]

# using a list
obj[["b", "a", "d"]]

# boolean indexing (last expression, so this is what the cell displays)
obj[obj < 2]


a    0.0
b    1.0
dtype: float64

In [97]:
# prefer to use loc, though!
obj.loc[["b", "a", "d"]]

b    1.0
a    0.0
d    3.0
dtype: float64

In [98]:
# iloc works with integer positions
obj.iloc[2:4]

c    2.0
d    3.0
dtype: float64

In [99]:
obj.iloc[[1, 0, 2]]

b    1.0
a    0.0
c    2.0
dtype: float64

In [101]:
# you can also assign with indexing
obj.iloc[2] = 99
obj

a     0.0
b     1.0
c    99.0
d     3.0
e     4.0
dtype: float64

In [103]:
obj.loc["d"] = 77
obj

a     0.0
b     1.0
c    99.0
d    77.0
e     4.0
dtype: float64

In [104]:
# dataframes have similar indexing capabilities
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [106]:
data.loc["Ohio"]

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [107]:
data.loc[:, ["two"]]

Unnamed: 0,two
Ohio,1
Colorado,5
Utah,9
New York,13


In [109]:
data.loc[data["three"] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [110]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [111]:
data.iloc[[2, 1]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,4,5,6,7


In [112]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [113]:
# Slices are possible too
data.loc[:"Utah", "two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [114]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [116]:
# boolean Series can be used with loc (and plain []), but not with iloc, which only accepts plain boolean arrays
data.loc[data.three > 2]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [119]:
# some convenient methods

# data[column] pick a column
# data.loc[rows] pick row or rows
# data.loc[:, columns] pick one or more columns
# data.loc[rows, columns] pick one or more rows and columns
# data.iloc[rows] pick row or rows by integer position
# ...
# data.at[row, col] pick a single value
# data.iat[row_pos, col_pos] pick a single value by integer position

In [120]:
# numpy ufuncs also work with pandas objects
# A seeded generator makes the random values identical on every Restart & Run All;
# re-run the cells below to refresh their displayed outputs.
rng = np.random.default_rng(12345)
frame = pd.DataFrame(rng.standard_normal((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,-0.175271,0.61406,-0.551756
Ohio,0.614699,0.588569,1.618503
Texas,0.074966,-0.792843,-1.497117
Oregon,0.413437,1.119069,0.361936


In [121]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.175271,0.61406,0.551756
Ohio,0.614699,0.588569,1.618503
Texas,0.074966,0.792843,1.497117
Oregon,0.413437,1.119069,0.361936


In [122]:
# apply lets you apply an arbitrary function to the data
def f1(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` is whatever ``DataFrame.apply`` hands over (one column or one row),
    and only needs to provide ``max()`` and ``min()``.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

frame.apply(f1)

b    0.789970
d    1.911912
e    3.115620
dtype: float64

In [123]:
# by using the axis parameter, you can apply the function to each row instead of each column
frame.apply(f1, axis="columns")

Utah      1.165816
Ohio      1.029934
Texas     1.572083
Oregon    0.757133
dtype: float64

In [124]:
# the function can also return a series
def f2(x):
    """Return the minimum and maximum of ``x`` as a Series labelled "min" and "max"."""
    extremes = [x.min(), x.max()]
    labels = ["min", "max"]
    return pd.Series(extremes, index=labels)
frame.apply(f2)

Unnamed: 0,b,d,e
min,-0.175271,-0.792843,-1.497117
max,0.614699,1.119069,1.618503


In [126]:
# and an element-wise version exists, too
def my_format(x):
    """Render ``x`` as a string in fixed-point notation with two decimals."""
    return format(x, ".2f")

frame.map(my_format)

Unnamed: 0,b,d,e
Utah,-0.18,0.61,-0.55
Ohio,0.61,0.59,1.62
Texas,0.07,-0.79,-1.5
Oregon,0.41,1.12,0.36


In [127]:
# there is a function for index sorting
obj = pd.Series(np.arange(4), index=["d", "a", "c", "b"])
obj

d    0
a    1
c    2
b    3
dtype: int64

In [128]:
obj.sort_index()

a    1
b    3
c    2
d    0
dtype: int64

In [129]:
# for dataframes, you can sort by index on either axis
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=["three", "one"],
                     columns=["d", "a", "b", "c"])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [130]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [131]:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [132]:
# you can also sort in a decreasing fashion
frame.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [133]:
# a series can be sorted by values
obj = pd.Series([4, 7, -2, 3])
obj.sort_values()

2   -2
3    3
0    4
1    7
dtype: int64

In [None]:
# NaN values get pushed to the end by default
obj = pd.Series([4, np.nan, 7, np.nan, 3])
obj.sort_values()

4    3.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [136]:
# to sort dataframes, the column name can be specified
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
frame.sort_values("b")

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [137]:
# multiple column names can be used as well
frame.sort_values(["a", "b"])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [140]:
# the rank method assigns a rank based on the value
obj = pd.Series([7, -5, 7, 4, 2, 0, 4, 4])
obj.rank()

0    7.5
1    1.0
2    7.5
3    5.0
4    3.0
5    2.0
6    5.0
7    5.0
dtype: float64

In [141]:
# there are also other tie-breaking methods
obj.rank(method="first")

0    7.0
1    1.0
2    8.0
3    4.0
4    3.0
5    2.0
6    5.0
7    6.0
dtype: float64

In [143]:
# in dataframes, the rank can be computed across rows or columns
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
frame.rank(axis="columns")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [145]:
frame.rank(axis="index")

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [146]:
# with duplicate axis labels, some functions do not work anymore,
# but unique labels are not a strict requirement
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [148]:
obj.index.is_unique

False

In [149]:
obj["a"]

a    0
a    1
dtype: int64

# Summarizing and Computing Descriptive Statistics

In [151]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], 
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [152]:
# methods usually have built-in handling of missing values
df.sum()

one    9.25
two   -5.80
dtype: float64

In [153]:
# passing axis="columns" works as well
df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [154]:
# in some cases, at least one non-NA value is required
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [155]:
# idxmax returns the index label at which the maximum value occurs (per column)
df.idxmax()

one    b
two    d
dtype: object

In [156]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [157]:
# describe produces multiple statistics at the same time
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [158]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [159]:
# also works on non-numeric data
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [162]:
# Load the price and volume data used by the return/correlation cells below.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source. The paths are relative to the current working directory.
price = pd.read_pickle("examples/yahoo_price.pkl")
volume = pd.read_pickle("examples/yahoo_volume.pkl")

In [163]:
# returns are computed from pairs of data points
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [164]:
# corr method computes the correlation of two series
returns["MSFT"].corr(returns["IBM"])

np.float64(0.4997636114415114)

In [165]:
# on the dataframe, the method computes a full correlation matrix
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [166]:
# also for covariance
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [167]:
# with corrwith, pairwise correlations can be computed
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

In [168]:
# unique returns the unique values in the series, in order of first appearance
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [169]:
# value counts returns the count of each unique value
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [170]:
# with isin(), we can check whether values are in some set
mask = obj.isin(["b", "c"])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [171]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object