In [4]:
# --- Imports (everything this notebook needs) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Reproducibility ---
# Seed NumPy's global random generator so random results repeat between runs.
np.random.seed(12345)

# --- Display configuration ---
# Default size applied to every new matplotlib figure.
plt.rcParams["figure.figsize"] = (10, 6)

# Remember the current row-display limit before lowering it to 20 rows,
# so it can be restored later.
PREVIOUS_MAX_ROWS = pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 20)

# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

In [5]:
# Column-oriented input: every key becomes a DataFrame column, and the
# equal-length lists supply that column's values row by row.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
# No index is given, so the rows are labelled 0..5.
frame = pd.DataFrame(data)

In [7]:
# Preview the top of `frame`; with no argument head() shows the first five rows.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [16]:
# Move the 'state' column into the row index. set_index() returns a new frame,
# so `frame` keeps its original 0..5 labels. Despite the variable name, this is
# set_index(), not reindex().
# NOTE(review): the saved output contains a second, Nevada-only table that no
# statement in this cell produces; it looks stale. Re-run to confirm.
frame_reindex = frame.set_index(keys="state")
frame_reindex

Unnamed: 0_level_0,year,pop
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Ohio,2000,1.5
Ohio,2001,1.7
Ohio,2002,3.6
Nevada,2001,2.4
Nevada,2002,2.9
Nevada,2003,3.2


Unnamed: 0_level_0,year,pop
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Nevada,2001,2.4
Nevada,2002,2.9
Nevada,2003,3.2


In [22]:
# Build a second frame from the same dict, this time with an explicit column
# order and string row labels. 'debt' is not a key of `data`, so that column
# is created but holds only missing values.
frame2 = pd.DataFrame(
    data,
    index='one two three four five six'.split(),
    columns='year state pop debt'.split(),
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [27]:
# Replace the empty 'debt' column with the floats 0.0 .. 5.0, one per row.
frame2["debt"] = np.arange(6, dtype=float)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [28]:
# A Series that carries values for only three of frame2's six row labels.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
print(val)
# Assignment aligns on index labels: rows missing from `val` become NaN.
frame2['debt'] = val
frame2

two    -1.2
four   -1.5
five   -1.7
dtype: float64


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [47]:
# Row labels to request; "c" is not one of frame2's row labels.
idx = ["one", "two", "c"]
# Plain-text preview of the first five rows.
print(frame2.head())
# NOTE(review): the result of this reindex is neither assigned nor the last
# expression in the cell, so it is discarded and nothing is displayed for it.
frame2.reindex(idx, columns = ["year", "state"])
# drop() returns a copy without row "three"; frame2 itself is left unchanged.
frame2.drop("three")

       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [51]:
# Return a copy of frame2 without the 'state' column (frame2 is not modified).
frame2.drop(columns="state")

Unnamed: 0,year,pop,debt
one,2000,1.5,
two,2001,1.7,-1.2
three,2002,3.6,
four,2001,2.4,-1.5
five,2002,2.9,-1.7
six,2003,3.2,


In [68]:
# Re-display frame2: the earlier drop() calls returned new frames, so every
# row and column is still present here.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [73]:
# Overwrite one cell in place: row label "five", column "year".
frame2.at["five", "year"] = 2003

In [74]:
# Show frame2 after the preceding cell's assignment: row "five" now has year 2003.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2003,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [75]:
# Positional selection: all rows, first three columns (year, state, pop).
frame2.iloc[:, :3]

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2003,Nevada,2.9
six,2003,Nevada,3.2


In [76]:
# Two float frames with partly overlapping labels, used by the arithmetic
# examples: df1 is 3x4 (columns a-d), df2 is 4x5 (columns a-e).
df1 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(20, dtype=float).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)
# Punch a single missing value into df2 (row 1, column 'b').
df2.loc[1, 'b'] = np.nan
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [77]:
# Display df2; the value at row 1, column 'b' was set to NaN when it was built.
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [78]:
# Label-aligned addition: any row or column present in only one frame,
# and any cell that is NaN in either frame, comes out as NaN.
df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [79]:
# Same aligned addition, but a value missing on one side is treated as 0,
# so the other frame's value passes through instead of producing NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [81]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    return x.max() - x.min()


# Apply f once per column of df2.
# NOTE(review): the saved output lists only columns a-d with a spread of 8.0,
# which matches df1 rather than the five-column df2. It is probably stale;
# re-run to confirm.
df2.apply(f)

a    8.0
b    8.0
c    8.0
d    8.0
dtype: float64

In [120]:
# Apply f (defined in an earlier cell) across the columns, giving one result per row.
df2.apply(f, axis=1)

0    4.0
1    4.0
2    4.0
3    4.0
dtype: float64

In [92]:
# NOTE: this name shadows Python's built-in format(). It is kept because a
# later cell calls the formatter by this name.
def format(x):
    """Render a number as a string with exactly five decimal places."""
    return '%.5f' % x


# Element-wise application: every cell of df1 is passed through format().
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of
# DataFrame.map; left as-is to stay compatible with older pandas.
df1.applymap(format)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [93]:
# Series.map applies the five-decimal formatter (defined in an earlier cell)
# to each element of column 'd', yielding strings (dtype object).
df1["d"].map(format)

0     3.00000
1     7.00000
2    11.00000
Name: d, dtype: object

In [96]:
# Integers 0..3 under labels that are deliberately out of alphabetical order.
obj = pd.Series(range(4), index=list('dabc'))
# sort_index() returns a new Series ordered by label (lexicographically);
# `obj` itself keeps its original order.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [98]:
# Order rows by column 'a', breaking ties with column 'b', both descending.
df1.sort_values(by=["a", "b"], ascending=[False, False])

Unnamed: 0,a,b,c,d
2,8.0,9.0,10.0,11.0
1,4.0,5.0,6.0,7.0
0,0.0,1.0,2.0,3.0


In [105]:
# One-column frame with repeated values (7 and 4 each appear twice), which
# makes the tie-handling of the ranking methods visible.
obj = pd.DataFrame({"Value": [7, -5, 7, 4, 2, 0, 4]})
obj

Unnamed: 0,Value
0,7
1,-5
2,7
3,4
4,2
5,0
6,4


In [121]:
# Add one column per tie-breaking strategy, all ranking the same 'Value' data:
#   average - tied values share the mean of the ranks they span
#   first   - tied values are ranked in order of appearance
#   max     - tied values all receive the highest rank of the group
obj["Rank_average"] = obj["Value"].rank(method="average")
obj["Rank_first"] = obj["Value"].rank(method="first")
obj["Rank_max"] = obj["Value"].rank(method="max")
obj

Unnamed: 0,Value,Rank_average,Rank_first,Rank_max
0,7,6.5,6.0,7.0
1,-5,1.0,1.0,1.0
2,7,6.5,7.0,7.0
3,4,4.5,4.0,5.0
4,2,3.0,3.0,3.0
5,0,2.0,2.0,2.0
6,4,4.5,5.0,5.0


In [122]:
# True when no row label occurs more than once in the index.
obj.index.is_unique

True

In [123]:
# Running total down each column, one result row per input row.
obj.cumsum()

Unnamed: 0,Value,Rank_average,Rank_first,Rank_max
0,7,6.5,6.0,7.0
1,2,7.5,7.0,8.0
2,9,14.0,14.0,15.0
3,13,18.5,18.0,20.0
4,15,21.5,21.0,23.0
5,15,23.5,23.0,25.0
6,19,28.0,28.0,30.0


In [124]:
# For each column, the index label where its maximum first occurs.
obj.idxmax()

Value           0
Rank_average    0
Rank_first      2
Rank_max        0
dtype: int64

In [125]:
# Correlation between the raw values and their average-method ranks.
obj["Value"].corr(obj["Rank_average"])

0.9655206380201663

In [126]:
# Correlate obj with df1 column by column, matching columns by name.
# NOTE(review): obj (Value, Rank_*) and df1 (a-d) share no column names, so
# every entry in the result is NaN.
obj.corrwith(df1)

Rank_average   NaN
Rank_first     NaN
Rank_max       NaN
Value          NaN
a              NaN
b              NaN
c              NaN
d              NaN
dtype: float64

In [128]:
# Nine single-letter strings with repeats, used by the unique / value_counts /
# isin examples. This rebinds `obj`, replacing whatever it held before.
obj = pd.Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [129]:
# Distinct values of obj as an array, in order of first appearance (not sorted).
uniques = obj.unique()   # gives you an array of the unique values in a Series
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [130]:
# How often each distinct value occurs, most frequent first.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [131]:
# Count occurrences in the raw value array without ordering the result by
# frequency (sort=False).
# The top-level pd.value_counts() helper is deprecated in recent pandas, so
# the array is wrapped in a Series and the Series method is used instead.
# The resulting counts are the same.
pd.Series(obj.values).value_counts(sort=False)

c    3
a    3
d    1
b    2
dtype: int64

In [132]:
# Boolean Series: True where the element of obj is 'b' or 'c', False elsewhere.
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [133]:
# Values to look up, which may repeat.
to_match = pd.Series(list('cabbca'))
# The distinct values, in the order that defines their integer positions.
unique_vals = pd.Series(list('cba'))
# For every element of to_match, return its position within unique_vals.
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [134]:
# Three columns (Qu1-Qu3) of small integers, five rows each.
# NOTE: this rebinds `data`, which an earlier cell used for a plain dict.
data = pd.DataFrame(dict(
    Qu1=[1, 3, 4, 3, 4],
    Qu2=[2, 3, 1, 2, 3],
    Qu3=[1, 5, 2, 4, 4],
))
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [146]:
# Tabulate, per column, how many times each value occurs. The row labels of
# the result are the distinct values found across all columns.
# pd.Series.value_counts is passed instead of the deprecated top-level
# pd.value_counts; apply() hands each column to it as a Series, so the counts
# are the same.
# A value that never occurs in a column would be NaN there; fillna(0) turns
# those into a count of 0.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
