In [2]:
import numpy as np
import pandas as pd

In [3]:
s = pd.Series(np.random.randn(6))

s[::2] = np.nan

s.fillna(s.mean())

0    0.756022
1    1.683747
2    0.756022
3    0.866740
4    0.756022
5   -0.282419
dtype: float64

In [4]:
states = ['ohio', 'newyork', 'vermont', 'florida',
          'oregon', 'nevada', 'california', 'idaho']

In [5]:
group_key = ['East'] * 4 + ['West'] * 4

In [6]:
data = pd.Series(np.random.randn(8), index=states)

In [7]:
data

ohio         -0.399544
newyork      -0.392278
vermont      -0.418838
florida       1.325358
oregon       -0.674639
nevada       -0.557969
california   -0.293272
idaho        -0.737841
dtype: float64

In [8]:
data[['vermont', 'nevada', 'idaho']] = np.nan

In [9]:
data

ohio         -0.399544
newyork      -0.392278
vermont            NaN
florida       1.325358
oregon       -0.674639
nevada             NaN
california   -0.293272
idaho              NaN
dtype: float64

In [10]:
data.groupby(group_key).mean()

East    0.177845
West   -0.483955
dtype: float64

In [11]:
fill_mean = lambda g: g.fillna(g.mean())

In [12]:
data.groupby(group_key).apply(fill_mean)

ohio         -0.399544
newyork      -0.392278
vermont       0.177845
florida       1.325358
oregon       -0.674639
nevada       -0.483955
california   -0.293272
idaho        -0.483955
dtype: float64

In [13]:
fill_values = {'East': 0.5, 'West': -1}

In [14]:
fill_func = lambda g: g.fillna(fill_values[g.name])

In [15]:
data.groupby(group_key).apply(fill_func)

ohio         -0.399544
newyork      -0.392278
vermont       0.500000
florida       1.325358
oregon       -0.674639
nevada       -1.000000
california   -0.293272
idaho        -1.000000
dtype: float64

In [22]:
suitu = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in ['H', 'S', 'C', 'D']:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)

In [23]:
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [24]:
def draw(deck, n=5):
    return deck.sample(n)

In [25]:
draw(deck)

4D     4
4S     4
5S     5
QH    10
AD     1
dtype: int64

In [26]:
get_suit = lambda card: card[-1]

In [27]:
deck.groupby(get_suit).apply(draw, n=2)

C  5C     5
   7C     7
D  6D     6
   QD    10
H  6H     6
   QH    10
S  5S     5
   KS    10
dtype: int64

In [29]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                  'data':np.random.randn(8),
                  'weights': np.random.rand(8)})

In [30]:
df

Unnamed: 0,category,data,weights
0,a,-1.379419,0.78961
1,a,0.701816,0.934656
2,a,-0.42872,0.746696
3,a,0.865673,0.337721
4,b,-0.145078,0.344433
5,b,0.055554,0.674419
6,b,-0.855879,0.770023
7,b,0.830858,0.101442


In [31]:
grouped = df.groupby('category')

In [32]:
get_wevg = lambda g: np.average(g['data'], weights=g['weights'])

In [34]:
grouped.apply(get_wevg)

category
a   -0.164139
b   -0.310671
dtype: float64

In [37]:
close_px = pd.read_csv('C:/Users/trivial system/pydata-book-2nd-edition/examples/stock_px_2.csv', parse_dates=True,index_col=0)

In [38]:
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [39]:
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [40]:
spx_corr = lambda x: x.corrwith(x['SPX'])

In [43]:
rets = close_px.pct_change().dropna()

In [44]:
get_year = lambda x: x.year

In [45]:
by_year = rets.groupby(get_year)

In [46]:
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [47]:
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [51]:
import statsmodels.api as sm
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    result = sm.OLS(X, Y).fit()
    return result.params

In [52]:
by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,Unnamed: 1,0,1
2003,AAPL,0.246326,3.175114
2004,AAPL,0.101922,7.04035
2005,AAPL,0.122066,5.732395
2006,AAPL,0.112147,1.61661
2007,AAPL,0.211554,6.365622
2008,AAPL,0.48032,-1.965627
2009,AAPL,0.559087,8.135493
2010,AAPL,0.4769,6.396784
2011,AAPL,0.588582,5.437102
