In [None]:
import numpy as np
import pandas as pd
# Remember the current row limit so it can be restored at the end of the notebook.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Keep rendered frames compact: cap displayed columns, rows and cell text width.
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
# Seed NumPy's global RNG so the randomly generated example data is reproducible.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size for any figure created later.
plt.rc("figure", figsize=(10, 6))
# Print arrays with 4 decimal places and without scientific notation for small values.
np.set_printoptions(precision=4, suppress=True)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Example frame for the groupby demos. Both key columns deliberately contain
# missing values (None in key1, a null in the nullable-integer key2) so the
# effect of missing group keys can be shown; data1/data2 are random normals.
df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
                   "key2" : pd.Series([1, 2, 1, 2, 1, None, 1],
                                      dtype="Int64"),
                   "data1" : np.random.standard_normal(7),
                   "data2" : np.random.standard_normal(7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.204708,0.281746
1,a,2.0,0.478943,0.769023
2,,1.0,-0.519439,1.246435
3,b,2.0,-0.55573,1.007189
4,b,1.0,1.965781,-1.296221
5,a,,1.393406,0.274992
6,,1.0,0.092908,0.228913


Suppose you wanted to compute the mean of the data1 column using the labels from
 key1.

In [None]:
grouped = df[["key1","data1"]].groupby(df["key1"],dropna=False)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x79479169d4e0>

In [None]:
grouped = df[["key1", "key2", "data1"]].groupby(["key1", "key2"])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7947d0d306a0>

In [None]:
grouped = df.groupby(["key1", "key2"])[["key1", "key2", "data1"]]
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x79478b0396f0>

In [None]:
df2 = df[["key1", "key2", "data1"]] # sub set

grouped = df2.groupby(["key1", "key2"]) # group by
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x79478b039780>

In [None]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each in turn.
for group_key, group_frame in grouped:
    print(group_key)
    print(group_frame)

('a', 1)
  key1  key2     data1
0    a     1 -0.204708
('a', 2)
  key1  key2     data1
1    a     2  0.478943
('b', 1)
  key1  key2     data1
4    b     1  1.965781
('b', 2)
  key1  key2    data1
3    b     2 -0.55573


 This grouped variable is now a special “GroupBy” object. It has not actually computed
 anything yet except for some intermediate data about the group key df["key1"].
 The idea is that this object has all of the information needed to then apply some
 operation to each of the groups. For example, to compute group means we can call
 the GroupBy’s mean method:

In [None]:
# `grouped` was rebound to a two-key grouping of df2 in the cells above, so
# rebuild the single-key grouping of data1 by key1 that the surrounding text
# describes. dropna=False keeps the rows whose key1 is missing as their own group.
grouped = df["data1"].groupby(df["key1"], dropna=False)
grouped.mean()

Unnamed: 0_level_0,data1
key1,Unnamed: 1_level_1
a,0.555881
b,0.705025
,-0.213265


 If instead we had passed multiple arrays as a list, we’d get something different:

In [None]:
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
# means = df["data1"].groupby(["key1","key2"]).mean()
means

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,1,-0.204708
a,2,0.478943
b,1,1.965781
b,2,-0.55573


 Here we grouped the data using two keys, and the resulting Series now has a
 hierarchical index consisting of the unique pairs of keys observed:

In [None]:
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.204708,0.478943
b,1.965781,-0.55573


In this example, the group keys are all Series, though they could be any arrays of the
 right length:

In [None]:
states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]
# df["data1"].groupby([states, years]).mean()

In [None]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.204708,0.281746
1,a,2.0,0.478943,0.769023
2,,1.0,-0.519439,1.246435
3,b,2.0,-0.55573,1.007189
4,b,1.0,1.965781,-1.296221
5,a,,1.393406,0.274992
6,,1.0,0.092908,0.228913


In [None]:
df["data1"].groupby([1,1,1,1,1,1,2]).mean()

Unnamed: 0,data1
1,0.426376
2,0.092908


In [None]:
df["data1"].groupby([states, years]).mean()

Unnamed: 0,Unnamed: 1,data1
CA,2005,0.936175
CA,2006,-0.519439
OH,2005,-0.380219
OH,2006,1.029344


 Frequently, the grouping information is found in the same DataFrame as the data you
 want to work on. In that case, you can pass column names (whether those are strings,
 numbers, or other Python objects) as the group keys:

In [None]:
# df.groupby("key1").mean()
# df.groupby("key2").mean(numeric_only=True)
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.204708,0.281746
a,2,0.478943,0.769023
b,1,1.965781,-1.296221
b,2,-0.55573,1.007189


 Regardless of the objective in using groupby, a generally useful GroupBy method is
 size, which returns a Series containing group sizes:

In [None]:
df.groupby(["key1", "key2"]).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
key1,key2,Unnamed: 2_level_1
a,1,1
a,2,1
b,1,1
b,2,1


 Note that any missing values in a group key are excluded from the result by default.
 This behavior can be disabled by passing dropna=False to groupby:

In [None]:
df.groupby("key1", dropna=False).size()
df.groupby(["key1", "key2"], dropna=False).size()

 A group function similar in spirit to size is count, which computes the number of
 nonnull values in each group:

In [None]:
df.groupby("key1").count()

 # Iterating over Groups
 The object returned by groupby supports iteration, generating a sequence of 2-tuples
 containing the group name along with the chunk of data. Consider the following:

In [None]:
for name, group in df.groupby("key1", dropna= False):
    print(name)
    print(group)


a
  key1  key2     data1     data2
0    a     1 -0.204708  0.281746
1    a     2  0.478943  0.769023
5    a  <NA>  1.393406  0.274992
b
  key1  key2     data1     data2
3    b     2 -0.555730  1.007189
4    b     1  1.965781 -1.296221
nan
   key1  key2     data1     data2
2  None     1 -0.519439  1.246435
6  None     1  0.092908  0.228913


 In the case of multiple keys, the first element in the tuple will be a tuple of key values:

In [None]:
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)


 Of course, you can choose to do whatever you want with the pieces of data. A recipe
 you may find useful is computing a dictionary of the data pieces as a one-liner:

In [None]:
pieces = {name: group for name, group in df.groupby("key1")}
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,-0.55573,1.007189
4,b,1,1.965781,-1.296221


 By default groupby groups on axis="index", but you can group on any of the other
 axes. For example, we could group the columns of our example df here by whether
 they start with "key" or "data":

In [None]:
# Map each column label to a group name ("key" or "data") and group along the
# column axis, so each group is a block of columns rather than a set of rows.
# NOTE(review): the saved output of this cell is a warning pointing at this
# call; recent pandas deprecates groupby's `axis` argument — confirm, and
# consider grouping the transposed frame (df.T.groupby(...)) instead.
grouped = df.groupby({"key1": "key", "key2": "key",
                      "data1": "data", "data2": "data"}, axis="columns")

  grouped = df.groupby({"key1": "key", "key2": "key",


 We can print out the groups like so:

In [None]:
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)


 # Selecting a Column or Subset of Columns
 Indexing a GroupBy object created from a DataFrame with a column name or array
 of column names has the effect of column subsetting for aggregation. This means
 that:



```
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]
```




 are conveniences for:
```
 df["data1"].groupby(df["key1"])
 df[["data2"]].groupby(df["key1"])
 ```

 Especially for large datasets, it may be desirable to aggregate only a few columns. For
 example, in the preceding dataset, to compute the means for just the data2 column
 and get the result as a DataFrame, we could write:

In [None]:
df.groupby(["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.281746
a,2,0.769023
b,1,-1.296221
b,2,1.007189


 The object returned by this indexing operation is a grouped DataFrame if a list or
 array is passed, or a grouped Series if only a single column name is passed as a scalar:

In [None]:
s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped
s_grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.281746
a,2,0.769023
b,1,-1.296221
b,2,1.007189


 # Grouping with Dictionaries and Series
 Grouping information may exist in a form other than an array. Let’s consider another
 example DataFrame:

In [None]:
people = pd.DataFrame(np.random.standard_normal((5, 5)),
                      columns=["a", "b", "c", "d", "e"],
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,1.352917,0.886429,-2.001637,-0.371843,1.669025
Steve,-0.43857,-0.539741,0.476985,3.248944,-1.021228
Wanda,-0.577087,,,0.523772,0.00094
Jill,1.34381,-0.713544,-0.831154,-2.370232,-1.860761
Trey,-0.860757,0.560145,-1.265934,0.119827,-1.063512


 Now, suppose I have a group correspondence for the columns and want to sum the
 columns by group:

In [None]:
mapping = {"a": "red", "b": "red", "c": "blue",
           "d": "blue", "e": "red", "f" : "orange"}

 Now, you could construct an array from this dictionary to pass to groupby, but
 instead we can just pass the dictionary (I included the key "f" to highlight that
 unused grouping keys are OK):

In [None]:
# Sum the columns by color group. groupby(..., axis="columns") is deprecated in
# recent pandas, so group the rows of the transposed frame instead and
# transpose the aggregated result back to the original orientation.
# Labels in `mapping` that are not columns of `people` (here "f") are ignored.
by_column = people.T.groupby(mapping)
by_column.sum().T

  by_column = people.groupby(mapping, axis="columns")


Unnamed: 0,blue,red
Joe,-2.37348,3.908371
Steve,3.725929,-1.999539
Wanda,0.523772,-0.576147
Jill,-3.201385,-1.230495
Trey,-1.146107,-1.364125


 The same functionality holds for Series, which can be viewed as a fixed-size mapping:

In [None]:
# A Series works as a group mapping just like the dict it was built from.
map_series = pd.Series(mapping)
print(map_series)
# Count non-null values per color group for each person. Group the transposed
# frame (groupby's axis="columns" is deprecated in recent pandas) and transpose
# the counts back so people stay on the rows.
people.T.groupby(map_series).count().T

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object


 # Grouping with Functions
 Using Python functions is a more generic way of defining a group mapping compared
 with a dictionary or Series. Any function passed as a group key will be called once
 per index value (or once per column value if using axis="columns"), with the
 return values being used as the group names. More concretely, consider the example
 DataFrame from the previous section, which has people’s first names as index values.
 Suppose you wanted to group by name length. While you could compute an array of
 string lengths, it’s simpler to just pass the len function:

In [None]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.352917,0.886429,-2.001637,-0.371843,1.669025
4,0.483052,-0.153399,-2.097088,-2.250405,-2.924273
5,-1.015657,-0.539741,0.476985,3.772716,-1.020287


 Mixing functions with arrays, dictionaries, or Series is not a problem, as everything
 gets converted to arrays internally:

In [None]:
key_list = ["one", "one", "one", "two", "two"]
people.groupby([len, key_list]).min()

 # Grouping by Index Levels
 A final convenience for hierarchically indexed datasets is the ability to aggregate
 using one of the levels of an axis index. Let’s look at an example:

In [None]:
columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                    [1, 3, 5, 1, 3]],
                                    names=["cty", "tenor"])
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.332883,-2.359419,-0.199543,-1.541996,-0.970736
1,-1.30703,0.28635,0.377984,-0.753887,0.331286
2,1.349742,0.069877,0.246674,-0.011862,1.004812
3,1.327195,-0.919262,-1.549106,0.022185,0.758363


In [None]:
# Count values per country by grouping on the "cty" level of the column index.
# groupby's axis="columns" is deprecated in recent pandas, so group that level
# on the transposed frame and transpose the result back.
hier_df.T.groupby(level="cty").count().T

  hier_df.groupby(level="cty", axis="columns").count()


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


 # 10.2 Data Aggregation
 Aggregations refer to any data transformation that produces scalar values from arrays.
 The preceding examples have used several of them, including mean, count, min, and
 sum. You may wonder what is going on when you invoke mean() on a GroupBy
 object.

 You can use aggregations of your own devising and additionally call any method
 that is also defined on the object being grouped. For example, the nsmallest
 Series method selects the smallest requested number of values from the data.
 While nsmallest is not explicitly implemented for GroupBy, we can still use it
 with a nonoptimized implementation. Internally, GroupBy slices up the Series, calls
 piece.nsmallest(n) for each piece, and then assembles those results into the result
 object:

In [None]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.204708,0.281746
1,a,2.0,0.478943,0.769023
2,,1.0,-0.519439,1.246435
3,b,2.0,-0.55573,1.007189
4,b,1.0,1.965781,-1.296221
5,a,,1.393406,0.274992
6,,1.0,0.092908,0.228913


In [None]:
# nsmallest is a Series method, so select a single column from the grouped
# frame first (a grouped DataFrame does not provide nsmallest). The result
# holds the two smallest data1 values within each key1 group.
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x79478b3c2470>

 To use your own aggregation functions, pass any function that aggregates an array to
 the aggregate method or its short alias agg:

In [None]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1.598113,0.494031
b,1,2.521511,2.30341


 You may notice that some methods, like describe, also work, even though they are
 not aggregations, strictly speaking:

In [None]:
grouped.describe()

Unnamed: 0_level_0,key2,key2,key2,key2,key2,key2,key2,key2,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,3.0,0.555881,...,0.936175,1.393406,3.0,0.44192,0.283299,0.274992,0.278369,0.281746,0.525384,0.769023
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,0.705025,...,1.335403,1.965781,2.0,-0.144516,1.628757,-1.296221,-0.720368,-0.144516,0.431337,1.007189


In [None]:
# Colab-only: mount Google Drive so files stored there can be read from the
# notebook; force_remount=True remounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Drive folder holding this notebook's data files; file paths are built from it.
google_drive_path_header = '/content/drive/MyDrive/analytics_programming'

Mounted at /content/drive


In [None]:
path = google_drive_path_header + '/tips.csv'
tips = pd.read_csv(path)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [None]:
tips["tip_pct"] = tips["tip"] / tips["total_bill"]
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [None]:
grouped = tips.groupby(["day", "smoker"])

In [None]:
grouped_pct = grouped["tip_pct"]


In [None]:
grouped_pct.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct
day,smoker,Unnamed: 2_level_1
Fri,No,0.15165
Fri,Yes,0.174783
Sat,No,0.158048
Sat,Yes,0.147906
Sun,No,0.160113
Sun,Yes,0.18725
Thur,No,0.160298
Thur,Yes,0.163863


In [None]:
grouped_pct.agg(["mean","max",peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,max,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.187735,0.067349
Fri,Yes,0.174783,0.26348,0.159925
Sat,No,0.158048,0.29199,0.235193
Sat,Yes,0.147906,0.325733,0.290095
Sun,No,0.160113,0.252672,0.193226
Sun,Yes,0.18725,0.710345,0.644685
Thur,No,0.160298,0.266312,0.19335
Thur,Yes,0.163863,0.241255,0.15124


In [None]:
grouped_pct.agg(["mean", "std", ("range of tip",peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,range of tip
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [None]:
# (name, function) tuples set the result column names. Use the string alias
# "std" rather than the NumPy callable np.std: passing NumPy callables to agg
# is deprecated in recent pandas, and the alias keeps pandas' own sample
# standard deviation.
grouped_pct.agg([("average", "mean"), ("stdev", "std")])

  grouped_pct.agg([("average", "mean"), ("stdev", np.std)])


Unnamed: 0_level_0,Unnamed: 1_level_0,average,stdev
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [None]:
functions = ["count", "mean", "max"]
result = grouped[["tip_pct", "total_bill"]].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [None]:
result["tip_pct"]

In [None]:
# (name, function) pairs applied to every selected column. The string alias
# "var" replaces the NumPy callable np.var, which agg accepts only with a
# deprecation warning in recent pandas.
ftuples = [("Average", "mean"), ("Variance", "var")]
grouped[["tip_pct", "total_bill"]].agg(ftuples)

In [None]:
# A dict maps each column to the aggregation(s) to apply to it. Use the string
# alias "max" instead of the NumPy callable np.max, which agg accepts only with
# a deprecation warning in recent pandas.
grouped.agg({"tip" : "max", "size" : "sum"})
# A list of functions for one column produces hierarchical result columns.
grouped.agg({"tip_pct" : ["min", "max", "mean", "std"],
             "size" : "sum"})

In [None]:
grouped = tips.groupby(["day", "smoker"], as_index=False)
grouped.mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size
0,Fri,No,18.42,2.8125,2.25
1,Fri,Yes,16.813333,2.714,2.066667
2,Sat,No,19.661778,3.102889,2.555556
3,Sat,Yes,21.276667,2.875476,2.47619
4,Sun,No,20.506667,3.167895,2.929825
5,Sun,Yes,24.12,3.516842,2.578947
6,Thur,No,17.113111,2.673778,2.488889
7,Thur,Yes,19.190588,3.03,2.352941


 # Apply: General split-apply-combine
 The most general-purpose GroupBy method is apply, which is the subject of this
 section. apply splits the object being manipulated into pieces, invokes the passed
 function on each piece, and then attempts to concatenate the pieces.
 Returning to the tipping dataset from before, suppose you wanted to select the top
 five tip_pct values by group. First, write a function that selects the rows with the
 largest values in a particular column:

In [None]:
def top(df, n=5, column="tip_pct"):
    """Return the first ``n`` rows of ``df`` after sorting by ``column`` descending.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : label of the column to sort by (default "tip_pct").
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


Now, if we group by smoker, say, and call apply with this function, we get the
 following:

In [None]:
tips.groupby("smoker").apply(top)

  tips.groupby("smoker").apply(top)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


 What has happened here? First, the tips DataFrame is split into groups based on the
 value of smoker. Then the top function is called on each group, and the results of
 each function call are glued together using pandas.concat, labeling the pieces with
 the group names. The result therefore has a hierarchical index with an inner level that
 contains index values from the original DataFrame.

 If you pass a function to apply that takes other arguments or keywords, you can pass
 these after the function:

In [None]:
tips.groupby("day").apply(top, n=1, column="total_bill")

  tips.groupby("day").apply(top, n=1, column="total_bill")


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


Beyond these basic usage mechanics, getting the most out of apply may require some
 creativity. What occurs inside the function passed is up to you; it must either return
 a pandas object or a scalar value. The rest of this chapter will consist mainly of
 examples showing you how to solve various problems using groupby.


 For example, you may recall that I earlier called describe on a GroupBy object:

In [None]:
result = tips.groupby("smoker")["tip_pct"].describe()
result
result.unstack("smoker")

# Suppressing the Group Keys

 In the preceding examples, you see that the resulting object has a hierarchical index
 formed from the group keys, along with the indexes of each piece of the original
 object. You can disable this by passing group_keys=False to groupby:

In [None]:
tips.groupby("smoker", group_keys=False).apply(top)

  tips.groupby("smoker", group_keys=False).apply(top)


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


 # Quantile and Bucket Analysis

In [None]:
frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()
quartiles = pd.cut(frame["data1"], 4)
quartiles.head(10)

Unnamed: 0,data1
0,"(-1.23, 0.489]"
1,"(0.489, 2.208]"
2,"(-1.23, 0.489]"
3,"(-1.23, 0.489]"
4,"(0.489, 2.208]"
5,"(0.489, 2.208]"
6,"(-1.23, 0.489]"
7,"(-1.23, 0.489]"
8,"(-2.956, -1.23]"
9,"(-1.23, 0.489]"


In [None]:
def get_stats(group):
    """Summarize ``group`` as a DataFrame with min, max, count and mean columns.

    Each statistic is computed by calling the matching method on ``group``;
    the results become the columns "min", "max", "count" and "mean".
    """
    summary = {
        "min": group.min(),
        "max": group.max(),
        "count": group.count(),
        "mean": group.mean(),
    }
    return pd.DataFrame(summary)

grouped = frame.groupby(quartiles)
grouped.apply(get_stats)

In [None]:
grouped.agg(["min", "max", "count", "mean"])

In [None]:
quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)

In [None]:
s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s
s.fillna(s.mean())

In [None]:
states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]
data = pd.Series(np.random.standard_normal(8), index=states)
data

In [None]:
data[["Vermont", "Nevada", "Idaho"]] = np.nan
data
data.groupby(group_key).size()
data.groupby(group_key).count()
data.groupby(group_key).mean()

In [None]:
def fill_mean(group):
    """Return ``group`` with its missing values replaced by the group's mean."""
    group_mean = group.mean()
    return group.fillna(group_mean)

data.groupby(group_key).apply(fill_mean)

In [None]:
# Fixed replacement value for each group, keyed by group name.
fill_values = {"East": 0.5, "West": -1}

def fill_func(group):
    """Fill missing values in ``group`` with the value registered for its name.

    The replacement is looked up in ``fill_values`` using ``group.name``.
    """
    replacement = fill_values[group.name]
    return group.fillna(replacement)

data.groupby(group_key).apply(fill_func)

In [None]:
# Build a 52-card deck as a Series: the index is the card name (rank followed
# by the suit letter) and the value is the card's points (ace = 1, number
# cards = face value, J/K/Q = 10).
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
base_names = ["A", *range(2, 11), "J", "K", "Q"]
card_val = [*range(1, 11), 10, 10, 10] * 4
# Suits vary slowest, so names line up with card_val one suit at a time.
cards = [f"{rank}{suit}" for suit in suits for rank in base_names]

deck = pd.Series(card_val, index=cards)

In [None]:
deck.head(13)

In [None]:
def draw(deck, n=5):
    """Return ``n`` randomly sampled entries of ``deck`` (default 5)."""
    hand = deck.sample(n)
    return hand
draw(deck)

In [None]:
def get_suit(card):
    """Return the suit of ``card``, taken to be its final character."""
    suit = card[-1]  # card names end with the suit letter
    return suit

deck.groupby(get_suit).apply(draw, n=2)

In [None]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

In [None]:
df = pd.DataFrame({"category": ["a", "a", "a", "a",
                                "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})
df

In [None]:
grouped = df.groupby("category")
def get_wavg(group):
    """Return the average of ``group["data"]`` weighted by ``group["weights"]``."""
    values = group["data"]
    weights = group["weights"]
    return np.average(values, weights=weights)

grouped.apply(get_wavg)

In [None]:
# Load the price table from stock_px.csv, using the first CSV column as the
# index and parsing it as dates.
# NOTE(review): this path is relative to the working directory, unlike
# tips.csv, which is read from the mounted Drive folder — confirm the file
# exists at this location.
close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True,
                       index_col=0)
close_px.info()
close_px.tail(4)

In [None]:
def spx_corr(group):
    """Return the correlation of every column of ``group`` with its "SPX" column."""
    benchmark = group["SPX"]
    return group.corrwith(benchmark)

In [None]:
rets = close_px.pct_change().dropna()

In [None]:
def get_year(x):
    """Return the ``year`` attribute of ``x`` (used as a group key for date labels)."""
    year = x.year
    return year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

In [None]:
def corr_aapl_msft(group):
    """Return the correlation between the "AAPL" and "MSFT" columns of ``group``."""
    aapl = group["AAPL"]
    msft = group["MSFT"]
    return aapl.corr(msft)
by_year.apply(corr_aapl_msft)

In [None]:
import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    """Fit an ordinary-least-squares regression of ``yvar`` on ``xvars``.

    Parameters
    ----------
    data : DataFrame holding both the response and the regressor columns.
    yvar : label of the response column in ``data``.
    xvars : list of regressor column labels in ``data``.

    Returns
    -------
    The ``params`` of the fitted ``sm.OLS`` result: one coefficient per entry
    of ``xvars`` followed by one for the added "intercept" column.
    """
    response = data[yvar]
    # assign() returns a new frame with the constant column appended, rather
    # than item-assigning into data[xvars], which writes into a slice of the
    # caller's frame and can trigger SettingWithCopyWarning.
    design = data[xvars].assign(intercept=1.)
    result = sm.OLS(response, design).fit()
    return result.params

In [None]:
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

In [None]:
df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': np.arange(12.)})
df

In [None]:
g = df.groupby('key')['value']
g.mean()

In [None]:
def get_mean(group):
    """Return the mean of ``group``."""
    average = group.mean()
    return average
g.transform(get_mean)

In [None]:
g.transform('mean')

In [None]:
def times_two(group):
    """Return ``group`` multiplied by 2 (element-wise for a Series)."""
    doubled = group * 2
    return doubled
g.transform(times_two)

In [None]:
def get_ranks(group):
    """Return the descending ranks of ``group``'s values (largest value gets rank 1)."""
    ranks = group.rank(ascending=False)
    return ranks
g.transform(get_ranks)

In [None]:
def normalize(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation."""
    centered = x - x.mean()
    return centered / x.std()

In [None]:
g.transform(normalize)
g.apply(normalize)

In [None]:
g.transform('mean')
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized

In [None]:
tips.head()
tips.pivot_table(index=["day", "smoker"],
                 values=["size", "tip", "tip_pct", "total_bill"])

In [None]:
tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"])

In [None]:
tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"], margins=True)

In [None]:
tips.pivot_table(index=["time", "smoker"], columns="day",
                 values="tip_pct", aggfunc=len, margins=True)

In [None]:
tips.pivot_table(index=["time", "size", "smoker"], columns="day",
                 values="tip_pct", fill_value=0)

In [None]:
from io import StringIO
# Raw whitespace-delimited sample table; kept under its own name so that
# `data` only ever refers to the parsed DataFrame.
handedness_text = """Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
# The separator is a regex, so write it as a raw string: "\s" in a normal
# string literal is an invalid escape sequence and triggers a warning on
# recent Python versions.
data = pd.read_table(StringIO(handedness_text), sep=r"\s+")

In [None]:
data

In [None]:
pd.crosstab(data["Nationality"], data["Handedness"], margins=True)

In [None]:
pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS