# Chapter 9: The GroupBy object

## 9.1 Creating a GroupBy object from scratch

In [1]:
import pandas as pd

In [2]:
# column-oriented sample data: each key becomes a column, each list its values
food_data = dict(
    Item=["Banana", "Cucumber", "Orange", "Tomato", "Watermelon"],
    Type=["Fruit", "Vegetable", "Fruit", "Vegetable", "Fruit"],
    Price=[0.99, 1.25, 0.25, 0.33, 3.00],
)

# build the DataFrame from the dictionary and display it
supermarket = pd.DataFrame(food_data)
supermarket

Unnamed: 0,Item,Type,Price
0,Banana,Fruit,0.99
1,Cucumber,Vegetable,1.25
2,Orange,Fruit,0.25
3,Tomato,Vegetable,0.33
4,Watermelon,Fruit,3.0


In [3]:
# bucket the supermarket rows by their value in the Type column
groups = supermarket.groupby(by="Type")
groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F22277E150>

In [4]:
# pull out the rows that belong to the "Fruit" group as a DataFrame
groups.get_group("Fruit")

Unnamed: 0,Item,Type,Price
0,Banana,Fruit,0.99
2,Orange,Fruit,0.25
4,Watermelon,Fruit,3.0


In [5]:
# pull out the rows that belong to the "Vegetable" group as a DataFrame
groups.get_group("Vegetable")

Unnamed: 0,Item,Type,Price
1,Cucumber,Vegetable,1.25
3,Tomato,Vegetable,0.33


In [6]:
# average each group's numeric columns; numeric_only=True leaves out
# the text Item column, which has no meaningful mean
groups.mean(numeric_only=True)

Unnamed: 0_level_0,Price
Type,Unnamed: 1_level_1
Fruit,1.413333
Vegetable,0.79


## 9.2 Creating a GroupBy object from a data set

In [7]:
# load the Fortune 1000 CSV file into a DataFrame and display it;
# the path is relative to the notebook's working directory
fortune = pd.read_csv("data/ch09/fortune1000.csv")
fortune

Unnamed: 0,Company,Revenues,Profits,Employees,Sector,Industry
0,Walmart,500343.0,9862.0,2300000,Retailing,General Merchandisers
1,Exxon Mobil,244363.0,19710.0,71200,Energy,Petroleum Refining
2,Berkshire Hathaway,242137.0,44940.0,377000,Financials,Insurance: Property and Casualty (Stock)
3,Apple,229234.0,48351.0,123000,Technology,"Computers, Office Equipment"
4,UnitedHealth Group,201159.0,10558.0,260000,Health Care,Health Care: Insurance and Managed Care
...,...,...,...,...,...,...
995,SiteOne Landscape Supply,1862.0,54.6,3664,Wholesalers,Wholesalers: Diversified
996,Charles River Laboratories Intl,1858.0,123.4,11800,Health Care,Health Care: Pharmacy and Other Services
997,CoreLogic,1851.0,152.2,5900,Business Services,Financial Data Services
998,Ensign Group,1849.0,40.5,21301,Health Care,Health Care: Medical Facilities


In [8]:
# build a Boolean mask flagging rows whose Sector is "Retailing",
# keep only the flagged rows, then preview the first five
in_retailing = fortune["Sector"].eq("Retailing")
retail_companies = fortune.loc[in_retailing]
retail_companies.head()

Unnamed: 0,Company,Revenues,Profits,Employees,Sector,Industry
0,Walmart,500343.0,9862.0,2300000,Retailing,General Merchandisers
7,Amazon.com,177866.0,3033.0,566000,Retailing,Internet Services and Retailing
14,Costco,129025.0,2679.0,182000,Retailing,General Merchandisers
22,Home Depot,100904.0,8630.0,413000,Retailing,Specialty Retailers: Other
38,Target,71879.0,2934.0,345000,Retailing,General Merchandisers


In [9]:
# pull out the Revenues column from the retail subset and preview its first five values
retail_companies["Revenues"].head()

0     500343.0
7     177866.0
14    129025.0
22    100904.0
38     71879.0
Name: Revenues, dtype: float64

In [10]:
# average of the Revenues column across the Retailing companies only
retail_companies["Revenues"].mean()

21874.714285714286

In [11]:
# invoke the groupby method on the fortune DataFrame to bucket its rows
# by their value in the Sector column
sectors = fortune.groupby("Sector")

In [12]:
# display the GroupBy object; its repr shows only the type and memory address
sectors

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F2227B4250>

In [13]:
# count the number of groups in the GroupBy object
len(sectors)

21

In [14]:
# cross-check: count the distinct values in the Sector column directly
fortune["Sector"].nunique()

21

In [15]:
# count the number of rows in each Sector group
sectors.size()

Sector
Aerospace & Defense               25
Apparel                           14
Business Services                 53
Chemicals                         33
Energy                           107
Engineering & Construction        27
Financials                       155
Food &  Drug Stores               12
Food, Beverages & Tobacco         37
Health Care                       71
Hotels, Restaurants & Leisure     26
Household Products                28
Industrials                       49
Materials                         45
Media                             25
Motor Vehicles & Parts            19
Retailing                         77
Technology                       103
Telecommunications                10
Transportation                    40
Wholesalers                       44
dtype: int64