# Apply Function Groupby

In [1]:
import pandas as pd

# Build the sample DataFrame: three departments with two employees each.
data = {
    "department": ["Sales", "Sales", "HR", "HR", "Tech", "Tech"],
    "employee": ["Alice", "Bob", "Charlie", "David", "Eva", "Frank"],
    "salary": [70000, 80000, 50000, 55000, 90000, 95000],
}

# One row per employee; the columns follow the key order of `data`.
df = pd.DataFrame(data)

## 返回多个值

In [None]:
# Per-department average and highest salary, computed via groupby().apply().
def salary_summary(group):
    """Summarise the ``salary`` column of a single group.

    Parameters
    ----------
    group
        The rows of one group; must support ``group['salary']``.

    Returns
    -------
    pd.Series
        Two entries labelled ``'Average Salary'`` (mean) and
        ``'Highest Salary'`` (max).
    """
    salaries = group['salary']
    # Several values are returned as one labelled pd.Series; a function that
    # produces a single value can simply return that value directly.
    stats = {
        'Average Salary': salaries.mean(),
        'Highest Salary': salaries.max(),
    }
    return pd.Series(stats)
    # Alternatives kept for comparison -- both drop the labels:
    # return salaries.mean(), salaries.max()
    # return [salaries.mean(), salaries.max()]

# Select the salary column explicitly before apply(): salary_summary only
# reads 'salary', and an explicit selection keeps the grouping column out of
# the frames handed to the function. Applying to the unselected groupby is
# deprecated in recent pandas releases and triggers a warning there.
summary_df = df.groupby('department')[['salary']].apply(salary_summary)

# One row per department, one column per label returned by salary_summary.
print(summary_df)

department
HR       (52500.0, 55000)
Sales    (75000.0, 80000)
Tech     (92500.0, 95000)
dtype: object


这里如果不用`pd.Series`返回多个值，而直接写成`return avg_salary, max_salary`，每个分组返回的就是一个元组，`apply()`的结果是一个元素为元组的Series（dtype为object）。这样的结果不便于进一步分析，比如无法直接通过 `summary_df["Average Salary"]` 或 `summary_df["Highest Salary"]` 访问某一项。   
department   
HR       (52500.0, 55000)   
Sales    (75000.0, 80000)   
Tech     (92500.0, 95000)   
dtype: object   

也可以返回一个列表（写成`return [avg_salary, max_salary]`；注意`list(avg_salary, max_salary)`会抛出`TypeError`，因为`list()`只接受一个可迭代对象作为参数），但是这样同样会丢失列名，不够直观。   
department   
HR       [52500.0, 55000]   
Sales    [75000.0, 80000]   
Tech     [92500.0, 95000]   
dtype: object   

如果让函数返回`pd.Series`，`groupby().apply()`会把各组的结果合并成一个DataFrame，Series的索引标签成为列名，因此可以像表格一样方便地访问列 `summary_df["Average Salary"]` 和 `summary_df["Highest Salary"]`。   
            Average Salary  Highest Salary   
department                                  
HR                 52500.0           55000   
Sales              75000.0           80000   
Tech               92500.0           95000   


## 返回单个值

In [4]:
import pandas as pd

# Build a second sample DataFrame (seven employees; Tech has three).
# Note: this rebinds `data` and `df`, replacing the earlier sample frame.
data = {
    "department": ["Sales", "Sales", "HR", "HR", "Tech", "Tech", "Tech"],
    "employee": ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace"],
    "salary": [75000, 90000, 85000, 60000, 95000, 72000, 88000],
}

# One row per employee; the columns follow the key order of `data`.
df = pd.DataFrame(data)

In [5]:
# Function for apply(): count the high earners (salary > 80,000 by default).
def high_salary_count(column, threshold=80000):
    """Count the entries of ``column`` that are strictly greater than ``threshold``.

    Parameters
    ----------
    column
        Values to test; must support an element-wise ``>`` comparison whose
        result has a ``.sum()`` method (e.g. a pandas Series).
    threshold
        Exclusive lower bound for a "high" salary. Defaults to 80000, the
        cutoff this function originally hard-coded. A different cutoff can be
        forwarded through apply, e.g.
        ``.apply(high_salary_count, threshold=90000)``.

    Returns
    -------
    The number of entries above ``threshold`` (0 for empty input).
    """
    # Summing the boolean mask counts its True entries.
    return (column > threshold).sum()

# Apply high_salary_count to each department's salary column to get the
# number of high earners per department.
high_salary_counts = (
    df.groupby('department')['salary']
    .apply(high_salary_count)
)

# Show the per-department counts.
print(high_salary_counts)

department
HR       1
Sales    1
Tech     2
Name: salary, dtype: int64
