In [8]:
import pandas as pd

# Toy employee table: one row per employee, with department, salary and age.
data = {
    "department": ["Sales", "Engineering", "Sales", "Engineering", "Marketing"],
    "employee": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
    "salary": [70000, 80000, 65000, 85000, 75000],
    "age": [22, 23, 44, 55, 66],
}

df = pd.DataFrame(data)

# Average salary per department. Selecting a single column before reducing
# gives a Series whose index holds the department names.
result = (
    df.groupby("department")["salary"]
    .mean()
)
print(result)

department
Engineering    82500.0
Marketing      75000.0
Sales          67500.0
Name: salary, dtype: float64


In [None]:
# Per-department salary summary: mean, minimum, maximum and the number of
# non-null salary values. The dict maps a column to the functions applied to it.
df.groupby("department").agg(
    {"salary": ["mean", "min", "max", "count"]}
)

In [15]:
# Filter: keep only the rows of departments whose mean salary exceeds 60,000,
# then reduce to the department column and list each surviving department once.
(
    df.groupby("department")
    .filter(lambda dept_rows: dept_rows["salary"].mean() > 60000)
    .loc[:, ["department"]]
    .drop_duplicates()
)

Unnamed: 0,department
0,Sales
1,Engineering
4,Marketing


In [None]:
# Transform: compute a per-group result that is aligned row-for-row with df.
# Each salary becomes a z-score within its own department. A department with a
# single row has an undefined (NaN) standard deviation, so its value is NaN.
df["salary_normalized"] = (
    df.groupby("department")["salary"]
    .transform(lambda dept_salary: (dept_salary - dept_salary.mean()) / dept_salary.std())
)

In [16]:
# Several built-in aggregations at once: a list of function names produces
# one output column per name, with one row per department.
(
    df.groupby("department")["salary"]
    .agg(["mean", "sum", "count", "std"])
)

Unnamed: 0_level_0,mean,sum,count,std
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Engineering,82500.0,165000,2,3535.533906
Marketing,75000.0,75000,1,
Sales,67500.0,135000,2,3535.533906


In [18]:
# Custom aggregation: a callable that reduces each group's salaries to one
# value. This one returns the salary range (max minus min) per department.
df.groupby("department")["salary"].agg(
    lambda dept_salary: dept_salary.max() - dept_salary.min()
)

department
Engineering    5000
Marketing         0
Sales          5000
Name: salary, dtype: int64

In [5]:
# Named aggregation: each keyword is the output column name and its value is
# the aggregation applied to salary within each department.
(
    df.groupby("department")["salary"]
    .agg(average_salary="mean", total_payroll="sum", employee_count="count")
)

Unnamed: 0_level_0,average_salary,total_payroll,employee_count
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineering,82500.0,165000,2
Marketing,75000.0,75000,1
Sales,67500.0,135000,2


In [9]:
# Different aggregations per column: the dict maps each column name to the
# list of functions applied to it, giving two-level column labels
# (column name, function name).
df.groupby("department").agg(
    {"salary": ["mean", "median", "std"], "age": ["min", "max"]}
)

Unnamed: 0_level_0,salary,salary,salary,age,age
Unnamed: 0_level_1,mean,median,std,min,max
department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Engineering,82500.0,82500.0,3535.533906,23,55
Marketing,75000.0,75000.0,,66,66
Sales,67500.0,67500.0,3535.533906,22,44


## `as_index=False` is usually the better choice when using `groupby`

In [1]:
import pandas as pd

# A small department/salary table is enough to contrast the two settings.
data = {
    "department": ["Sales", "Engineering", "Sales", "Marketing"],
    "salary": [70000, 80000, 65000, 75000],
}
df = pd.DataFrame(data)

# Default (as_index=True): the group key becomes the index, so reducing a
# single selected column yields a Series.
result1 = df.groupby("department")["salary"].mean()

# as_index=False: the group key stays an ordinary column, so the same
# reduction yields a DataFrame with a default integer index.
result2 = df.groupby("department", as_index=False)["salary"].mean()

# Report each result with its type: label, value and type on separate lines.
print("as_index=True (default):", result1, type(result1), sep="\n")
print("\nas_index=False:", result2, type(result2), sep="\n")

as_index=True (default):
department
Engineering    80000.0
Marketing      75000.0
Sales          67500.0
Name: salary, dtype: float64
<class 'pandas.core.series.Series'>

as_index=False:
    department   salary
0  Engineering  80000.0
1    Marketing  75000.0
2        Sales  67500.0
<class 'pandas.core.frame.DataFrame'>


In [3]:
# Display result1 (default as_index=True): a Series indexed by department.
result1

department
Engineering    80000.0
Marketing      75000.0
Sales          67500.0
Name: salary, dtype: float64

In [2]:
# Display result2 (as_index=False): a DataFrame with department as a regular column.
result2

Unnamed: 0,department,salary
0,Engineering,80000.0
1,Marketing,75000.0
2,Sales,67500.0
