### Как использовать groupby в связке с aggregate

Файл по примерам из урока "1.6. Группировка и агрегация" курса [Введение в Data Science и машинное обучение](https://stepik.org/course/4852/syllabus).

In [1]:
import pandas as pd
import numpy as np

# Load the students' performance dataset (path is relative to the notebook).
students_performance = pd.read_csv('files/group_and_aggregate.csv')
# Swap spaces for underscores in the column names so that columns can be
# reached through attribute access (e.g. ``students_performance.math_score``).
students_performance = students_performance.rename(
    columns=lambda name: name.replace(" ", "_")
)
students_performance.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [2]:
# Mean math and reading scores per gender; the grouping key becomes the index.
(
    students_performance
    .groupby('gender')
    .agg({'math_score': 'mean', 'reading_score': 'mean'})
)

Unnamed: 0_level_0,math_score,reading_score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,63.633205,72.608108
male,68.728216,65.473029


In [3]:
# Mean scores per gender.  ``as_index=False`` keeps ``gender`` as a regular
# column, and the aggregated columns are renamed to say that they hold means.
(
    students_performance
    .groupby('gender', as_index=False)
    .agg({'math_score': 'mean', 'reading_score': 'mean'})
    .rename(columns={'math_score': 'mean_math_score',
                     'reading_score': 'mean_reading_score'})
)

Unnamed: 0,gender,mean_math_score,mean_reading_score
0,female,63.633205,72.608108
1,male,68.728216,65.473029


In [4]:
# Mean scores for every (gender, race/ethnicity) combination, with both
# grouping keys kept as regular columns.
(
    students_performance
    .groupby(['gender', 'race/ethnicity'], as_index=False)
    .agg({'math_score': 'mean', 'reading_score': 'mean'})
    .rename(columns={'math_score': 'mean_math_score',
                     'reading_score': 'mean_reading_score'})
)

Unnamed: 0,gender,race/ethnicity,mean_math_score,mean_reading_score
0,female,group A,58.527778,69.0
1,female,group B,61.403846,71.076923
2,female,group C,62.033333,71.944444
3,female,group D,65.248062,74.046512
4,female,group E,70.811594,75.84058
5,male,group A,63.735849,61.735849
6,male,group B,65.930233,62.848837
7,male,group C,67.611511,65.42446
8,male,group D,69.413534,66.135338
9,male,group E,76.746479,70.295775


### Многоуровневая индексация

In [5]:
# Grouping by two keys without ``as_index=False`` leaves both keys in the
# index, which therefore becomes a MultiIndex.
mean_scores = (
    students_performance
    .groupby(['gender', 'race/ethnicity'])
    .agg({'math_score': 'mean', 'reading_score': 'mean'})
    .rename(columns={'math_score': 'mean_math_score',
                     'reading_score': 'mean_reading_score'})
)
mean_scores.index

MultiIndex([('female', 'group A'),
            ('female', 'group B'),
            ('female', 'group C'),
            ('female', 'group D'),
            ('female', 'group E'),
            (  'male', 'group A'),
            (  'male', 'group B'),
            (  'male', 'group C'),
            (  'male', 'group D'),
            (  'male', 'group E')],
           names=['gender', 'race/ethnicity'])

In [6]:
# A full index tuple (gender, race/ethnicity) selects a single row.
mean_scores.loc[('female', 'group A')]

mean_math_score       58.527778
mean_reading_score    69.000000
Name: (female, group A), dtype: float64

In [7]:
# A list of index tuples selects several rows at once.
mean_scores.loc[[('female', 'group A'), ('female', 'group B')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_math_score,mean_reading_score
gender,race/ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1
female,group A,58.527778,69.0
female,group B,61.403846,71.076923


Преимущество MultiIndex – сгруппированная серия

In [8]:
students_performance.math_score.unique() # unique values

array([ 72,  69,  90,  47,  76,  71,  88,  40,  64,  38,  58,  65,  78,
        50,  18,  46,  54,  66,  44,  74,  73,  67,  70,  62,  63,  56,
        97,  81,  75,  57,  55,  53,  59,  82,  77,  33,  52,   0,  79,
        39,  45,  60,  61,  41,  49,  30,  80,  42,  27,  43,  68,  85,
        98,  87,  51,  99,  84,  91,  83,  89,  22, 100,  96,  94,  48,
        35,  34,  86,  92,  37,  28,  24,  26,  95,  36,  29,  32,  93,
        19,  23,   8])

In [9]:
students_performance.math_score.nunique() # number of unique values

81

In [10]:
# unique values at the intersection of two groups
students_performance.groupby(['gender', 'race/ethnicity']).math_score
# the result is a grouped series: a one-dimensional array that carries the grouping by two variables

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f43a76f9278>

In [11]:
# Number of distinct math scores within each (gender, race/ethnicity) group.
students_performance.groupby(['gender', 'race/ethnicity']).math_score.nunique()

gender  race/ethnicity
female  group A           29
        group B           51
        group C           59
        group D           53
        group E           44
male    group A           38
        group B           43
        group C           56
        group D           49
        group E           38
Name: math_score, dtype: int64

In [12]:
# Check the type of the aggregated result.
type(students_performance.groupby(['gender', 'race/ethnicity']).math_score.nunique())

pandas.core.series.Series

### Поиск топ-элементов по группам

In [13]:
# Sort by gender and math score, both descending, then keep the first rows of
# each gender group (``head()`` defaults to 5), i.e. the top math scorers.
(
    students_performance
    .sort_values(['gender', 'math_score'], ascending=False)
    .groupby('gender')
    .head()
)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
149,male,group E,associate's degree,free/reduced,completed,100,100,93
623,male,group A,some college,standard,completed,100,96,86
625,male,group D,some college,standard,completed,100,97,99
916,male,group E,bachelor's degree,standard,completed,100,100,100
306,male,group E,some college,standard,completed,99,87,81
451,female,group E,some college,standard,none,100,92,97
458,female,group E,bachelor's degree,standard,none,100,100,100
962,female,group E,associate's degree,standard,none,100,100,100
114,female,group E,bachelor's degree,standard,completed,99,100,100
263,female,group E,high school,standard,none,99,93,90


### Создание новых колонок

In [14]:
# Add a column with the sum of the three subject scores.
students_performance['total_score'] = (
    students_performance['math_score']
    + students_performance['reading_score']
    + students_performance['writing_score']
)
students_performance.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


In [15]:
# ``assign`` returns a new frame with the extra column; the callable receives
# the frame being extended, so the log is taken of its ``total_score``.
students_performance = students_performance.assign(
    total_score_log=lambda frame: np.log(frame['total_score'])
)
students_performance.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,total_score_log
0,female,group B,bachelor's degree,standard,none,72,72,74,218,5.384495
1,female,group C,some college,standard,completed,69,90,88,247,5.509388
2,female,group B,master's degree,standard,none,90,95,93,278,5.627621
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,4.997212
4,male,group C,some college,standard,none,76,78,75,229,5.433722


Удаление:

In [16]:
# ``drop`` returns a new frame without the listed columns; the result is not
# assigned back, so ``students_performance`` itself keeps them.
students_performance.drop(columns=['total_score', 'lunch']).head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,test_preparation_course,math_score,reading_score,writing_score,total_score_log
0,female,group B,bachelor's degree,none,72,72,74,5.384495
1,female,group C,some college,completed,69,90,88,5.509388
2,female,group B,master's degree,none,90,95,93,5.627621
3,male,group A,associate's degree,none,47,57,44,4.997212
4,male,group C,some college,none,76,78,75,5.433722


## Задачи по теме

**Задача 1**. Пересчитаем число ног у героев игры Dota2 (`dota_hero_stats.csv`). Сгруппируйте героев из датасета по числу их ног (колонка `legs`).

Два варианта решения задачи:

In [17]:
# Variant 1: count the heroes for every value of ``legs`` and order the
# result by the number of legs.
df = pd.read_csv('files/dota_hero_stats.csv')
df.legs.value_counts().sort_index()

0    11
2    95
4     7
6     3
8     1
Name: legs, dtype: int64

In [18]:
# Variant 2: group by ``legs`` and count the rows in every group.
(
    df
    .groupby('legs')
    .agg({'legs': 'count'})
)

Unnamed: 0_level_0,legs
legs,Unnamed: 1_level_1
0,11
2,95
4,7
6,3
8,1


**Задача 2**. К нам поступили данные из бухгалтерии о заработках Лупы и Пупы за разные задачи (`accountancy.csv`). Посмотрите, у кого из них больше средний заработок в различных категориях (колонка `Type`).

In [77]:
# Load the accountancy data, using the first CSV column as the index.
df = pd.read_csv('files/accountancy.csv', index_col=0)
df.head()

Unnamed: 0,Executor,Type,Salary
0,Pupa,D,63
1,Pupa,A,158
2,Pupa,D,194
3,Pupa,E,109
4,Loopa,E,184


In [78]:
# Mean salary for every (task type, executor) pair.
(
    df
    .groupby(['Type', 'Executor'])
    .agg({'Salary': 'mean'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Salary
Type,Executor,Unnamed: 2_level_1
A,Loopa,58.0
A,Pupa,160.833333
B,Loopa,145.166667
B,Pupa,77.0
C,Loopa,154.333333
C,Pupa,74.5
D,Loopa,137.714286
D,Pupa,146.5
E,Loopa,164.0
E,Pupa,131.2


In [82]:
# ``unstack`` moves the inner index level (Executor) into the columns, so the
# two executors can be compared side by side for every task type.
df.groupby(['Type', 'Executor']).mean().unstack()

Unnamed: 0_level_0,Salary,Salary
Executor,Loopa,Pupa
Type,Unnamed: 1_level_2,Unnamed: 2_level_2
A,58.0,160.833333
B,145.166667,77.0
C,154.333333,74.5
D,137.714286,146.5
E,164.0,131.2
F,238.0,136.25


In [91]:
# Pivot-table variant: only ``columns`` is given, so the (Type, Executor)
# groups are laid out horizontally and the remaining column is averaged.
pd.pivot_table(df,
               columns=['Type', 'Executor'],
               aggfunc='mean')

Type,A,A,B,B,C,C,D,D,E,E,F,F
Executor,Loopa,Pupa,Loopa,Pupa,Loopa,Pupa,Loopa,Pupa,Loopa,Pupa,Loopa,Pupa
Salary,58.0,160.833333,145.166667,77.0,154.333333,74.5,137.714286,146.5,164.0,131.2,238.0,136.25


**Задача 3**. Продолжим исследование героев Dota2. Сгруппируйте по колонкам `attack_type` и `primary_attr` и выберите самый распространённый набор характеристик.

In [94]:
# Reload the hero stats, this time using the first CSV column as the index.
df = pd.read_csv('files/dota_hero_stats.csv', index_col = 0)

In [116]:
# Count heroes for every (attack_type, primary_attr) pair and sort the pairs
# from the most to the least common; the first row is the answer.
(
    df
    .groupby(["attack_type", "primary_attr"], as_index=False)
    .agg({'id': 'count'})
    .sort_values('id', ascending=False)
)

Unnamed: 0,attack_type,primary_attr,id
4,Ranged,int,40
2,Melee,str,35
0,Melee,agi,19
3,Ranged,agi,18
5,Ranged,str,3
1,Melee,int,2


**Задача**. Аспирант Ростислав изучает метаболом водорослей и получил такую табличку `algae.csv`. В ней он записал вид каждой водоросли, её род (группа, объединяющая близкие виды), группу (ещё одно объединение водорослей в крупные фракции) и концентрации анализируемых веществ.

Помогите Ростиславу найти среднюю концентрацию каждого из веществ в каждом из родов (колонка `genus`). Для этого проведите группировку датафрейма, сохранённого в переменной `concentrations`, и примените метод, сохранив результат в переменной `mean_concentrations`.

In [121]:
# Load the algae metabolite concentrations.
concentrations = pd.read_csv('files/algae.csv')
concentrations.head()

Unnamed: 0,species,genus,group,sucrose,alanin,citrate,glucose,oleic_acid
0,Fucus_vesiculosus,Fucus,brown,3.001472,3.711498,5.004262,2.548459,6.405165
1,Saccharina_japonica,Saccharina,brown,6.73107,1.255251,5.621499,6.013219,4.1567
2,Fucus_serratus,Fucus,brown,3.27687,0.346431,1.216767,3.623225,0.304573
3,Fucus_distichus,Fucus,brown,6.786996,6.641303,6.423606,2.272724,3.393203
4,Cladophora_fracta,Cladophora,green,3.86147,1.64845,6.940588,2.316955,2.528886


In [124]:
# Mean concentration of every substance per genus.  ``numeric_only=True``
# restricts the mean to the numeric columns: the frame also holds the text
# columns ``species`` and ``group``, and pandas >= 2.0 raises TypeError on
# them instead of silently dropping them as older versions did.
concentrations.groupby('genus').mean(numeric_only=True)

Unnamed: 0_level_0,sucrose,alanin,citrate,glucose,oleic_acid
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ascophyllum,6.825467,0.875429,5.253527,3.414961,2.432526
Cladophora,4.008792,3.997055,5.288311,2.800276,2.263472
Fucus,4.355112,3.566411,4.214878,2.814803,3.367647
Palmaria,0.70458,3.17644,5.573905,3.24209,2.245538
Saccharina,4.183596,3.524207,3.34671,4.980594,4.487252


In [125]:
# Mean concentration of every substance per genus.  ``numeric_only=True``
# keeps the text columns (``species``, ``group``) out of the mean; without
# it pandas >= 2.0 raises TypeError on them.
mean_concentrations = concentrations.groupby('genus').mean(numeric_only=True)

**Задача**. Пользуясь предыдущими данными, укажите через пробел (без запятых), чему равны минимальная, средняя и максимальная концентрации аланина (alanin) среди видов рода `Fucus`. Округлите до 2-го знака, десятичным разделителем является точка.

In [134]:
# Alanin concentrations of the species that belong to the genus Fucus.
is_fucus = concentrations['genus'] == 'Fucus'
fucus_alanin = concentrations.loc[is_fucus, 'alanin']

In [146]:
# The task asks for two decimal places, so use the fixed-point ``.2f`` spec.
# A bare precision such as ``.2`` or ``.3`` means *significant digits* and
# only happens to print two decimals for values of this magnitude.
print(f'Ответ: {fucus_alanin.min():.2f} '
      f'{fucus_alanin.mean():.2f} '
      f'{fucus_alanin.max():.2f}')

Ответ: 0.35 3.57 6.64


In [147]:
# Summary statistics of alanin per genus; the column is selected before
# ``describe`` so that only this column is summarised.
alanin_by_genus = concentrations.groupby('genus')['alanin']
alanin_by_genus.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ascophyllum,1.0,0.875429,,0.875429,0.875429,0.875429,0.875429,0.875429
Cladophora,3.0,3.997055,2.65705,1.64845,2.555071,3.461692,5.171358,6.881024
Fucus,3.0,3.566411,3.149943,0.346431,2.028965,3.711498,5.176401,6.641303
Palmaria,1.0,3.17644,,3.17644,3.17644,3.17644,3.17644,3.17644
Saccharina,2.0,3.524207,3.208789,1.255251,2.389729,3.524207,4.658685,5.793163


In [157]:
# Summary statistics of sucrose per algae group, transposed so that the
# groups become columns and the statistics become rows.
sucrose_by_group = concentrations.groupby('group')['sucrose']
sucrose_by_group.describe().transpose()

group,brown,green,red
count,6.0,3.0,1.0
mean,4.709666,4.008792,0.70458
std,2.336471,1.634817,
min,1.636122,2.452623,0.70458
25%,3.070321,3.157047,0.70458
50%,5.00397,3.86147,0.70458
75%,6.773014,4.786877,0.70458
max,6.825467,5.712284,0.70458


In [164]:
# Maximum citrate concentration within each algae group.  The grouping key
# and the aggregated column were swapped before: that grouped by the
# continuous ``citrate`` values and took the "maximum" group label instead.
# NOTE(review): intent inferred from the variable name; confirm.
citr_max = concentrations.groupby('group').agg({'citrate': 'max'})

In [167]:
# Display the whole table.
concentrations

Unnamed: 0,species,genus,group,sucrose,alanin,citrate,glucose,oleic_acid
0,Fucus_vesiculosus,Fucus,brown,3.001472,3.711498,5.004262,2.548459,6.405165
1,Saccharina_japonica,Saccharina,brown,6.73107,1.255251,5.621499,6.013219,4.1567
2,Fucus_serratus,Fucus,brown,3.27687,0.346431,1.216767,3.623225,0.304573
3,Fucus_distichus,Fucus,brown,6.786996,6.641303,6.423606,2.272724,3.393203
4,Cladophora_fracta,Cladophora,green,3.86147,1.64845,6.940588,2.316955,2.528886
5,Cladophora_compacta,Cladophora,green,5.712284,3.461692,3.082826,3.343707,1.432514
6,Cladophora_gracilis,Cladophora,green,2.452623,6.881024,5.84152,2.740165,2.829016
7,Palmaria_palmata,Palmaria,red,0.70458,3.17644,5.573905,3.24209,2.245538
8,Saccharina_latissima,Saccharina,brown,1.636122,5.793163,1.07192,3.947968,4.817804
9,Ascophyllum_nodosum,Ascophyllum,brown,6.825467,0.875429,5.253527,3.414961,2.432526
