# Agregación de datos por categoría

In [2]:
import numpy as np
import pandas as pd

In [4]:
# Seed NumPy's legacy global generator so every random draw below is reproducible.
SEED = 2022
np.random.seed(SEED)

In [5]:
# Category labels that the sampling loop draws from.
# Element order is significant: it determines which label each random draw maps to.
gender = [
    "Male",
    "Female",
]
income = [
    "Poor",
    "Middle Class",
    "Rich",
]

In [9]:
# Number of synthetic rows to generate.
n = 500

gender_data, income_data = [], []

for i in range(n):
    # np.random.choice picks one element at random from the given list.
    # The two draws are interleaved per row (gender first, then income);
    # keeping that order keeps the seeded sequence unchanged.
    gender_data += [np.random.choice(gender)]
    income_data += [np.random.choice(income)]

In [12]:
# Preview the first ten sampled gender labels.
gender_data[:10]

['Female',
 'Female',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Female',
 'Male',
 'Female']

In [14]:
# Preview the first ten sampled economic-status labels.
income_data[:10]

['Poor',
 'Poor',
 'Middle Class',
 'Poor',
 'Poor',
 'Middle Class',
 'Middle Class',
 'Rich',
 'Poor',
 'Poor']

In [38]:
def _ceil_normal(mean, std, size):
    """Draw `size` samples as mean + std * Z, with Z ~ N(0, 1), rounded up with np.ceil."""
    return np.ceil(mean + std * np.random.randn(size))


# Draws happen in this exact order (height, weight, age, income), which fixes
# the values obtained from the seeded generator.
height = _ceil_normal(170, 20, n)
weight = _ceil_normal(80, 30, n)
age = _ceil_normal(40, 10, n)
# NOTE(review): `income` was bound earlier to the list of economic-status labels
# and is rebound here to a float array. Re-running the sampling cell after this
# one would draw from this array instead of the labels; consider a distinct name.
income = _ceil_normal(18000, 3500, n)

In [39]:
# Assemble the categorical labels and the numeric samples into one table,
# one column per variable.
# NOTE(review): column-name casing is mixed ("height"/"income" vs "Weight"/"Age");
# left as-is because other cells may index by these exact names.
column_values = {
    "Gender": gender_data,
    "Economic Status": income_data,
    "height": height,
    "Weight": weight,
    "Age": age,
    "income": income,
}
data = pd.DataFrame(column_values)

In [41]:
# Display the assembled DataFrame (the notebook renders a truncated view).
data

Unnamed: 0,Gender,Economic Status,height,Weight,Age,income
0,Female,Poor,167.0,68.0,48.0,13520.0
1,Female,Poor,162.0,81.0,50.0,12487.0
2,Female,Middle Class,152.0,108.0,42.0,15797.0
3,Male,Poor,170.0,82.0,56.0,11538.0
4,Male,Poor,170.0,128.0,35.0,20875.0
...,...,...,...,...,...,...
495,Male,Rich,200.0,63.0,37.0,18448.0
496,Female,Poor,153.0,75.0,41.0,15301.0
497,Male,Middle Class,161.0,73.0,16.0,23216.0
498,Male,Rich,121.0,122.0,40.0,16684.0


### Agrupación de datos

In [42]:
# groupby takes one of the DataFrame's column keys; here rows are split by "Gender".
grouped_gender = data.groupby(by="Gender")

In [45]:
# Show the grouping: a mapping from each group label to the row index labels in that group.
grouped_gender.groups

{'Female': [0, 1, 2, 6, 7, 9, 10, 12, 14, 15, 17, 18, 19, 22, 23, 26, 28, 29, 30, 31, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 54, 58, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 75, 76, 78, 79, 85, 87, 88, 89, 90, 101, 102, 103, 106, 107, 110, 112, 113, 114, 121, 122, 124, 125, 127, 128, 129, 132, 135, 137, 138, 139, 142, 143, 144, 145, 147, 148, 149, 152, 153, 154, 155, 157, 158, 159, 161, 168, 170, 171, 172, 173, 175, 178, 180, 186, 187, ...], 'Male': [3, 4, 5, 8, 11, 13, 16, 20, 21, 24, 25, 27, 32, 33, 34, 35, 38, 50, 51, 52, 53, 55, 56, 57, 59, 60, 61, 62, 65, 74, 77, 80, 81, 82, 83, 84, 86, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 104, 105, 108, 109, 111, 115, 116, 117, 118, 119, 120, 123, 126, 130, 131, 133, 134, 136, 140, 141, 146, 150, 151, 156, 160, 162, 163, 164, 165, 166, 167, 169, 174, 176, 177, 179, 181, 182, 183, 184, 185, 189, 190, 192, 193, 194, 195, 196, 198, 204, 205, 206, 209, ...]}

In [48]:
# Iterate over the groups; each iteration yields the group label and its sub-DataFrame.
for names, groups in grouped_gender:
    # One print call with a newline separator emits the label and then the frame,
    # exactly as two consecutive print calls would.
    print(names, groups, sep="\n")

Female
     Gender Economic Status  height  Weight   Age   income
0    Female            Poor   167.0    68.0  48.0  13520.0
1    Female            Poor   162.0    81.0  50.0  12487.0
2    Female    Middle Class   152.0   108.0  42.0  15797.0
6    Female    Middle Class   145.0    47.0  44.0  14408.0
7    Female            Rich   228.0   132.0  34.0  20722.0
..      ...             ...     ...     ...   ...      ...
489  Female    Middle Class   193.0    81.0  14.0  20475.0
490  Female            Rich   168.0   134.0  34.0  23086.0
494  Female            Rich   184.0    93.0  38.0  16985.0
496  Female            Poor   153.0    75.0  41.0  15301.0
499  Female            Poor   165.0   104.0  40.0  18016.0

[232 rows x 6 columns]
Male
    Gender Economic Status  height  Weight   Age   income
3     Male            Poor   170.0    82.0  56.0  11538.0
4     Male            Poor   170.0   128.0  35.0  20875.0
5     Male    Middle Class   155.0    92.0  48.0  19130.0
8     Male            Po

In [51]:
# Another way to get at the information: fetch a single group's rows by its label.
grouped_gender.get_group(name="Male")

Unnamed: 0,Gender,Economic Status,height,Weight,Age,income
3,Male,Poor,170.0,82.0,56.0,11538.0
4,Male,Poor,170.0,128.0,35.0,20875.0
5,Male,Middle Class,155.0,92.0,48.0,19130.0
8,Male,Poor,195.0,75.0,45.0,15487.0
11,Male,Rich,176.0,77.0,58.0,15493.0
...,...,...,...,...,...,...
492,Male,Middle Class,173.0,82.0,28.0,15078.0
493,Male,Middle Class,160.0,70.0,30.0,13477.0
495,Male,Rich,200.0,63.0,37.0,18448.0
497,Male,Middle Class,161.0,73.0,16.0,23216.0


In [54]:
# Fetch the rows of the "Female" group by its label.
grouped_gender.get_group(name="Female")

Unnamed: 0,Gender,Economic Status,height,Weight,Age,income
0,Female,Poor,167.0,68.0,48.0,13520.0
1,Female,Poor,162.0,81.0,50.0,12487.0
2,Female,Middle Class,152.0,108.0,42.0,15797.0
6,Female,Middle Class,145.0,47.0,44.0,14408.0
7,Female,Rich,228.0,132.0,34.0,20722.0
...,...,...,...,...,...,...
489,Female,Middle Class,193.0,81.0,14.0,20475.0
490,Female,Rich,168.0,134.0,34.0,23086.0
494,Female,Rich,184.0,93.0,38.0,16985.0
496,Female,Poor,153.0,75.0,41.0,15301.0


In [55]:
# Group by two factors at once: every (Gender, Economic Status) combination
# present in the data becomes its own group.
double_group = data.groupby(by=["Gender", "Economic Status"])

In [56]:
# Number of groups produced by the two-key grouping.
len(double_group)

6

In [59]:
# Show this grouping: each iteration yields the (Gender, Economic Status) key
# tuple and the matching sub-DataFrame.
for names, groups in double_group:
    # One print call with a newline separator emits the key and then the frame,
    # exactly as two consecutive print calls would.
    print(names, groups, sep="\n")

('Female', 'Middle Class')
     Gender Economic Status  height  Weight   Age   income
2    Female    Middle Class   152.0   108.0  42.0  15797.0
6    Female    Middle Class   145.0    47.0  44.0  14408.0
17   Female    Middle Class   182.0    78.0  48.0  17405.0
18   Female    Middle Class   190.0    55.0  37.0  22593.0
23   Female    Middle Class   185.0   153.0  51.0  21777.0
..      ...             ...     ...     ...   ...      ...
461  Female    Middle Class   171.0    41.0  56.0  17080.0
463  Female    Middle Class   177.0    70.0  23.0  16388.0
470  Female    Middle Class   196.0    76.0  54.0  20879.0
474  Female    Middle Class   145.0    55.0  34.0  20446.0
489  Female    Middle Class   193.0    81.0  14.0  20475.0

[76 rows x 6 columns]
('Female', 'Poor')
     Gender Economic Status  height  Weight   Age   income
0    Female            Poor   167.0    68.0  48.0  13520.0
1    Female            Poor   162.0    81.0  50.0  12487.0
9    Female            Poor   172.0    87.0  4