# Agregación de datos por categoría

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Category labels that the synthetic dataset is sampled from.
gender = [
    "Male",
    "Female",
]
income_cat = [
    "Poor",
    "Middle Class",
    "Rich",
]

In [3]:
n = 500  # dataset size (number of rows to generate)
gender_data = []
income_data = []

# Draw one random label per row from each category list.
# The economic-status labels must come from `income_cat`: the name `income`
# is only bound further down the notebook (to a numeric array), so sampling
# from it raises NameError on a fresh kernel or, with stale kernel state,
# silently fills this column with numbers instead of labels.
for i in range(n):
    gender_data.append(np.random.choice(gender))
    income_data.append(np.random.choice(income_cat))


In [8]:
# Preview nine of the sampled gender labels (positions 1 through 9; position 0 is not shown).
gender_data[1:10]

['Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Female',
 'Female',
 'Female',
 'Female']

In [9]:
# Preview nine of the sampled economic-status labels (positions 1 through 9; position 0 is not shown).
income_data[1:10]

['Poor',
 'Middle Class',
 'Middle Class',
 'Rich',
 'Rich',
 'Poor',
 'Rich',
 'Rich',
 'Poor']

In [10]:
# If Z ~ N(0, 1), then m + s * Z ~ N(m, s); np.random.randn(n) draws n values of Z.
height = 160 + 30 * np.random.randn(n)
weight = 65 + 25 * np.random.randn(n)
age = 30 + 12 * np.random.randn(n)
# NOTE(review): income uses np.random.rand (uniform on [0, 1)), not randn, so it
# is uniform on [18000, 21500) rather than normally distributed — confirm intended.
income = 18000 + 3500 * np.random.rand(n)

In [13]:
# Combine the sampled labels and the numeric arrays into a single table,
# one column per variable.
data = pd.DataFrame({
    "Gender": gender_data,
    "Economic Status": income_data,
    "Height": height,
    "Weight": weight,
    "Age": age,
    "Income": income,
})

In [14]:
# Show the first rows of the assembled DataFrame as a quick sanity check.
data.head()

Unnamed: 0,Gender,Economic Status,Height,Weight,Age,Income
0,Female,Rich,179.514357,38.354575,27.422894,18022.267671
1,Male,Poor,165.160125,61.653524,22.039982,18735.300358
2,Male,Middle Class,139.49812,71.300819,32.608946,20930.767807
3,Female,Middle Class,99.42022,80.412329,33.453004,19765.483997
4,Male,Rich,169.419939,87.3171,34.737537,21200.731805


## Agrupación de datos

In [16]:
# Split the rows of `data` by the value of the "Gender" column; the result is a GroupBy object.
grouped_gender = data.groupby("Gender")

In [17]:
# Mapping from each group label to the index labels of the rows that belong to it.
grouped_gender.groups

{'Female': Int64Index([  0,   3,   6,   7,   8,   9,  13,  14,  17,  19,
             ...
             481, 482, 486, 487, 489, 490, 492, 493, 494, 497],
            dtype='int64', length=240),
 'Male': Int64Index([  1,   2,   4,   5,  10,  11,  12,  15,  16,  18,
             ...
             480, 483, 484, 485, 488, 491, 495, 496, 498, 499],
            dtype='int64', length=260)}

In [18]:
# Iterating over a GroupBy yields (group label, sub-DataFrame) pairs;
# print the label followed by its rows, each on its own line.
for names, groups in grouped_gender:
    print(names, groups, sep="\n")

Female
     Gender Economic Status      Height     Weight        Age        Income
0    Female            Rich  179.514357  38.354575  27.422894  18022.267671
3    Female    Middle Class   99.420220  80.412329  33.453004  19765.483997
6    Female            Poor  150.385814  18.179467  41.676108  20086.475976
7    Female            Rich  138.988165  16.344850  44.666182  20046.596771
8    Female            Rich  177.102401  66.615389  15.713560  18406.251472
..      ...             ...         ...        ...        ...           ...
490  Female    Middle Class  137.985144  37.333559  34.927860  20037.299792
492  Female    Middle Class  118.343957  53.336781  13.199206  20552.161183
493  Female            Rich  195.296411  21.848592  35.430546  21414.141753
494  Female            Rich  174.999901  56.001101  54.427746  18089.602089
497  Female            Rich  183.990852  63.617646  46.218600  19053.392233

[240 rows x 6 columns]
Male
    Gender Economic Status      Height     Weight   

In [19]:
# Extract the rows of a single group; the result is a regular DataFrame.
grouped_gender.get_group("Female")

Unnamed: 0,Economic Status,Height,Weight,Age,Income
0,Rich,179.514357,38.354575,27.422894,18022.267671
3,Middle Class,99.420220,80.412329,33.453004,19765.483997
6,Poor,150.385814,18.179467,41.676108,20086.475976
7,Rich,138.988165,16.344850,44.666182,20046.596771
8,Rich,177.102401,66.615389,15.713560,18406.251472
...,...,...,...,...,...
490,Middle Class,137.985144,37.333559,34.927860,20037.299792
492,Middle Class,118.343957,53.336781,13.199206,20552.161183
493,Rich,195.296411,21.848592,35.430546,21414.141753
494,Rich,174.999901,56.001101,54.427746,18089.602089


In [20]:
# Group by more than one category: each group is keyed by a (Gender, Economic Status) pair.
double_group = data.groupby(["Gender", "Economic Status"])

In [21]:
# Number of groups, i.e. distinct (Gender, Economic Status) combinations present in the data.
len(double_group)

6

In [22]:
# With several grouping keys, each iteration yields a tuple of labels and the
# matching sub-DataFrame; print the tuple followed by its rows.
for names, groups in double_group:
    print(names, groups, sep="\n")

('Female', 'Middle Class')
     Gender Economic Status      Height      Weight        Age        Income
3    Female    Middle Class   99.420220   80.412329  33.453004  19765.483997
13   Female    Middle Class  100.645969   49.521381  15.869042  19131.794528
17   Female    Middle Class  165.582249   80.891896  35.645506  19481.166842
20   Female    Middle Class  169.666871   68.661432  39.818922  19496.101575
30   Female    Middle Class  137.155883   81.534657  16.990878  20311.026305
..      ...             ...         ...         ...        ...           ...
457  Female    Middle Class  164.989934   55.524088  44.926800  18765.400769
461  Female    Middle Class  160.925867   77.231771  14.698483  20332.073688
482  Female    Middle Class  158.687032  104.144953  21.400851  19550.695702
490  Female    Middle Class  137.985144   37.333559  34.927860  20037.299792
492  Female    Middle Class  118.343957   53.336781  13.199206  20552.161183

[77 rows x 6 columns]
('Female', 'Poor')
     Ge