In [None]:
import numpy as np
import pandas as pd

ИНДЕКСЫ

Если размерность данных > 2, то используют иерархическую индексацию (мультииндекс). В один индекс включается несколько уровней.

In [None]:
# Flat (city, year) labels: every city is paired with both years.
index = [
    (city, year)
    for city in ('city1', 'city2', 'city3')
    for year in (2010, 2020)
]

# One value per label, in the same order as `index`.
population = list(range(101, 107))

pop = pd.Series(population, index=index)

print(pop, end='\n\n')

# With plain tuple labels, picking a year means scanning every label by hand.
labels_2020 = [label for label in pop.index if label[1] == 2020]
print(pop[labels_2020])

MultiIndex

In [None]:
# Promote the tuple labels (defined in an earlier cell) to a two-level MultiIndex
# and re-label the series with it.
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)

print(pop, end='\n\n')

# Full slice on the first level, 2020 on the second level.
print(pop[:, 2020], end='\n\n')

# unstack() moves the inner index level into columns; stack() folds the
# columns back into an index level.
pop_df = pop.unstack()
print(pop_df, end='\n\n')
print(pop_df.stack())

In [None]:
# Three-part labels: (city, year, 1 or 2).
index = [
    (city, year, part)
    for city in ('city1', 'city2')
    for year in (2010, 2020)
    for part in (1, 2)
]

# Values listed in the same order as the labels above.
population = [101, 1010, 201, 2010, 102, 1020, 202, 2020]

pop = pd.Series(population, index=index)
print(pop, end='\n\n')

# Replace the flat tuple labels with a real three-level MultiIndex.
index = pd.MultiIndex.from_tuples(index)

pop = pop.reindex(index)
print(pop, end='\n\n')

# Full slices on the first two levels, value 2 on the third level.
print(pop[:, :, 2])

In [None]:
# Move the innermost index level of `pop` into columns, then fold it back.
pop_df = pop.unstack()
print(pop_df, end='\n\n')
print(pop_df.stack())

In [None]:
# NOTE(review): ('city1', 2010) and ('city2', 2010) each appear twice, so the
# labels are not unique - confirm the duplicates are intentional.
index = [
    ('city1', 2010), ('city1', 2010), ('city1', 2020),
    ('city2', 2010), ('city2', 2010), ('city2', 2020),
]

population = [101, 1010, 201, 2010, 102, 1020]

pop = pd.Series(population, index=index)
print(pop, end='\n\n')

# Two columns over the same labels: the series itself plus a plain list of
# six values (10..15) in row order.
pop_df = pd.DataFrame({'total': pop, 'something': list(range(10, 16))})

print(pop_df, end='\n\n')
print(pop_df['something'])

Как можно создавать мультииндексы?
1. Список массивов, задающих значение индекса на каждом уровне

In [None]:
# One array per level; position k of each array together forms label k.
i1 = pd.MultiIndex.from_arrays([list('aabb'), [1, 2] * 2])

print(i1)

2. Список кортежей, задающих значение индекса в каждой точке

In [None]:
# One tuple per row; each tuple holds the label for every level.
i2 = pd.MultiIndex.from_tuples(
    [(char, num) for char in ('a', 'b') for num in (1, 2)]
)

print(i2)

3. Декартово произведение обычных индексов

In [None]:
# Every combination of the first list with the second list.
i3 = pd.MultiIndex.from_product(iterables=[['a', 'b'], [1, 2]])

print(i3)

4. Описание внутреннего представления: levels, codes

In [None]:
# Direct construction: `levels` lists the distinct labels of each level and
# `codes` gives, per row, the position of its label inside that level.
i4 = pd.MultiIndex(
    levels=[['a', 'b', 'c'], [1, 2]],
    codes=[
        [0, 0, 1, 1, 2, 2],  # first level:  a a b b c c
        [0, 1] * 3,          # second level: 1 2 1 2 1 2
    ],
)

print(i4)

Уровням можно задавать названия

In [None]:
# Tuple keys become the labels of the resulting series.
data = {
    ('city1', 2010): 100, ('city1', 2020): 200,
    ('city2', 2010): 1001, ('city2', 2020): 2001,
}

s = pd.Series(data)
print(s, end='\n\n')

# Name the two index levels; the second print shows them above the labels.
s.index.names = ['city', 'year']
print(s)

In [None]:
# Row labels: every (city, year) combination, with named levels.
index = pd.MultiIndex.from_product(
    [['city1', 'city2'], [2010, 2020]],
    names=['city', 'year'],
)

print(index)

# Column labels: every (worker, job) combination, with named levels.
columns = pd.MultiIndex.from_product(
    [['person1', 'person2', 'person3'], ['job1', 'job2']],
    names=['worker', 'job'],
)

# Fixed seed so the random table is the same on every run.
rng = np.random.default_rng(1)

# One random value per (row label, column label) pair: 4 rows by 6 columns.
data = rng.random((len(index), len(columns)))
print(data, end='\n\n')

data_df = pd.DataFrame(data, index=index, columns=columns)
print(data_df)

Индексация и срезы (по мультииндексу)

In [None]:
data = {
    ('city1', 2010): 100, ('city1', 2020): 200,
    ('city2', 2010): 1001, ('city2', 2020): 2001,
    ('city3', 2010): 10001, ('city3', 2020): 20001,
}

s = pd.Series(data)
print(s, end='\n\n')

s.index.names = ['city', 'year']

# Both levels given: a single value.
print(s['city1', 2010], end='\n\n')

# First level only: the sub-series for that city.
print(s['city1'], end='\n\n')

# Label slice over the first level.
print(s.loc['city1': 'city2'], end='\n\n')

# Boolean mask on the values.
is_large = s > 2000
print(s[is_large])

In [None]:
# Select by a list of labels instead of a single label or a slice.
selected_cities = ['city1', 'city3']
print(s[selected_cities])

Перегруппировка мультииндексов

In [None]:
# Fixed seed so the random values are the same on every run.
rng = np.random.default_rng(1)

# The first level is deliberately listed out of order ('a', 'c', 'b').
index = pd.MultiIndex.from_product(
    [
        ['a', 'c', 'b'],
        [1, 2]
    ]
)

data = pd.Series(rng.random(6), index=index)
data.index.names = ['char', 'int']

print(data)
print()

# Wrong: label slicing needs a sorted MultiIndex. Show the resulting error
# instead of hiding the failing call inside a string literal.
try:
    print(data['a': 'b'])
except KeyError as err:  # pandas raises UnsortedIndexError, a KeyError subclass
    print(f'{type(err).__name__}: {err}')
print()

# After sorting the index the same slice works.
data = data.sort_index()
print(data)
print()
print(data['a': 'b'])

In [None]:
# Three-part labels: (city, year, 1 or 2).
index = [
    (city, year, part)
    for city in ('city1', 'city2')
    for year in (2010, 2020)
    for part in (1, 2)
]

# Values listed in the same order as the labels above.
population = [101, 1010, 201, 2010, 301, 3010, 401, 4010]

pop = pd.Series(population, index=index)
print(pop, end='\n\n')

# Swap the flat tuple labels for a three-level MultiIndex.
i = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(i)

print(pop, end='\n\n')

# With no arguments, unstack() pivots the last index level into columns.
print(pop.unstack())

In [None]:
# Pivot each index level of `pop` into columns in turn (levels 0, 1 and 2),
# printing the three tables separated by blank lines.
print(*(pop.unstack(level=level) for level in (0, 1, 2)), sep='\n\n')

NumPy - конкатенация

In [None]:
# Flat lists are joined end to end into one 1-D array.
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]

print(np.concatenate((x, y, z)), end='\n\n')

# Wrap each list so it becomes a single-row 2-D input.
x, y, z = [x], [y], [z]

# Default axis: rows are stacked one under another.
print(np.concatenate((x, y, z)), end='\n\n')

# axis=1 joins along columns, giving one long row.
print(np.concatenate((x, y, z), axis=1), end='\n\n')

# axis=0 spelled out explicitly: same result as the default call above.
print(np.concatenate((x, y, z), axis=0))

Pandas - concat

In [None]:
# Two series whose labels do not overlap.
ser1 = pd.Series(list('abc'), index=[1, 2, 3])
ser2 = pd.Series(list('def'), index=[4, 5, 6])

print(pd.concat([ser1, ser2]))

# Same first series; the second one now reuses labels 1 and 2.
ser2 = pd.Series(list('def'), index=[1, 2, 6])

# Three ways of concatenating when labels repeat:
#   verify_integrity=False - no uniqueness check (this is the pandas default)
#   ignore_index=True      - discard the labels and renumber the result
#   keys=[...]             - add an outer index level naming each source
pieces = [ser1, ser2]
for options in (
    {'verify_integrity': False},
    {'ignore_index': True},
    {'keys': ['x', 'y']},
):
    print(pd.concat(pieces, **options))