In [4]:
import pandas as pd
import numpy as np

### Pandas Series

In [3]:
pd.Series([1, 2, 3])

0    1
1    2
2    3
dtype: int64

In [14]:
arr = pd.Series(np.random.random(10))
print(arr)

0    0.936461
1    0.615898
2    0.310966
3    0.533303
4    0.170425
5    0.540678
6    0.165281
7    0.653145
8    0.702753
9    0.627275
dtype: float64


In [9]:
dados = pd.Series({1: 0.25, 2: 0.50, 3: 0.75, 4: 1.00})
print(dados)

1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64


In [10]:
dados.index

Index([1, 2, 3, 4], dtype='int64')

In [11]:
dados.values

array([0.25, 0.5 , 0.75, 1.  ])

In [13]:
dados[1]

np.float64(0.25)

In [15]:
arr[0]

np.float64(0.9364614021958628)

In [18]:
arr[3:8]

3    0.533303
4    0.170425
5    0.540678
6    0.165281
7    0.653145
dtype: float64

In [19]:
dados = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(dados)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [21]:
dados['a']

np.float64(0.25)

In [22]:
populacao_dict = {
    'California': 39538223, 'Texas': 29145505,
    'Florida': 21538187, 'New York': 20201249, 'Pennsylvania': 13002700
}
populacao = pd.Series(populacao_dict)
print(populacao)

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64


In [23]:
populacao['Texas']

np.int64(29145505)

In [24]:
populacao['Florida':'New York']

Florida     21538187
New York    20201249
dtype: int64

In [25]:
populacao['California':'New York']

California    39538223
Texas         29145505
Florida       21538187
New York      20201249
dtype: int64

In [26]:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [27]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[1, 2])

1    b
2    a
dtype: object

### Pandas Dataframe

In [28]:
populacao

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [29]:
area_dict = {
    'California': 423967, 'Texas': 695662,
    'Florida': 170312, 'New York': 141297,
    'Pennsylvania': 119280
}
area = pd.Series(area_dict)
print(area)

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64


In [30]:
estados = pd.DataFrame({'populacao': populacao, 'area': area})

In [31]:
estados

Unnamed: 0,populacao,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [32]:
estados.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [33]:
estados.values

array([[39538223,   423967],
       [29145505,   695662],
       [21538187,   170312],
       [20201249,   141297],
       [13002700,   119280]])

In [34]:
estados.columns

Index(['populacao', 'area'], dtype='object')

In [35]:
estados['area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [36]:
estados.to_dict(orient='list')

{'populacao': [39538223, 29145505, 21538187, 20201249, 13002700],
 'area': [423967, 695662, 170312, 141297, 119280]}

In [37]:
estados.to_dict(orient='split')

{'index': ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'],
 'columns': ['populacao', 'area'],
 'data': [[39538223, 423967],
  [29145505, 695662],
  [21538187, 170312],
  [20201249, 141297],
  [13002700, 119280]]}

In [39]:
pd.DataFrame(populacao, columns=['populacao'])

Unnamed: 0,populacao
California,39538223
Texas,29145505
Florida,21538187
New York,20201249
Pennsylvania,13002700


In [40]:
pd.DataFrame(np.random.rand(3, 2), 
columns=['foo', 'bar'], 
index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.084125,0.456874
b,0.084112,0.713455
c,0.086879,0.614043


In [41]:
pd.DataFrame(
    np.zeros(3, 
    dtype=[
        ('A', 'i8'), 
        ('B', 'f8')
    ]
))


Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### Pandas Index

In [42]:
a_index = pd.Index([2, 3, 4, 5])
print(a_index)

Index([2, 3, 4, 5], dtype='int64')


In [43]:
a_index[2]

np.int64(4)

In [44]:
a_index[1] = 99

TypeError: Index does not support mutable operations

In [45]:
b_index = pd.Index([1, 2, 3])

In [46]:
a_index.intersection(b_index)

Index([2, 3], dtype='int64')

In [47]:
arr = np.array([[0, 3, 6, 5, 7],
 		[7, 1, 8, 8, 8],
 		[4, 6, 0, 5, 5]])

In [48]:
arr[2, 1]

np.int64(6)

In [49]:
arr[1, [0, 4]]

array([7, 8])

In [50]:
populacao

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [51]:
populacao.keys()

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [53]:
list(populacao.items())

[('California', 39538223),
 ('Texas', 29145505),
 ('Florida', 21538187),
 ('New York', 20201249),
 ('Pennsylvania', 13002700)]

In [54]:
populacao['Texas'] = 29145500

In [55]:
populacao

California      39538223
Texas           29145500
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [56]:
dados = pd.Series({1: 0.25, 2: 0.50, 3: 0.75, 4: 1.00})
print(dados)

1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64


In [58]:
dados.loc[2]

np.float64(0.5)

In [59]:
dados.iloc[2]

np.float64(0.75)

In [60]:
dados.loc[1:3]

1    0.25
2    0.50
3    0.75
dtype: float64

In [62]:
dados

1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [61]:
dados.iloc[1:3]

2    0.50
3    0.75
dtype: float64

In [63]:
populacao

California      39538223
Texas           29145500
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [64]:
populacao.loc['California']

np.int64(39538223)

In [65]:
populacao.iloc[0]

np.int64(39538223)

In [66]:
populacao.loc[0]

KeyError: 0

In [69]:
serie = pd.Series([0, 7, 6, 4], index=['a', 'b', 'c', 'd'])
print(serie)

a    0
b    7
c    6
d    4
dtype: int64


In [70]:
np.exp(serie)

a       1.000000
b    1096.633158
c     403.428793
d      54.598150
dtype: float64

In [71]:
tabela = pd.DataFrame([[4, 8, 0, 6], 
                       [2, 0, 5, 9], 
                       [7, 7, 7, 7]],
         columns=['A', 'B', 'C', 'D'])
print(tabela)

   A  B  C  D
0  4  8  0  6
1  2  0  5  9
2  7  7  7  7


In [72]:
np.exp(tabela)

Unnamed: 0,A,B,C,D
0,54.59815,2980.957987,1.0,403.428793
1,7.389056,1.0,148.413159,8103.083928
2,1096.633158,1096.633158,1096.633158,1096.633158


In [79]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A)
print(B)

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


In [80]:
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [82]:
A.add(B, fill_value = 0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [83]:
A = pd.DataFrame([[10, 2], [16, 9]],
         columns=['a', 'b'])
print(A)

    a  b
0  10  2
1  16  9


In [89]:
B = pd.DataFrame([[5, 0, 1], 
                  [9, 7, 6], 
                  [4, 8, 5]],
         columns=['a', 'b', 'c'])
print(B)

   a  b  c
0  5  0  1
1  9  7  6
2  4  8  5


In [90]:
A + B

Unnamed: 0,a,b,c
0,15.0,2.0,
1,25.0,16.0,
2,,,


In [91]:
A.add(B, fill_value = 1)

Unnamed: 0,a,b,c
0,15.0,2.0,2.0
1,25.0,16.0,7.0
2,5.0,9.0,6.0


In [92]:
A / B

Unnamed: 0,a,b,c
0,2.0,inf,
1,1.777778,1.285714,
2,,,


In [94]:
A.divide(B, fill_value = 0)

Unnamed: 0,a,b,c
0,2.0,inf,0.0
1,1.777778,1.285714,0.0
2,0.0,0.0,0.0


In [96]:
A = np.array([[4, 4, 2, 0],
              [5, 8, 0, 8],
              [8, 2, 6, 1]])

df = pd.DataFrame(A, columns=['Q', 'R', 'S', 'T'])
print(df)

   Q  R  S  T
0  4  4  2  0
1  5  8  0  8
2  8  2  6  1


In [98]:
df.iloc[0].shape

(4,)

In [99]:
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,1,4,-2,8
2,4,-2,4,1


In [100]:
df.sub(df.iloc[0])

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,1,4,-2,8
2,4,-2,4,1


In [101]:
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,0,0,-2,-4
1,-3,0,-8,0
2,6,0,4,-1


In [102]:
A

array([[4, 4, 2, 0],
       [5, 8, 0, 8],
       [8, 2, 6, 1]])

In [103]:
df.iloc[0, ::2]

Q    4
S    2
Name: 0, dtype: int64

In [104]:
df - df.iloc[0, ::2]

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,1.0,,-2.0,
2,4.0,,4.0,


In [113]:
df.subtract(df.iloc[0, ::2], axis=1)

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,1.0,,-2.0,
2,4.0,,4.0,


In [114]:
type(None)

NoneType

In [115]:
isinstance(None, object)

True

In [124]:
valores1 = np.array([1, None, 2, 3])

In [125]:
valores1.dtype

dtype('O')

In [126]:
%timeit np.arange(1E6, dtype=int).sum()

1.98 ms ± 15.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [127]:
%timeit np.arange(1E6, dtype=object).sum()

34.1 ms ± 277 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [128]:
valores1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [129]:
valores2 = np.array([1, 2, 3])

In [130]:
valores2.sum()

np.int64(6)

In [133]:
valores1 = np.array([1, np.nan, 2, 3])
valores1

array([ 1., nan,  2.,  3.])

In [135]:
valores1.sum()

np.float64(nan)

In [137]:
np.nansum(valores1)

np.float64(6.0)

In [138]:
np.array([1, None, 3], dtype=float)

array([ 1., nan,  3.])

In [139]:
np.array(["a", np.nan, "c"])

array(['a', 'nan', 'c'], dtype='<U32')

In [140]:
np.array(["a", None, "c"])

array(['a', None, 'c'], dtype=object)

In [142]:
import numpy.ma as ma
masked = ma.array([1, 6, 3], mask =[0, 1, 0])
masked

masked_array(data=[1, --, 3],
             mask=[False,  True, False],
       fill_value=999999)

In [144]:
pd.Series([1, np.nan, 2, None, pd.NA], dtype='Int32')

0       1
1    <NA>
2       2
3    <NA>
dtype: Int32

In [145]:
pd.Series(["2025-01-01", pd.NaT, "2025-08-08", None], dtype="datetime64[ns]")

0   2025-01-01
1          NaT
2   2025-08-08
3          NaT
dtype: datetime64[ns]

In [150]:
series = pd.Series([1, np.nan, 2, None, pd.NA], dtype='Int32')
mask = series.isnull()
series[mask]

1    <NA>
3    <NA>
4    <NA>
dtype: Int32

In [151]:
mask = series.notnull()
series[mask]

0    1
2    2
dtype: Int32

In [152]:
series.dropna()

0    1
2    2
dtype: Int32

In [153]:
series

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [154]:
dados_df = pd.DataFrame([[1     , np.nan, 2], 
                         [2     , 3     , 5],
                         [np.nan, 4     , 6]])
dados_df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [155]:
dados_df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [None]:
dados_df[3] = np.nan

In [157]:
dados_df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [160]:
dados_df.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [165]:
dados_df.dropna(thresh=2, axis=1)

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [166]:
series

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [169]:
series.fillna(series.sum())

0    1
1    3
2    2
3    3
4    3
dtype: Int32

In [170]:
series.ffill()

0    1
1    1
2    2
3    2
4    2
dtype: Int32

In [171]:
series.bfill()

0       1
1       2
2       2
3    <NA>
4    <NA>
dtype: Int32

In [172]:
dados_df = pd.DataFrame([[1     , np.nan, 2, np.nan], 
                         [2     , 3     , 5, np.nan],
                         [np.nan, 4     , 6, np.nan]])
print(dados_df)

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN


In [173]:
dados_df.fillna(10)

Unnamed: 0,0,1,2,3
0,1.0,10.0,2,10.0
1,2.0,3.0,5,10.0
2,10.0,4.0,6,10.0


In [176]:
dados_df.bfill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,2.0,
1,2.0,3.0,5.0,
2,4.0,4.0,6.0,


In [177]:
dados_df = pd.DataFrame({
    'nome': ['Ana', 'Ana', 'Ana', 'Bruno', 'Bruno', 'Carlos', 'Carlos'],
    'idade': [20, 20, 21, 21, 22, 23, 23],
    'cidade': ['SP', 'SP', 'RJ', 'RJ', 'SP', 'MG', 'MG']
})
dados_df

Unnamed: 0,nome,idade,cidade
0,Ana,20,SP
1,Ana,20,SP
2,Ana,21,RJ
3,Bruno,21,RJ
4,Bruno,22,SP
5,Carlos,23,MG
6,Carlos,23,MG


In [178]:
dados_df.drop_duplicates()

Unnamed: 0,nome,idade,cidade
0,Ana,20,SP
2,Ana,21,RJ
3,Bruno,21,RJ
4,Bruno,22,SP
5,Carlos,23,MG


In [179]:
dados_df.drop_duplicates(subset=['nome'])

Unnamed: 0,nome,idade,cidade
0,Ana,20,SP
3,Bruno,21,RJ
5,Carlos,23,MG


In [181]:
dados_df.drop_duplicates(subset=['nome', 'cidade'])

Unnamed: 0,nome,idade,cidade
0,Ana,20,SP
2,Ana,21,RJ
3,Bruno,21,RJ
4,Bruno,22,SP
5,Carlos,23,MG


In [180]:
dados_df.drop_duplicates(subset=['nome'], keep='last')

Unnamed: 0,nome,idade,cidade
2,Ana,21,RJ
4,Bruno,22,SP
6,Carlos,23,MG


In [188]:
series.unique()

<IntegerArray>
[1, <NA>, 2]
Length: 3, dtype: Int32

In [189]:
dados_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   nome    7 non-null      object
 1   idade   7 non-null      int64 
 2   cidade  7 non-null      object
dtypes: int64(1), object(2)
memory usage: 300.0+ bytes


In [190]:
dados_df.columns

Index(['nome', 'idade', 'cidade'], dtype='object')

In [191]:
dados_df

Unnamed: 0,nome,idade,cidade
0,Ana,20,SP
1,Ana,20,SP
2,Ana,21,RJ
3,Bruno,21,RJ
4,Bruno,22,SP
5,Carlos,23,MG
6,Carlos,23,MG


In [193]:
dados_df.query("idade > 21")

Unnamed: 0,nome,idade,cidade
4,Bruno,22,SP
5,Carlos,23,MG
6,Carlos,23,MG


In [195]:
pd.MultiIndex.from_arrays(
    [['a', 'a', 'b', 'b'],
			  [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [196]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), 
			  ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [199]:
multiindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

In [198]:
dados = np.array([[18, 20], 
                  [28, 18.6],
                  [25, 20],
	          [18, 19.6]])
dados

array([[18. , 20. ],
       [28. , 18.6],
       [25. , 20. ],
       [18. , 19.6]])

In [200]:
dados_df = pd.DataFrame(dados, index = multiindex, columns=['data1', 'data2'])

In [201]:
dados_df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,18.0,20.0
a,2,28.0,18.6
b,1,25.0,20.0
b,2,18.0,19.6


In [204]:
dados_df.index.names = ['teste1', 'teste2']

In [205]:
dados_df

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
teste1,teste2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,18.0,20.0
a,2,28.0,18.6
b,1,25.0,20.0
b,2,18.0,19.6


In [206]:
dados = np.array([[18, 20,   28, 18.5, 20, 21.1], 
                  [28, 18.6, 35, 20.2, 23, 19.6],
                  [25, 20,   32, 18.6, 22, 20.4],
	          [18, 19.6, 16, 20.2, 20, 21  ]])
dados

array([[18. , 20. , 28. , 18.5, 20. , 21.1],
       [28. , 18.6, 35. , 20.2, 23. , 19.6],
       [25. , 20. , 32. , 18.6, 22. , 20.4],
       [18. , 19.6, 16. , 20.2, 20. , 21. ]])

In [207]:
indices = pd.MultiIndex.from_product([[2024, 2025], ['A', 'B']], 
				names = ['ano', 'teste'])

In [208]:
indices

MultiIndex([(2024, 'A'),
            (2024, 'B'),
            (2025, 'A'),
            (2025, 'B')],
           names=['ano', 'teste'])

In [209]:
colunas = pd.MultiIndex.from_product(
    [['Floresta', 'Caatinga', 'Cerrado'], ['Folha', 'Raiz']], 
	names = ['habitat', 'estrutura']
    )

In [210]:
dados_df = pd.DataFrame(dados, index=indices, columns = colunas)

In [211]:
dados_df

Unnamed: 0_level_0,habitat,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
Unnamed: 0_level_1,estrutura,Folha,Raiz,Folha,Raiz,Folha,Raiz
ano,teste,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2024,A,18.0,20.0,28.0,18.5,20.0,21.1
2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2025,A,25.0,20.0,32.0,18.6,22.0,20.4
2025,B,18.0,19.6,16.0,20.2,20.0,21.0


In [212]:
dados_ser = pd.Series([18, 28, 25, 18], index = indices)

In [213]:
dados_ser

ano   teste
2024  A        18
      B        28
2025  A        25
      B        18
dtype: int64

In [216]:
dados_ser[2024, 'A']

np.int64(18)

In [217]:
dados_ser[2024]

teste
A    18
B    28
dtype: int64

In [219]:
dados_ser[:, 'A']

ano
2024    18
2025    25
dtype: int64

In [220]:
dados_ser > 20

ano   teste
2024  A        False
      B         True
2025  A         True
      B        False
dtype: bool

In [221]:
dados_ser[dados_ser > 20]

ano   teste
2024  B        28
2025  A        25
dtype: int64

In [222]:
dados_df

Unnamed: 0_level_0,habitat,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
Unnamed: 0_level_1,estrutura,Folha,Raiz,Folha,Raiz,Folha,Raiz
ano,teste,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2024,A,18.0,20.0,28.0,18.5,20.0,21.1
2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2025,A,25.0,20.0,32.0,18.6,22.0,20.4
2025,B,18.0,19.6,16.0,20.2,20.0,21.0


In [224]:
dados_df['Floresta', 'Folha']

ano   teste
2024  A        18.0
      B        28.0
2025  A        25.0
      B        18.0
Name: (Floresta, Folha), dtype: float64

In [225]:
dados_df.loc[(2025, 'B'), 'Cerrado']

estrutura
Folha    20.0
Raiz     21.0
Name: (2025, B), dtype: float64

In [226]:
dados_ser

ano   teste
2024  A        18
      B        28
2025  A        25
      B        18
dtype: int64

In [227]:
indice = pd.MultiIndex.from_product([['A', 'C', 'B'],  				    [1, 2]])
dados = pd.Series(np.random.randint(1, 10, 6), index=indice)
dados.index.names = ['char', 'int']

In [228]:
dados

char  int
A     1      6
      2      1
C     1      2
      2      3
B     1      4
      2      2
dtype: int32

In [229]:
dados['A':'B']

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [230]:
dados = dados.sort_index()

In [231]:
dados

char  int
A     1      6
      2      1
B     1      4
      2      2
C     1      2
      2      3
dtype: int32

In [232]:
dados['A':'B']

char  int
A     1      6
      2      1
B     1      4
      2      2
dtype: int32

In [233]:
dados_df

Unnamed: 0_level_0,habitat,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
Unnamed: 0_level_1,estrutura,Folha,Raiz,Folha,Raiz,Folha,Raiz
ano,teste,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2024,A,18.0,20.0,28.0,18.5,20.0,21.1
2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2025,A,25.0,20.0,32.0,18.6,22.0,20.4
2025,B,18.0,19.6,16.0,20.2,20.0,21.0


In [235]:
dados_df.stack(future_stack = True)

Unnamed: 0_level_0,Unnamed: 1_level_0,habitat,Floresta,Caatinga,Cerrado
ano,teste,estrutura,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024,A,Folha,18.0,28.0,20.0
2024,A,Raiz,20.0,18.5,21.1
2024,B,Folha,28.0,35.0,23.0
2024,B,Raiz,18.6,20.2,19.6
2025,A,Folha,25.0,32.0,22.0
2025,A,Raiz,20.0,18.6,20.4
2025,B,Folha,18.0,16.0,20.0
2025,B,Raiz,19.6,20.2,21.0


In [236]:
dados_df.stack(level='habitat', future_stack = True)

Unnamed: 0_level_0,Unnamed: 1_level_0,estrutura,Folha,Raiz
ano,teste,habitat,Unnamed: 3_level_1,Unnamed: 4_level_1
2024,A,Floresta,18.0,20.0
2024,A,Caatinga,28.0,18.5
2024,A,Cerrado,20.0,21.1
2024,B,Floresta,28.0,18.6
2024,B,Caatinga,35.0,20.2
2024,B,Cerrado,23.0,19.6
2025,A,Floresta,25.0,20.0
2025,A,Caatinga,32.0,18.6
2025,A,Cerrado,22.0,20.4
2025,B,Floresta,18.0,19.6


In [237]:
dados_df.unstack(level='ano')

habitat,Floresta,Floresta,Floresta,Floresta,Caatinga,Caatinga,Caatinga,Caatinga,Cerrado,Cerrado,Cerrado,Cerrado
estrutura,Folha,Folha,Raiz,Raiz,Folha,Folha,Raiz,Raiz,Folha,Folha,Raiz,Raiz
ano,2024,2025,2024,2025,2024,2025,2024,2025,2024,2025,2024,2025
teste,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
A,18.0,25.0,20.0,20.0,28.0,32.0,18.5,18.6,20.0,22.0,21.1,20.4
B,28.0,18.0,18.6,19.6,35.0,16.0,20.2,20.2,23.0,20.0,19.6,21.0


In [238]:
dados_df.reset_index()

habitat,ano,teste,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
estrutura,Unnamed: 1_level_1,Unnamed: 2_level_1,Folha,Raiz,Folha,Raiz,Folha,Raiz
0,2024,A,18.0,20.0,28.0,18.5,20.0,21.1
1,2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2,2025,A,25.0,20.0,32.0,18.6,22.0,20.4
3,2025,B,18.0,19.6,16.0,20.2,20.0,21.0


In [240]:
dados_df.T.reset_index()

ano,habitat,estrutura,2024,2024,2025,2025
teste,Unnamed: 1_level_1,Unnamed: 2_level_1,A,B,A,B
0,Floresta,Folha,18.0,28.0,25.0,18.0
1,Floresta,Raiz,20.0,18.6,20.0,19.6
2,Caatinga,Folha,28.0,35.0,32.0,16.0
3,Caatinga,Raiz,18.5,20.2,18.6,20.2
4,Cerrado,Folha,20.0,23.0,22.0,20.0
5,Cerrado,Raiz,21.1,19.6,20.4,21.0


In [241]:
dados_df

Unnamed: 0_level_0,habitat,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
Unnamed: 0_level_1,estrutura,Folha,Raiz,Folha,Raiz,Folha,Raiz
ano,teste,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2024,A,18.0,20.0,28.0,18.5,20.0,21.1
2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2025,A,25.0,20.0,32.0,18.6,22.0,20.4
2025,B,18.0,19.6,16.0,20.2,20.0,21.0


In [245]:
dados_df.reset_index(level='ano', drop=True)

habitat,ano,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
estrutura,Unnamed: 1_level_1,Folha,Raiz,Folha,Raiz,Folha,Raiz
teste,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A,2024,18.0,20.0,28.0,18.5,20.0,21.1
B,2024,28.0,18.6,35.0,20.2,23.0,19.6
A,2025,25.0,20.0,32.0,18.6,22.0,20.4
B,2025,18.0,19.6,16.0,20.2,20.0,21.0


In [247]:
dados_df.columns.to_flat_index()

Index([('Floresta', 'Folha'),  ('Floresta', 'Raiz'), ('Caatinga', 'Folha'),
        ('Caatinga', 'Raiz'),  ('Cerrado', 'Folha'),   ('Cerrado', 'Raiz')],
      dtype='object')

In [248]:
['_'.join(col) for col in dados_df.columns.to_flat_index()]


['Floresta_Folha',
 'Floresta_Raiz',
 'Caatinga_Folha',
 'Caatinga_Raiz',
 'Cerrado_Folha',
 'Cerrado_Raiz']

In [249]:
dados_df

Unnamed: 0_level_0,habitat,Floresta,Floresta,Caatinga,Caatinga,Cerrado,Cerrado
Unnamed: 0_level_1,estrutura,Folha,Raiz,Folha,Raiz,Folha,Raiz
ano,teste,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2024,A,18.0,20.0,28.0,18.5,20.0,21.1
2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2025,A,25.0,20.0,32.0,18.6,22.0,20.4
2025,B,18.0,19.6,16.0,20.2,20.0,21.0


In [252]:
dados_df.columns = ['_'.join(col) for col in dados_df.columns.to_flat_index()]

In [253]:
dados_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Floresta_Folha,Floresta_Raiz,Caatinga_Folha,Caatinga_Raiz,Cerrado_Folha,Cerrado_Raiz
ano,teste,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024,A,18.0,20.0,28.0,18.5,20.0,21.1
2024,B,28.0,18.6,35.0,20.2,23.0,19.6
2025,A,25.0,20.0,32.0,18.6,22.0,20.4
2025,B,18.0,19.6,16.0,20.2,20.0,21.0


##### concat()

In [2]:
import pandas as pd

In [3]:
df1 = pd.DataFrame({
    'id': [0, 1, 2],
    'nome': ['Ana', 'Maria', 'Joao'],
    'idade': [20, 21, 22]
})

df2 = pd.DataFrame({
    'id': [3, 4, 5],
    'nome': ['AnaMaria', 'Carlos', 'Jose'],
    'idade': [21, 21, 20]
})

In [4]:
df1

Unnamed: 0,id,nome,idade
0,0,Ana,20
1,1,Maria,21
2,2,Joao,22


In [5]:
df2

Unnamed: 0,id,nome,idade
0,3,AnaMaria,21
1,4,Carlos,21
2,5,Jose,20


In [7]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,id,nome,idade
0,0,Ana,20
1,1,Maria,21
2,2,Joao,22
3,3,AnaMaria,21
4,4,Carlos,21
5,5,Jose,20


In [9]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,id,nome,idade,id.1,nome.1,idade.1
0,0,Ana,20,3,AnaMaria,21
1,1,Maria,21,4,Carlos,21
2,2,Joao,22,5,Jose,20


#### merge()

In [10]:
df_notas = pd.DataFrame({
    'id': [1, 2, 4],
    'nota': [8.5, 7.0, 9.5]
})
df_notas

Unnamed: 0,id,nota
0,1,8.5
1,2,7.0
2,4,9.5


In [11]:
df1

Unnamed: 0,id,nome,idade
0,0,Ana,20
1,1,Maria,21
2,2,Joao,22


In [14]:
df1.merge(df_notas, on='id', how='inner')

Unnamed: 0,id,nome,idade,nota
0,1,Maria,21,8.5
1,2,Joao,22,7.0


In [15]:
df1.merge(df_notas, on='id', how='outer')

Unnamed: 0,id,nome,idade,nota
0,0,Ana,20.0,
1,1,Maria,21.0,8.5
2,2,Joao,22.0,7.0
3,4,,,9.5


In [16]:
df1.merge(df_notas, on='id', how='left')

Unnamed: 0,id,nome,idade,nota
0,0,Ana,20,
1,1,Maria,21,8.5
2,2,Joao,22,7.0


In [17]:
df1.merge(df_notas, on='id', how='right')

Unnamed: 0,id,nome,idade,nota
0,1,Maria,21.0,8.5
1,2,Joao,22.0,7.0
2,4,,,9.5


In [18]:
df_notas = pd.DataFrame({
    'id_aluno': [1, 2, 4],
    'nota': [8.5, 7.0, 9.5]
})
df_notas

Unnamed: 0,id_aluno,nota
0,1,8.5
1,2,7.0
2,4,9.5


In [20]:
df1.merge(df_notas, left_on='id', right_on='id_aluno')

Unnamed: 0,id,nome,idade,id_aluno,nota
0,1,Maria,21,1,8.5
1,2,Joao,22,2,7.0


In [22]:
df1

Unnamed: 0,id,nome,idade
0,0,Ana,20
1,1,Maria,21
2,2,Joao,22


In [23]:
df_notas

Unnamed: 0,id_aluno,nota
0,1,8.5
1,2,7.0
2,4,9.5


In [24]:
df1.join(df_notas, how='outer')

Unnamed: 0,id,nome,idade,id_aluno,nota
0,0,Ana,20,1,8.5
1,1,Maria,21,2,7.0
2,2,Joao,22,4,9.5


In [25]:
df1 = pd.DataFrame({
    'id': [0, 1, 2],
    'nome': ['Ana', 'Maria', 'Joao'],
    'idade': [20, 21, 22]
}, index=['a', 'b', 'c'])

df2 = pd.DataFrame({
    'id': [3, 4, 5],
    'nome': ['AnaMaria', 'Carlos', 'Jose'],
    'idade': [21, 21, 20]
}, index=['b', 'c', 'd'])

In [26]:
df1

Unnamed: 0,id,nome,idade
a,0,Ana,20
b,1,Maria,21
c,2,Joao,22


In [27]:
df2

Unnamed: 0,id,nome,idade
b,3,AnaMaria,21
c,4,Carlos,21
d,5,Jose,20


In [33]:
df1.join(df2, how='right', lsuffix='x',  rsuffix='y')

Unnamed: 0,idx,nomex,idadex,idy,nomey,idadey
b,1.0,Maria,21.0,3,AnaMaria,21
c,2.0,Joao,22.0,4,Carlos,21
d,,,,5,Jose,20


#### Importar dados

In [37]:
populacao = pd.read_csv('dados/state-population.csv')
areas = pd.read_csv('dados/state-areas.csv')
abreviacoes = pd.read_csv('dados/state-abbrevs.csv')

In [36]:
populacao.head(2)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0


In [42]:
populacao.describe(include='all')

Unnamed: 0,state/region,ages,year,population
count,2544,2544,2544.0,2524.0
unique,53,2,,
top,AL,under18,,
freq,48,1272,,
mean,,,2001.5,6805558.0
std,,,6.923547,28550140.0
min,,,1990.0,101309.0
25%,,,1995.75,742380.5
50%,,,2001.5,1597005.0
75%,,,2007.25,4547104.0


In [43]:
populacao.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2544 entries, 0 to 2543
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state/region  2544 non-null   object 
 1   ages          2544 non-null   object 
 2   year          2544 non-null   int64  
 3   population    2524 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 79.6+ KB


In [45]:
populacao.tail(2)

Unnamed: 0,state/region,ages,year,population
2542,USA,under18,2012,73708179.0
2543,USA,total,2012,313873685.0


In [38]:
areas.head(2)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425


In [39]:
abreviacoes.head(2)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK


In [50]:
dados = pd.merge(populacao, abreviacoes, how='outer', left_on='state/region', right_on='abbreviation')

In [51]:
dados.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AK,total,1990,553290.0,Alaska,AK
1,AK,under18,1990,177502.0,Alaska,AK
2,AK,total,1992,588736.0,Alaska,AK
3,AK,under18,1991,182180.0,Alaska,AK
4,AK,under18,1992,184878.0,Alaska,AK


In [53]:
# dados = dados.drop('abbreviation', axis=1)
dados.drop('abbreviation', axis=1, inplace=True)

In [54]:
dados.head()

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska


In [59]:
dados.isnull().sum()

state/region     0
ages             0
year             0
population      20
state           96
dtype: int64

In [64]:
dados[dados['population'].isnull()].head()

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,
1873,PR,total,1990,,
1874,PR,total,1991,,
1875,PR,under18,1991,,
1876,PR,total,1993,,


In [68]:
dados.loc[dados['state'].isnull(), 'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [70]:
dados.loc[dados['state/region'] == 'PR', 'state'] = 'Porto Rico'

In [74]:
dados.loc[dados['state/region'] == 'USA', 'state'] = 'USA'

In [None]:
dados[dados['state/region'] == 'PR'].head()

Unnamed: 0,state/region,ages,year,population,state
1872,PR,under18,1990,,Porto Rico
1873,PR,total,1990,,Porto Rico
1874,PR,total,1991,,Porto Rico
1875,PR,under18,1991,,Porto Rico
1876,PR,total,1993,,Porto Rico


In [75]:
dados.isnull().sum()

state/region     0
ages             0
year             0
population      20
state            0
dtype: int64

In [80]:
dados.dropna(inplace=True)

In [82]:
areas.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [83]:
dados

Unnamed: 0,state/region,ages,year,population,state
0,AK,total,1990,553290.0,Alaska
1,AK,under18,1990,177502.0,Alaska
2,AK,total,1992,588736.0,Alaska
3,AK,under18,1991,182180.0,Alaska
4,AK,under18,1992,184878.0,Alaska
...,...,...,...,...,...
2539,WY,under18,1993,137458.0,Wyoming
2540,WY,total,1991,459260.0,Wyoming
2541,WY,under18,1991,136720.0,Wyoming
2542,WY,under18,1990,136078.0,Wyoming


In [81]:
dados.isnull().sum()

state/region    0
ages            0
year            0
population      0
state           0
dtype: int64

In [85]:
dados_areas = pd.merge(dados, areas, on='state', how='left')

In [86]:
dados_areas.describe()

Unnamed: 0,year,population,area (sq. mi)
count,2524.0,2524.0,2448.0
mean,2001.555468,6805558.0,74252.627451
std,6.917991,28550140.0,94929.655186
min,1990.0,101309.0,68.0
25%,1996.0,742380.5,35387.0
50%,2002.0,1597005.0,56276.0
75%,2008.0,4547104.0,84904.0
max,2013.0,316128800.0,656425.0


In [87]:
dados_areas.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [None]:
dados2010_total = dados_areas.query("year == 2010 & ages == 'total'")
dados2010_total.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
43,AK,total,2010,713868.0,Alaska,656425.0
51,AL,total,2010,4785570.0,Alabama,52423.0
141,AR,total,2010,2922280.0,Arkansas,53182.0
149,AZ,total,2010,6408790.0,Arizona,114006.0
197,CA,total,2010,37333601.0,California,163707.0


In [92]:
dados2010_total.set_index('state', inplace=True)

In [93]:
dados2010_total.head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi)
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,AK,total,2010,713868.0,656425.0
Alabama,AL,total,2010,4785570.0,52423.0
Arkansas,AR,total,2010,2922280.0,53182.0
Arizona,AZ,total,2010,6408790.0,114006.0
California,CA,total,2010,37333601.0,163707.0


In [94]:
densidade = dados2010_total['population'] / dados2010_total['area (sq. mi)']

In [95]:
densidade.head()

state
Alaska          1.087509
Alabama        91.287603
Arkansas       54.948667
Arizona        56.214497
California    228.051342
dtype: float64

In [96]:
densidade.sort_values(ascending=False, inplace=True)
densidade.head()

state
District of Columbia    8898.897059
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
Massachusetts            621.815538
dtype: float64

In [97]:
densidade.tail()

state
Montana       6.736171
Wyoming       5.768079
Alaska        1.087509
Porto Rico         NaN
USA                NaN
dtype: float64

In [98]:
dados_areas.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [None]:
group_population = dados_areas.groupby('year')['population']

In [104]:
pop_total_ano = group_population.sum()

In [106]:
pop_total_ano.sort_values(ascending=False)

year
2013    783858576.0
2012    779657013.0
2011    775525479.0
2010    771509855.0
2009    766472596.0
2008    761103707.0
2007    755257832.0
2006    749079009.0
2005    742921345.0
2004    737068863.0
2003    731294092.0
2002    726012052.0
2001    720176600.0
2000    713976868.0
1999    701972464.0
1998    694571044.0
1997    687135340.0
1996    679255606.0
1995    671503086.0
1994    663533524.0
1993    655027066.0
1992    646046816.0
1991    636587920.0
1990    627682652.0
Name: population, dtype: float64

In [109]:
dados_pivot = dados_areas.pivot_table(index='year', columns='state', values='population', aggfunc='sum')

In [114]:
dados_stacked = dados_pivot.stack()
print(dados_stacked)

year  state        
1990  Alabama           5100096.0
      Alaska             730792.0
      Arizona           4690137.0
      Arkansas          2977519.0
      California       37940016.0
                          ...    
2013  Virginia         10124940.0
      Washington        8567201.0
      West Virginia     2235982.0
      Wisconsin         7050489.0
      Wyoming            720337.0
Length: 1262, dtype: float64


In [115]:
dados_areas['ages'].unique()

array(['total', 'under18'], dtype=object)

In [117]:
pd.get_dummies(dados_areas, columns=['ages']).head()

Unnamed: 0,state/region,year,population,state,area (sq. mi),ages_total,ages_under18
0,AK,1990,553290.0,Alaska,656425.0,True,False
1,AK,1990,177502.0,Alaska,656425.0,False,True
2,AK,1992,588736.0,Alaska,656425.0,True,False
3,AK,1991,182180.0,Alaska,656425.0,False,True
4,AK,1992,184878.0,Alaska,656425.0,False,True


In [118]:
df_list = pd.DataFrame({
    'Aluno': ['Ana', 'Bruno'],
    'Disciplinas': [['Matematica', 'Portugues'], ['Matematica']]
})

df_list

Unnamed: 0,Aluno,Disciplinas
0,Ana,"[Matematica, Portugues]"
1,Bruno,[Matematica]


In [120]:
dados_exploded = df_list.explode('Disciplinas')

In [122]:
dados_exploded

Unnamed: 0,Aluno,Disciplinas
0,Ana,Matematica
0,Ana,Portugues
1,Bruno,Matematica


In [121]:
pd.crosstab(dados_exploded['Aluno'], dados_exploded['Disciplinas'])

Disciplinas,Matematica,Portugues
Aluno,Unnamed: 1_level_1,Unnamed: 2_level_1
Ana,1,1
Bruno,1,0


In [123]:
dados_areas['population'].head()

0    553290.0
1    177502.0
2    588736.0
3    182180.0
4    184878.0
Name: population, dtype: float64

In [125]:
pd.cut(dados_areas['population'], bins=3).unique()

[(-214718.53, 105443819.0], (210786329.0, 316128839.0]]
Categories (3, interval[float64, right]): [(-214718.53, 105443819.0] < (105443819.0, 210786329.0] < (210786329.0, 316128839.0]]

In [128]:
dados_areas.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AK,total,1990,553290.0,Alaska,656425.0
1,AK,under18,1990,177502.0,Alaska,656425.0
2,AK,total,1992,588736.0,Alaska,656425.0
3,AK,under18,1991,182180.0,Alaska,656425.0
4,AK,under18,1992,184878.0,Alaska,656425.0


In [130]:
dados_label, valores_unicos = pd.factorize(dados_areas['ages'])

In [134]:
dados_label[:10]

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 1])

In [135]:
valores_unicos

Index(['total', 'under18'], dtype='object')