# Criando Estruturas de Dados

In [48]:
import pandas as pd

## Series

In [49]:
data = [1, 2, 3, 4, 5]

In [50]:
# Criando uma Series
s = pd.Series(data)

In [51]:
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

Init signature:
pd.Series(
    data=None,
    index=None,
    dtype: 'Dtype | None' = None,
    name=None,
    copy: 'bool' = False,
    fastpath: 'bool' = False,
)
Docstring:     
One-dimensional ndarray with axis labels (including time series).

Labels need not be unique but must be a hashable type. The object
supports both integer- and label-based indexing and provides a host of
methods for performing operations involving the index. Statistical
methods from ndarray have been overridden to automatically exclude
missing data (currently represented as NaN).

Operations between Series (+, -, /, *, **) align values based on their
associated index values-- they need not be the same length. The result
index will be the sorted union of the two indexes.

Parameters
----------
data : array-like, Iterable, dict, or scalar value
    Contains data stored in Series. If data is a dict, argument order is
    maintained.
index : array-like or Index (1d)
    Values must be hashable and have the same length as `data`.
    Non-unique index values are allowed. Will default to
    RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like
    and index is None, then the keys in the data are used as the index. If the
    index is not None, the resulting Series is reindexed with the index values.
dtype : str, numpy.dtype, or ExtensionDtype, optional
    Data type for the output Series. If not specified, this will be
    inferred from `data`.
    See the :ref:`user guide <basics.dtypes>` for more usages.
name : str, optional
    The name to give to the Series.
copy : bool, default False
    Copy input data. Only affects Series or 1d ndarray input. See examples.

Examples
--------
Constructing Series from a dictionary with an Index specified

>>> d = {'a': 1, 'b': 2, 'c': 3}
>>> ser = pd.Series(data=d, index=['a', 'b', 'c'])
>>> ser
a   1
b   2
c   3
dtype: int64

The keys of the dictionary match with the Index values, hence the Index
values have no effect.

>>> d = {'a': 1, 'b': 2, 'c': 3}
>>> ser = pd.Series(data=d, index=['x', 'y', 'z'])
>>> ser
x   NaN
y   NaN
z   NaN
dtype: float64

Note that the Index is first build with the keys from the dictionary.
After this the Series is reindexed with the given Index values, hence we
get all NaN as a result.

Constructing Series from a list with `copy=False`.

>>> r = [1, 2]
>>> ser = pd.Series(r, copy=False)
>>> ser.iloc[0] = 999
>>> r
[1, 2]
>>> ser
0    999
1      2
dtype: int64

Due to input data type the Series has a `copy` of
the original data even though `copy=False`, so
the data is unchanged.

Constructing Series from a 1d ndarray with `copy=False`.

>>> r = np.array([1, 2])
>>> ser = pd.Series(r, copy=False)
>>> ser.iloc[0] = 999
>>> r
array([999,   2])
>>> ser
0    999
1      2
dtype: int64

Due to input data type the Series has a `view` on
the original data, so
the data is changed as well.
File:           /usr/local/lib/python3.8/dist-packages/pandas/core/series.py
Type:           type
Subclasses:     SubclassedSeries


In [52]:
index = ['Linha-' + str(i) for i in range(5)]

In [53]:
s = pd.Series(data = data, index = index)

In [54]:
s

Linha-0    1
Linha-1    2
Linha-2    3
Linha-3    4
Linha-4    5
dtype: int64

In [55]:
data = {'Linha-' + str(i) : i + 1 for i in range(5)}

In [56]:
data

{'Linha-0': 1, 'Linha-1': 2, 'Linha-2': 3, 'Linha-3': 4, 'Linha-4': 5}

In [57]:
# Criando nossa Series
s = pd.Series(data)

"""
>>> Obs: Agora não precisamos mais passar o index porque
o index vai ser composto pelas chaves do dicionário.
"""

'\n>>> Obs: Agora não precisamos mais passar o index porque\no index vai ser composto pelas chaves do dicionário.\n'

In [58]:
s

Linha-0    1
Linha-1    2
Linha-2    3
Linha-3    4
Linha-4    5
dtype: int64

## Operações com Series

In [59]:
s1 = s + 2

In [60]:
s1

Linha-0    3
Linha-1    4
Linha-2    5
Linha-3    6
Linha-4    7
dtype: int64

#### É possível somar, multiplicar, subtrair, etc, duas Series que tenham o mesmo index

Se for encontrada alguma linha com tipo de dados diferentes, será retornado um NaN

In [61]:
s2 = s + s1
s2

Linha-0     4
Linha-1     6
Linha-2     8
Linha-3    10
Linha-4    12
dtype: int64

In [62]:
s3 = s * s1
s3

Linha-0     3
Linha-1     8
Linha-2    15
Linha-3    24
Linha-4    35
dtype: int64

In [63]:
s4 = s2 / s
s4

Linha-0    4.000000
Linha-1    3.000000
Linha-2    2.666667
Linha-3    2.500000
Linha-4    2.400000
dtype: float64

In [64]:
s5 = s / s2
s5

Linha-0    0.250000
Linha-1    0.333333
Linha-2    0.375000
Linha-3    0.400000
Linha-4    0.416667
dtype: float64

## DataFrames

In [65]:
data = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]
data

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [66]:
# Criando um DataFrame a partir de uma lista de listas
df_1 = pd.DataFrame(data) # poderia fazer (data = data) que daria o mesmo resultado
df_1

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


Init signature:
pd.DataFrame(
    data=None,
    index: 'Axes | None' = None,
    columns: 'Axes | None' = None,
    dtype: 'Dtype | None' = None,
    copy: 'bool | None' = None,
)
Docstring:     
Two-dimensional, size-mutable, potentially heterogeneous tabular data.

Data structure also contains labeled axes (rows and columns).
Arithmetic operations align on both row and column labels. Can be
thought of as a dict-like container for Series objects. The primary
pandas data structure.

Parameters
----------
data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
    Dict can contain Series, arrays, constants, dataclass or list-like objects. If
    data is a dict, column order follows insertion-order.

    .. versionchanged:: 0.25.0
       If data is a list of dicts, column order follows insertion-order.

index : Index or array-like
    Index to use for resulting frame. Will default to RangeIndex if
    no indexing information part of input data and no index provided.
columns : Index or array-like
    Column labels to use for resulting frame when data does not have them,
    defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
    will perform column selection instead.
dtype : dtype, default None
    Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool or None, default None
    Copy data from inputs.
    For dict data, the default of None behaves like ``copy=True``.  For DataFrame
    or 2d ndarray input, the default of None behaves like ``copy=False``.

    .. versionchanged:: 1.3.0

See Also
--------
DataFrame.from_records : Constructor from tuples, also record arrays.
DataFrame.from_dict : From dicts of Series, arrays, or dicts.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_table : Read general delimited file into DataFrame.
read_clipboard : Read text from clipboard into DataFrame.

Examples
--------
Constructing DataFrame from a dictionary.

>>> d = {'col1': [1, 2], 'col2': [3, 4]}
>>> df = pd.DataFrame(data=d)
>>> df
   col1  col2
0     1     3
1     2     4

Notice that the inferred dtype is int64.

>>> df.dtypes
col1    int64
col2    int64
dtype: object

To enforce a single dtype:

>>> df = pd.DataFrame(data=d, dtype=np.int8)
>>> df.dtypes
col1    int8
col2    int8
dtype: object

Constructing DataFrame from numpy ndarray:

>>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
...                    columns=['a', 'b', 'c'])
>>> df2
   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9

Constructing DataFrame from a numpy ndarray that has labeled columns:

>>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
>>> df3 = pd.DataFrame(data, columns=['c', 'a'])
...
>>> df3
   c  a
0  3  1
1  6  4
2  9  7

Constructing DataFrame from dataclass:

>>> from dataclasses import make_dataclass
>>> Point = make_dataclass("Point", [("x", int), ("y", int)])
>>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
   x  y
0  0  0
1  0  3
2  2  3
File:           /usr/local/lib/python3.8/dist-packages/pandas/core/frame.py
Type:           type
Subclasses:     SubclassedDataFrame


In [67]:
# Criando um index para o DataFrame
index = ['Linha-' + str(i) for i in range(3)]
index

['Linha-0', 'Linha-1', 'Linha-2']

In [68]:
# Atribuindo ao DataFrame o index criado
df_1 = pd.DataFrame(data = data, index = index)
df_1

Unnamed: 0,0,1,2
Linha-0,1,2,3
Linha-1,4,5,6
Linha-2,7,8,9


In [69]:
# Criando um range com os novos nomes das colunas
columns = ['Coluna-' + str(i) for i in range(3)]
columns

['Coluna-0', 'Coluna-1', 'Coluna-2']

In [70]:
# Atribuindo ao DataFrame os nomes das colunas
df_1 = pd.DataFrame(data = data, index = index, columns = columns)
df_1

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,1,2,3
Linha-1,4,5,6
Linha-2,7,8,9


In [80]:
"""
Se as colunas e linhas estivessem, por exemplo, nomeadas dessa maneira:
data = {'Coluna_0' : {'Linha_0' : 1, 'Linha_1' : 4, 'Linha_2' : 7},
        'Coluna_1' : {'Linha_0' : 2, 'Linha_1' : 5, 'Linha_2' : 8},
        'Coluna_2' : {'Linha_0' : 3, 'Linha_1' : 6, 'Linha_2' : 9}}
data

Teríamos resultados NaN, porque encontraríamos index e nomes de colunas diferentes
"""

data = {'Coluna-0' : {'Linha-0' : 1, 'Linha-1' : 4, 'Linha-2' : 7},
        'Coluna-1' : {'Linha-0' : 2, 'Linha-1' : 5, 'Linha-2' : 8},
        'Coluna-2' : {'Linha-0' : 3, 'Linha-1' : 6, 'Linha-2' : 9}}
data

{'Coluna-0': {'Linha-0': 1, 'Linha-1': 4, 'Linha-2': 7},
 'Coluna-1': {'Linha-0': 2, 'Linha-1': 5, 'Linha-2': 8},
 'Coluna-2': {'Linha-0': 3, 'Linha-1': 6, 'Linha-2': 9}}

In [72]:
# Criando um DataFrame a partir de um dicionário de dicionários
df_2 = pd.DataFrame(data)
df_2

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,1,2,3
Linha-1,4,5,6
Linha-2,7,8,9


In [73]:
data = [(1, 2, 3),
        (4, 5, 6),
        (7, 8, 9)]
data

[(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [74]:
# Criando um DataFrame a partir de uma lista de tuplas
df_3 = pd.DataFrame(data = data, index = index, columns = columns)
df_3

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,1,2,3
Linha-1,4,5,6
Linha-2,7,8,9


## Concatenando DataFrames

In [75]:
df_1[df_1 > 0] = 'A'
df_1

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,A,A,A
Linha-1,A,A,A
Linha-2,A,A,A


In [76]:
df_2[df_2 > 0] = 'B'
df_2

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,B,B,B
Linha-1,B,B,B
Linha-2,B,B,B


In [77]:
df_3[df_3 > 0] = 'C'
df_3

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,C,C,C
Linha-1,C,C,C
Linha-2,C,C,C


__Exemplo com um retorno NaN:__

In [88]:
data = {'Coluna_0' : {'Linha_0' : 1, 'Linha_1' : 4, 'Linha_2' : 7},
        'Coluna_1' : {'Linha_0' : 2, 'Linha_1' : 5, 'Linha_2' : 8},
        'Coluna_2' : {'Linha_0' : 3, 'Linha_1' : 6, 'Linha_2' : 9}}

df_2 = pd.DataFrame(data)
df_2

Unnamed: 0,Coluna_0,Coluna_1,Coluna_2
Linha_0,1,2,3
Linha_1,4,5,6
Linha_2,7,8,9


In [89]:
# Teremos retorno not a number por causa do underscore
df_4 = pd.concat([df_1, df_2, df_3])
df_4

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2,Coluna_0,Coluna_1,Coluna_2
Linha-0,A,A,A,,,
Linha-1,A,A,A,,,
Linha-2,A,A,A,,,
Linha_0,,,,1.0,2.0,3.0
Linha_1,,,,4.0,5.0,6.0
Linha_2,,,,7.0,8.0,9.0
Linha-0,C,C,C,,,
Linha-1,C,C,C,,,
Linha-2,C,C,C,,,


In [90]:
df_4 = pd.concat([df_1, df_2, df_3], axis = 1)
df_4

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2,Coluna_0,Coluna_1,Coluna_2,Coluna-0.1,Coluna-1.1,Coluna-2.1
Linha-0,A,A,A,,,,C,C,C
Linha-1,A,A,A,,,,C,C,C
Linha-2,A,A,A,,,,C,C,C
Linha_0,,,,1.0,2.0,3.0,,,
Linha_1,,,,4.0,5.0,6.0,,,
Linha_2,,,,7.0,8.0,9.0,,,


In [83]:
# Corrigindo o caracter: substituindo o underscore pelo traço
data = {'Coluna-0' : {'Linha-0' : 1, 'Linha-1' : 4, 'Linha-2' : 7},
        'Coluna-1' : {'Linha-0' : 2, 'Linha-1' : 5, 'Linha-2' : 8},
        'Coluna-2' : {'Linha-0' : 3, 'Linha-1' : 6, 'Linha-2' : 9}}
df_2 = pd.DataFrame(data)
df_2

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,1,2,3
Linha-1,4,5,6
Linha-2,7,8,9


In [84]:
# Retorno correto da concatenação
df_4 = pd.concat([df_1, df_2, df_3])
df_4

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2
Linha-0,A,A,A
Linha-1,A,A,A
Linha-2,A,A,A
Linha-0,1,2,3
Linha-1,4,5,6
Linha-2,7,8,9
Linha-0,C,C,C
Linha-1,C,C,C
Linha-2,C,C,C


In [85]:
df_4 = pd.concat([df_1, df_2, df_3], axis = 1)
df_4

Unnamed: 0,Coluna-0,Coluna-1,Coluna-2,Coluna-0.1,Coluna-1.1,Coluna-2.1,Coluna-0.2,Coluna-1.2,Coluna-2.2
Linha-0,A,A,A,1,2,3,C,C,C
Linha-1,A,A,A,4,5,6,C,C,C
Linha-2,A,A,A,7,8,9,C,C,C


## Exercício

Testar as saídas dos códigos exibidos na questão e marcar as saídas corretas

__Saída 1__
```
Out [4]:
	A	B	C	D
X	1	2	3	4
```

In [108]:
# Código 1 (Concatenação errada)
df1 = pd.DataFrame({'A': {'X': 1}, 'B': {'X': 2}})
df2 = pd.DataFrame({'C': {'X': 3}, 'D': {'X': 4}})
pd.concat([df1, df2])

Unnamed: 0,A,B,C,D
X,1,2,3,4


In [109]:
# Correção da concatenação do código 1: inserir o axis
pd.concat([df1, df2], axis = 1)

Unnamed: 0,A,B,C,D
X,1,2,3,4


__Saída 2__
```
Out [2]:
	C1	C2
L1	A	C
L2	B	D
```

In [105]:
# Código 2 (Nomeação dos index e colunas incorreta)
dados = [('A', 'B'), ('C', 'D')]
df = pd.DataFrame(dados, columns = ['L1', 'L2'],  index = ['C1', 'C2'])
df

Unnamed: 0,L1,L2
C1,A,B
C2,C,D


In [111]:
# Correção do código 2
dados = [('A', 'B'), ('C', 'D')]
df = pd.DataFrame(dados, index = ['L1', 'L2'],  columns = ['C1', 'C2'])
df

Unnamed: 0,C1,C2
L1,A,B
L2,C,D


__Saída 3__
```
Out [3]:
	A	B	C
X	1	2	3
Y	4	5	6
```

In [112]:
# Código 3 (Correto!)
dados = [[1, 2, 3], [4, 5, 6]]
index = 'X,Y'.split(',')
columns = list('CBA')[::-1]
df = pd.DataFrame(dados, index, columns)
df

Unnamed: 0,A,B,C
X,1,2,3
Y,4,5,6


__Saída 4__
```Out [1]:
	A	B
X	1	2
Y	3	4
```

In [107]:
# Código 4 (Correto!)
dados = {'A': {'X': 1, 'Y': 3}, 'B': {'X': 2, 'Y': 4}}
df = pd.DataFrame(dados)
df

Unnamed: 0,A,B
X,1,2
Y,3,4


## Fim