# Fundamentos de DataFrames

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column labels (W-Z) and row labels (A-E) for the example DataFrame,
# built by splitting each string into its single-character labels.
columns = list('WXYZ')
index = list('ABCDE')

In [3]:
from numpy.random import randint

In [4]:
# Seed NumPy's legacy global generator so the random integers below are
# the same on every run, then draw a 5x4 array of integers in [-100, 100).
np.random.seed(42)
data = np.random.randint(-100, 100, size=(5, 4))

In [5]:
# Show the generated 5x4 integer array.
data

array([[  2,  79,  -8, -86],
       [  6, -29,  88, -80],
       [  2,  21, -26, -13],
       [ 16,  -1,   3,  51],
       [ 30,  49, -48, -99]])

In [7]:
# Wrap the raw array in a DataFrame, naming the arguments so it is clear
# which list labels the rows (index) and which labels the columns.
df = pd.DataFrame(data, index=index, columns=columns)

In [8]:
# Display the labeled DataFrame built from data, index and columns.
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [11]:
# Passing a list of column labels returns a DataFrame with just those columns.
df[['W','Z']]

Unnamed: 0,W,Z
A,2,-86
B,6,-80
C,2,-13
D,16,51
E,30,-99


In [12]:
# Create a column named 'new' holding the element-wise sum of columns W and Y.
df['new'] = df['W'] + df['Y']

In [13]:
# Confirm that the 'new' column is now part of df.
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,-6
B,6,-29,88,-80,94
C,2,21,-26,-13,-24
D,16,-1,3,51,19
E,30,49,-48,-99,-18


In [16]:
# drop() removes rows by default; to remove a column instead, pass
# axis=1 (written here with its alias, axis='columns').
# The call returns a new DataFrame, so this change is not permanent.
df.drop('new', axis='columns')

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [18]:
# 'new' is still present: the result of the previous drop was not assigned back.
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,-6
B,6,-29,88,-80,94
C,2,21,-26,-13,-24
D,16,-1,3,51,19
E,30,49,-48,-99,-18


In [19]:
# Rebind df to the result of drop() so the column removal actually sticks.
df = df.drop('new', axis='columns')

In [20]:
# 'new' is gone now that the result of drop() was assigned back to df.
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


# Operações com linhas

In [22]:
# df[key] looks up COLUMN labels, so asking for a row label here (string or
# integer) raises KeyError. The error is the point of this cell, so it is
# caught and printed instead of left uncaught; that way the demonstration is
# still visible and "Restart Kernel -> Run All" does not stop at this cell.
try:
    df['A']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'A'

In [24]:
# Rows are accessed by label through the loc[] indexer.
# Selecting a single label returns a Series.
df.loc['A']

W     2
X    79
Y    -8
Z   -86
Name: A, dtype: int64

In [26]:
# A list of labels retrieves several rows at once.
# In this case a DataFrame is returned.
df.loc[['A', 'B']]

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80


In [27]:
# Rows can also be accessed by integer position with iloc[].
df.iloc[1]

W     6
X   -29
Y    88
Z   -80
Name: B, dtype: int64

In [28]:
# A positional slice returns the first two rows as a DataFrame.
df.iloc[:2]

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80


In [29]:
# A list of positions selects exactly those rows (positions 1 and 2 here).
df.iloc[[1,2]]

Unnamed: 0,W,X,Y,Z
B,6,-29,88,-80
C,2,21,-26,-13


In [69]:
# Rows are removed with drop() too; the index= keyword makes it explicit
# that 'C' is a row label. Remember that this is not permanent: the call
# returns a new DataFrame and df itself is left untouched.
df.drop(index='C')

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
D,16,-1,3,51
E,30,49,-48,-99


In [32]:
# Row C is still in df, since the result of the row drop was not assigned back.
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [34]:
# Following the same logic as NumPy, loc accepts more than one argument
# (row label, column label) to pick a single value out of the DataFrame.
# Note that loc['x', 'y'] (row 'x', column 'y') is different from
# loc[['x', 'y']] (rows 'x' and 'y').
df.loc['A', 'W']

2

In [35]:
# Lists on both axes select a sub-DataFrame: rows A and C, columns W and Y.
df.loc[['A', 'C'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2,-8
C,2,-26


# Operações nos dataframes

In [41]:
# In column X, which rows are greater than 0?
# The comparison returns a boolean Series: True for each row that satisfies
# the condition and False for the others.
df['X']>0

A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [44]:
# Which rows of df have X > 0?
# Indexing with the boolean Series keeps only the rows where it is True.
df[df['X']>0]

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
C,2,21,-26,-13
E,30,49,-48,-99


In [45]:
# Which rows have W > 0 and Y > 1?
# The conditions are combined element-wise with &, and each comparison is
# wrapped in parentheses because & binds more tightly than >.
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
B,6,-29,88,-80
D,16,-1,3,51


In [63]:
# reset_index() turns the current index into a regular column
# and replaces it with integers starting at 0.
# The result is not assigned, so df itself keeps its original index.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2,79,-8,-86
1,B,6,-29,88,-80
2,C,2,21,-26,-13
3,D,16,-1,3,51
4,E,30,49,-48,-99


In [64]:
# One label per row of df, later used as an alternative index.
new_indexes = 'CA NY WY OR CO'.split()

In [65]:
# Store the labels as a regular column named 'States' (one value per row).
df['States'] = new_indexes

In [66]:
# df now carries the extra 'States' column.
df

Unnamed: 0,W,X,Y,Z,States
A,2,79,-8,-86,CA
B,6,-29,88,-80,NY
C,2,21,-26,-13,WY
D,16,-1,3,51,OR
E,30,49,-48,-99,CO


In [67]:
# set_index() promotes the 'States' column to the row index.
# The result is only displayed, not assigned, so df is not modified.
df.set_index('States')

States,W,X,Y,Z
CA,2,79,-8,-86
NY,6,-29,88,-80
WY,2,21,-26,-13
OR,16,-1,3,51
CO,30,49,-48,-99


In [68]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,11.2,23.8,1.8,-45.4
std,11.96662,42.109381,51.915316,63.366395
min,2.0,-29.0,-48.0,-99.0
25%,2.0,-1.0,-26.0,-86.0
50%,6.0,21.0,-8.0,-80.0
75%,16.0,49.0,3.0,-13.0
max,30.0,79.0,88.0,51.0
