# Pandas

In [2]:
import pandas as pd
import numpy as np

In [6]:
# A plain Python dictionary, built by hand
d = dict(a=10, b=20, c=30)
d

{'a': 10, 'b': 20, 'c': 30}

In [8]:
# The same label -> value mapping expressed as a pandas Series
d_new = pd.Series([10, 20, 30], index=list('abc'))
d_new

a    10
b    20
c    30
dtype: int64

## Operations with Series

In [9]:
# Arithmetic between Series aligns on index labels: a label present in only
# one operand ("Dublin", "Italia") yields NaN in the result.
a = pd.Series([10, 20, 30], index=["Taubaté", "Londres", "Dublin"])
b = pd.Series([10, 20, 40], index=["Taubaté", "Londres", "Italia"])
a + b

Dublin      NaN
Italia      NaN
Londres    40.0
Taubaté    20.0
dtype: float64

## DataFrame

In [21]:
# 2x3 frame of standard-normal draws, rows A/B and columns x/y/z.
# No random seed is set, so the values change each time this cell is re-run.
df = pd.DataFrame(np.random.randn(2, 3), index=list('AB'), columns=list('xyz'))
df

Unnamed: 0,x,y,z
A,-0.33004,-0.447733,0.364004
B,1.042908,0.418034,0.039727


In [22]:
# Bracket selection with a single label returns that column as a Series
df['x']

A   -0.330040
B    1.042908
Name: x, dtype: float64

In [23]:
df.x  # attribute access: same result as df['x']

A   -0.330040
B    1.042908
Name: x, dtype: float64

In [24]:
# Same bracket selection, this time for column 'y'
df['y']

A   -0.447733
B    0.418034
Name: y, dtype: float64

In [25]:
# Passing a list of labels selects several columns and returns a DataFrame
df[['x','y']]

Unnamed: 0,x,y
A,-0.33004,-0.447733
B,1.042908,0.418034


In [26]:
# Creating a new column (not a row) computed from existing columns
df['new'] = df['x'] + 2 * df['y']
df

Unnamed: 0,x,y,z,new
A,-0.33004,-0.447733,0.364004,-1.225506
B,1.042908,0.418034,0.039727,1.878976


In [27]:
# Assigning a scalar fills the whole new column with that value
df['new2'] = 100
df

Unnamed: 0,x,y,z,new,new2
A,-0.33004,-0.447733,0.364004,-1.225506,100
B,1.042908,0.418034,0.039727,1.878976,100


In [28]:
# Remove columns (not rows): drop both helper columns, 'new' and 'new2',
# in a single call. Using the `columns=` keyword makes the target axis
# explicit, and reassigning instead of inplace=True keeps the cell readable.
df = df.drop(columns=['new', 'new2'])

In [29]:
# Display the frame again: only columns x, y and z remain
df

Unnamed: 0,x,y,z
A,-0.33004,-0.447733,0.364004
B,1.042908,0.418034,0.039727


In [30]:
# Remove a row (not a column): drop the row whose index label is 'B'.
# The `index=` keyword states the axis explicitly instead of axis=0.
df = df.drop(index='B')

In [31]:
# Display the frame again: only row 'A' is left
df

Unnamed: 0,x,y,z
A,-0.33004,-0.447733,0.364004


In [32]:
# Label-based selection with .loc: rows ['A'] and columns ['x', 'y'].
# Passing lists for both axes keeps the result a DataFrame.
df.loc[['A'],['x','y']]

Unnamed: 0,x,y
A,-0.33004,-0.447733


### Deleting NaN

In [45]:
# Sample data: column 'a' has two missing entries (None), which pandas
# stores as NaN, so that column becomes float64 while 'b' and 'c' stay integer.
dados = dict(
    a=[10, 2, 20, 4, 5, None, None, 10],
    b=[1, 2, 30, 4, 5, 6, 0, 8],
    c=[1, 2, 3, 15, 5, 6, 7, 8],
)

x = pd.DataFrame(data=dados)
x.info()  # prints per-column non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
a    6 non-null float64
b    8 non-null int64
c    8 non-null int64
dtypes: float64(1), int64(2)
memory usage: 272.0 bytes


In [47]:
# Preview the first five rows
x.head()

Unnamed: 0,a,b,c
0,10.0,1,1
1,2.0,2,2
2,20.0,30,3
3,4.0,4,15
4,5.0,5,5


In [49]:
# Deleting NaN: drop every row that contains at least one missing value.
# The surviving rows keep their original index labels (0-4 and 7).
y = x.dropna()
y

Unnamed: 0,a,b,c
0,10.0,1,1
1,2.0,2,2
2,20.0,30,3
3,4.0,4,15
4,5.0,5,5
7,10.0,8,8


In [50]:
# Deleting NaN + resetting the index so it runs 0..n-1 again.
# reset_index() keeps the old labels as a new column named 'index'.
y = x.dropna().reset_index()
y

Unnamed: 0,index,a,b,c
0,0,10.0,1,1
1,1,2.0,2,2
2,2,20.0,30,3
3,3,4.0,4,15
4,4,5.0,5,5
5,7,10.0,8,8


In [51]:
# Deleting NaN + resetting the index without keeping the old labels.
# reset_index(drop=True) discards the old index directly, instead of
# materialising it as an 'index' column and then dropping that column.
y = x.dropna().reset_index(drop=True)
y

Unnamed: 0,a,b,c
0,10.0,1,1
1,2.0,2,2
2,20.0,30,3
3,4.0,4,15
4,5.0,5,5
5,10.0,8,8


### Conditional selection - filtering values

In [52]:
# Indexing with a boolean frame keeps the shape of y:
# every value that fails the condition (> 5) is replaced by NaN.
k = y[y > 5]
k

Unnamed: 0,a,b,c
0,10.0,,
1,,,
2,20.0,30.0,
3,,,15.0
4,,,
5,10.0,8.0,8.0


In [53]:
# Keep only the rows in which every column passed the > 5 filter
k = k.dropna()
k

Unnamed: 0,a,b,c
5,10.0,8.0,8.0


In [54]:
# Conditional row selection: keep the rows whose 'a' value is above 5
g = y.loc[y['a'] > 5]
g

Unnamed: 0,a,b,c
0,10.0,1,1
2,20.0,30,3
5,10.0,8,8


In [55]:
# The mask is built from columns 'a' and 'b' only, so column 'c' has no
# matching mask values and comes back entirely NaN.
h = y[ y[['a','b']] > 5 ]
h

Unnamed: 0,a,b,c
0,10.0,,
1,,,
2,20.0,30.0,
3,,,
4,,,
5,10.0,8.0,


In [56]:
# Deleting a row by its index label; the result is a new frame and
# x itself is not modified (nothing is assigned back).
x.drop(index=2)

Unnamed: 0,a,b,c
0,10.0,1,1
1,2.0,2,2
3,4.0,4,15
4,5.0,5,5
5,,6,6
6,,0,7
7,10.0,8,8


In [57]:
# Same operation for the row labelled 7
x.drop(index=7)

Unnamed: 0,a,b,c
0,10.0,1,1
1,2.0,2,2
2,20.0,30,3
3,4.0,4,15
4,5.0,5,5
5,,6,6
6,,0,7


# Ascending and Descending Order

In [58]:
# Sort the rows by column 'a', smallest first; rows with NaN in 'a' go last
y = x.sort_values(by='a')
y

Unnamed: 0,a,b,c
1,2.0,2,2
3,4.0,4,15
4,5.0,5,5
0,10.0,1,1
7,10.0,8,8
2,20.0,30,3
5,,6,6
6,,0,7


In [59]:
# Reset the index so the sorted rows are numbered 0..n-1 again.
# reset_index(drop=True) discards the old labels directly instead of
# creating an 'index' column and dropping it afterwards.
y = y.reset_index(drop=True)
y

Unnamed: 0,a,b,c
0,2.0,2,2
1,4.0,4,15
2,5.0,5,5
3,10.0,1,1
4,10.0,8,8
5,20.0,30,3
6,,6,6
7,,0,7


In [60]:
# Descending order: largest 'a' first; rows with NaN in 'a' still go last
y = x.sort_values(by='a', ascending=False)
y

Unnamed: 0,a,b,c
2,20.0,30,3
0,10.0,1,1
7,10.0,8,8
4,5.0,5,5
3,4.0,4,15
1,2.0,2,2
5,,6,6
6,,0,7


### Missing Data

In [61]:
# Small frame with gaps: 'A' has one missing value, 'B' has two, 'C' has none
d = {
    'A': [1, 2, None],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

df = pd.DataFrame(data=d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [None]:
# Drop the columns that contain missing values

In [62]:
# axis='columns' (same as axis=1) drops every column that contains a NaN
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [63]:
# Drop the rows (not the columns) that contain a missing value;
# axis='index' is the same as axis=0.
df.dropna(axis='index')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [65]:
# Replace every NaN with a placeholder value
df.fillna('NOTHING')

Unnamed: 0,A,B,C
0,1,5,1
1,2,NOTHING,2
2,NOTHING,NOTHING,3


In [69]:
# Replace NaN with a mean.
# NOTE(review): the mean of column 'A' is used as the fill value for every
# column, so the gaps in 'B' are also filled with A's mean. If each column
# should get its own mean, df.fillna(df.mean()) does that - confirm intent.
df.fillna(value = df['A'].mean())

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,1.5,2
2,1.5,1.5,3
