# <font color = "blue">  Manipulação de dados com Pandas </font>

In [1]:
# versão de python
from platform import python_version
print("versão de python", python_version())

versão de python 3.9.13


In [2]:
# Importar pandas
import pandas as pd
pd.__version__

'1.5.3'

## Manipulando Dados em DataFrame do Pandas

In [6]:
# crie um dicicionário
dados = {"Estado":["Santa Catarina", "Rio de Janeiro", "Tocantins", "Bahia", "Minhas Gerais"],
         "Ano":list(range(2004, 2009)),
         "Taxa Desemprego":[1.5, 1.7, 1.6, 2.4, 2.7]}

In [7]:
# criar um DataFrame
df = pd.DataFrame(dados)
df

Unnamed: 0,Estado,Ano,Taxa Desemprego
0,Santa Catarina,2004,1.5
1,Rio de Janeiro,2005,1.7
2,Tocantins,2006,1.6
3,Bahia,2007,2.4
4,Minhas Gerais,2008,2.7


In [8]:
# encabeçalho do DataFrame
df.head()

Unnamed: 0,Estado,Ano,Taxa Desemprego
0,Santa Catarina,2004,1.5
1,Rio de Janeiro,2005,1.7
2,Tocantins,2006,1.6
3,Bahia,2007,2.4
4,Minhas Gerais,2008,2.7


In [9]:
# tipo de objeto 
type(df)

pandas.core.frame.DataFrame

In [12]:
# Reorganizando as colunas
pd.DataFrame(dados, columns = ["Estado", "Taxa Desemprego", "Ano"])

Unnamed: 0,Estado,Taxa Desemprego,Ano
0,Santa Catarina,1.5,2004
1,Rio de Janeiro,1.7,2005
2,Tocantins,1.6,2006
3,Bahia,2.4,2007
4,Minhas Gerais,2.7,2008


In [14]:
# Adicionando uma nova coluna
df2 = pd.DataFrame(dados,
               columns = ["Estado", "Taxa Desemprego", "Taxa Crescimento", "Ano"],
               index = ["Estado1", "Estado2", "Estado3", "Estado4", "Estado5"])

In [15]:
print(df2)

                 Estado  Taxa Desemprego Taxa Crescimento   Ano
Estado1  Santa Catarina              1.5              NaN  2004
Estado2  Rio de Janeiro              1.7              NaN  2005
Estado3       Tocantins              1.6              NaN  2006
Estado4           Bahia              2.4              NaN  2007
Estado5   Minhas Gerais              2.7              NaN  2008


In [16]:
df2.values

array([['Santa Catarina', 1.5, nan, 2004],
       ['Rio de Janeiro', 1.7, nan, 2005],
       ['Tocantins', 1.6, nan, 2006],
       ['Bahia', 2.4, nan, 2007],
       ['Minhas Gerais', 2.7, nan, 2008]], dtype=object)

In [17]:
df2.dtypes

Estado               object
Taxa Desemprego     float64
Taxa Crescimento     object
Ano                   int64
dtype: object

In [26]:
df2.columns

Index(['Estado', 'Taxa Desemprego', 'Taxa Crescimento', 'Ano'], dtype='object')


In [21]:
# imprimindo apenas uma columna
df2['Estado']

Estado1    Santa Catarina
Estado2    Rio de Janeiro
Estado3         Tocantins
Estado4             Bahia
Estado5     Minhas Gerais
Name: Estado, dtype: object

In [22]:
type(df2["Estado"])

pandas.core.series.Series

In [23]:
# imprimir mais de uma columna
df2[ ['Estado', 'Taxa Desemprego'] ]

Unnamed: 0,Estado,Taxa Desemprego
Estado1,Santa Catarina,1.5
Estado2,Rio de Janeiro,1.7
Estado3,Tocantins,1.6
Estado4,Bahia,2.4
Estado5,Minhas Gerais,2.7


In [25]:
type(df2[ ['Estado', 'Taxa Desemprego']])

pandas.core.frame.DataFrame

In [27]:
# imprimendo columna index
df2.index

Index(['Estado1', 'Estado2', 'Estado3', 'Estado4', 'Estado5'], dtype='object')

In [35]:
# filtrando pelo indice
df2.filter(items=["Estado3"], axis = 0)

Unnamed: 0,Estado,Taxa Desemprego,Taxa Crescimento,Ano
Estado3,Tocantins,1.6,,2006


## Usando Numpy e Pandas

In [39]:
df2.head()

Unnamed: 0,Estado,Taxa Desemprego,Taxa Crescimento,Ano
Estado1,Santa Catarina,1.5,,2004
Estado2,Rio de Janeiro,1.7,,2005
Estado3,Tocantins,1.6,,2006
Estado4,Bahia,2.4,,2007
Estado5,Minhas Gerais,2.7,,2008


In [42]:
# Resumo estatístico do DataFrame
df2["Taxa Desemprego"].describe()

count    5.000000
mean     1.980000
std      0.535724
min      1.500000
25%      1.600000
50%      1.700000
75%      2.400000
max      2.700000
Name: Taxa Desemprego, dtype: float64

In [43]:
type(df2["Taxa Desemprego"].describe())

pandas.core.series.Series

In [45]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Estado1 to Estado5
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Estado            5 non-null      object 
 1   Taxa Desemprego   5 non-null      float64
 2   Taxa Crescimento  0 non-null      object 
 3   Ano               5 non-null      int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 372.0+ bytes


In [46]:
# encontrar "NaN"
df2.isna()

Unnamed: 0,Estado,Taxa Desemprego,Taxa Crescimento,Ano
Estado1,False,False,True,False
Estado2,False,False,True,False
Estado3,False,False,True,False
Estado4,False,False,True,False
Estado5,False,False,True,False


In [47]:
df2["Taxa Crescimento"].isna()

Estado1    True
Estado2    True
Estado3    True
Estado4    True
Estado5    True
Name: Taxa Crescimento, dtype: bool

In [54]:
# Preencher valores ausentes como Numpy
import numpy as np
df2["Taxa Crescimento"] = np.arange(5)

In [55]:
df2["Taxa Crescimento"]

Estado1    0
Estado2    1
Estado3    2
Estado4    3
Estado5    4
Name: Taxa Crescimento, dtype: int32

In [58]:
df2.dtypes

Estado               object
Taxa Desemprego     float64
Taxa Crescimento      int32
Ano                   int64
dtype: object

In [61]:
df2[["Taxa Desemprego", "Taxa Crescimento"]].describe()

Unnamed: 0,Taxa Desemprego,Taxa Crescimento
count,5.0,5.0
mean,1.98,2.0
std,0.535724,1.581139
min,1.5,0.0
25%,1.6,1.0
50%,1.7,2.0
75%,2.4,3.0
max,2.7,4.0


## Slicing de DataFrame do Pandas

In [62]:
df2

Unnamed: 0,Estado,Taxa Desemprego,Taxa Crescimento,Ano
Estado1,Santa Catarina,1.5,0,2004
Estado2,Rio de Janeiro,1.7,1,2005
Estado3,Tocantins,1.6,2,2006
Estado4,Bahia,2.4,3,2007
Estado5,Minhas Gerais,2.7,4,2008


In [65]:
df2["Estado1":"Estado3"]

Unnamed: 0,Estado,Taxa Desemprego,Taxa Crescimento,Ano
Estado1,Santa Catarina,1.5,0,2004
Estado2,Rio de Janeiro,1.7,1,2005
Estado3,Tocantins,1.6,2,2006


In [66]:
df2[df2["Taxa Desemprego"] < 2]

Unnamed: 0,Estado,Taxa Desemprego,Taxa Crescimento,Ano
Estado1,Santa Catarina,1.5,0,2004
Estado2,Rio de Janeiro,1.7,1,2005
Estado3,Tocantins,1.6,2,2006


In [69]:
df2.loc[df2["Taxa Desemprego"] < 2, ["Taxa Desemprego", "Ano"]]

Unnamed: 0,Taxa Desemprego,Ano
Estado1,1.5,2004
Estado2,1.7,2005
Estado3,1.6,2006


## Preenchendo Valores Ausentes em DataFrame do Pandas

In [70]:
df3 = pd.read_csv(filepath_or_buffer="dataset.csv")
df3.head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,
2,CA-2016-138688,2016-06-12,DV-13045,Corporate,United States,West,OFF-LA-10000240,Office Supplies,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0


In [72]:
df3.isna().sum()

ID_Pedido       0
Data_Pedido     0
ID_Cliente      0
Segmento        0
Pais            0
Regiao          0
ID_Produto      0
Categoria       0
Nome_Produto    0
Valor_Venda     0
Quantidade      2
dtype: int64

In [76]:
mediana = df3["Quantidade"].median()

In [77]:
df3["Quantidade"].fillna(value = mediana, inplace=True)

In [78]:
df3.isna().sum()

ID_Pedido       0
Data_Pedido     0
ID_Cliente      0
Segmento        0
Pais            0
Regiao          0
ID_Produto      0
Categoria       0
Nome_Produto    0
Valor_Venda     0
Quantidade      0
dtype: int64

## Query de Dados DataFrame do Pandas

In [81]:
# valores mínimos e máximo da coluna Valor_Venda
df3.Valor_Venda.describe()

count     9994.000000
mean       229.858001
std        623.245101
min          0.444000
25%         17.280000
50%         54.490000
75%        209.940000
max      22638.480000
Name: Valor_Venda, dtype: float64

> Da coluna `Valor_Venda` os valores mínimos e máximo são 0.44 e   22638.48

In [83]:
# Se gera um novo DataFrame apenas con o `Valor_Venda` entre 229 e 10000
df4 = df3.query('229 < Valor_Venda < 10000')
df4

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.9600,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.9400,3.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
7,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,TEC-PH-10002275,Technology,Mitel 5320 IP Phone VoIP phone,907.1520,6.0
10,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-TA-10001539,Furniture,Chromcraft Rectangular Conference Tables,1706.1840,9.0
...,...,...,...,...,...,...,...,...,...,...,...
9973,US-2016-103674,2016-12-06,AP-10720,Home Office,United States,West,TEC-PH-10004080,Technology,Avaya 5410 Digital phone,271.9600,5.0
9976,US-2016-103674,2016-12-06,AP-10720,Home Office,United States,West,TEC-PH-10002496,Technology,Cisco SPA301,249.5840,2.0
9979,US-2016-103674,2016-12-06,AP-10720,Home Office,United States,West,OFF-BI-10002026,Office Supplies,Ibico Recycled Linen-Style Covers,437.4720,14.0
9991,CA-2017-121258,2017-02-26,DB-13060,Consumer,United States,West,TEC-PH-10003645,Technology,Aastra 57i VoIP phone,258.5760,2.0


In [87]:
# Verifica-se que o `Valor_Venda` esteja entre 229 e 10000
df4.Valor_Venda.describe()

count    2357.000000
mean      766.679142
std       856.315136
min       229.544000
25%       323.100000
50%       490.320000
75%       859.200000
max      9892.740000
Name: Valor_Venda, dtype: float64

In [88]:
# Gera-se um novo de DataFrame para `Valor_Venda > 766`
df5 = df4.query('Valor_Venda > 766')
df5

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
7,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,TEC-PH-10002275,Technology,Mitel 5320 IP Phone VoIP phone,907.1520,6.0
10,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-TA-10001539,Furniture,Chromcraft Rectangular Conference Tables,1706.1840,9.0
11,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,TEC-PH-10002033,Technology,Konftel 250 Conference phone - Charcoal black,911.4240,4.0
24,CA-2015-106320,2015-09-25,EB-13870,Consumer,United States,West,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,1044.6300,3.0
...,...,...,...,...,...,...,...,...,...,...,...
9925,CA-2015-159534,2015-03-20,DH-13075,Corporate,United States,East,OFF-BI-10003656,Office Supplies,Fellowes PB200 Plastic Comb Binding Machine,1087.9360,8.0
9929,CA-2016-129630,2016-09-04,IM-15055,Consumer,United States,West,TEC-CO-10003763,Technology,Canon PC1060 Personal Laser Copier,2799.9600,5.0
9942,CA-2014-143371,2014-12-28,MD-17350,Consumer,United States,West,OFF-ST-10001128,Office Supplies,"Carina Mini System Audio Rack, Model AR050B",998.8200,9.0
9947,CA-2017-121559,2017-06-01,HW-14935,Corporate,United States,Central,FUR-CH-10003746,Furniture,Hon 4070 Series Pagoda Round Back Stacking Chairs,1925.8800,6.0


In [90]:
# Verifica-se que o filtro estaja certo
df5.Valor_Venda.describe()

count     687.000000
mean     1615.597817
std      1203.403998
min       767.214000
25%       914.200000
50%      1212.960000
75%      1763.575000
max      9892.740000
Name: Valor_Venda, dtype: float64

## Verifica-se a Ocorrência de Diversos Valores em Uma Coluna

In [91]:
# Então aplicas-e o filtro
df3[df3.Quantidade.isin([5, 7, 9, 11])]

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
5,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-FU-10001487,Furniture,Eldon Expressions Wood and Plastic Desk Access...,48.8600,7.0
9,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,OFF-AP-10002892,Office Supplies,Belkin F5C206VTEL 6 Outlet Surge,114.9000,5.0
10,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-TA-10001539,Furniture,Chromcraft Rectangular Conference Tables,1706.1840,9.0
14,US-2015-118983,2015-11-22,HP-14815,Home Office,United States,Central,OFF-AP-10002311,Office Supplies,Holmes Replacement Filter for HEPA Air Cleaner...,68.8100,5.0
...,...,...,...,...,...,...,...,...,...,...,...
9974,US-2016-103674,2016-12-06,AP-10720,Home Office,United States,West,OFF-AR-10004752,Office Supplies,Blackstonian Pencils,18.6900,7.0
9977,US-2016-103674,2016-12-06,AP-10720,Home Office,United States,West,OFF-FA-10003467,Office Supplies,"Alliance Big Bands Rubber Bands, 12/Pack",13.8600,7.0
9981,CA-2017-163566,2017-08-03,TB-21055,Consumer,United States,East,OFF-LA-10004484,Office Supplies,Avery 476,16.5200,5.0
9982,US-2016-157728,2016-09-22,RC-19960,Consumer,United States,Central,OFF-PA-10002195,Office Supplies,"RSVP Cards & Envelopes, Blank White, 8-1/2"" X ...",35.5600,7.0


In [95]:
df3[df3.Quantidade.isin([5, 7, 9, 11])][:10]

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
5,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-FU-10001487,Furniture,Eldon Expressions Wood and Plastic Desk Access...,48.86,7.0
9,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,OFF-AP-10002892,Office Supplies,Belkin F5C206VTEL 6 Outlet Surge,114.9,5.0
10,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-TA-10001539,Furniture,Chromcraft Rectangular Conference Tables,1706.184,9.0
14,US-2015-118983,2015-11-22,HP-14815,Home Office,United States,Central,OFF-AP-10002311,Office Supplies,Holmes Replacement Filter for HEPA Air Cleaner...,68.81,5.0
21,CA-2016-137330,2016-12-09,KB-16585,Corporate,United States,Central,OFF-AR-10000246,Office Supplies,Newell 318,19.46,7.0
22,CA-2016-137330,2016-12-09,KB-16585,Corporate,United States,Central,OFF-AP-10001492,Office Supplies,"Acco Six-Outlet Power Strip, 4' Cord Length",60.34,7.0
27,US-2015-150630,2015-09-17,TB-21520,Consumer,United States,East,FUR-BO-10004834,Furniture,"Riverside Palais Royal Lawyers Bookcase, Royal...",3083.43,7.0
35,CA-2016-117590,2016-12-08,GH-14485,Corporate,United States,Central,TEC-PH-10004977,Technology,GE 30524EE4,1097.544,7.0
36,CA-2016-117590,2016-12-08,GH-14485,Corporate,United States,Central,FUR-FU-10003664,Furniture,"Electrix Architect's Clamp-On Swing Arm Lamp, ...",190.92,5.0


## Operadores Lógicos de Dados com Pandas

In [97]:
df3.head(2)

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0


In [98]:
# Filtrando DataFrame que ocorreram para o segmento de Home Officce e na região South
df3[(df3.Segmento == "Home Office" ) & (df3.Regiao == "South")]

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
182,CA-2014-158274,2014-11-19,RM-19675,Home Office,United States,South,TEC-PH-10003273,Technology,AT&T TR1909W,503.9600,4.0
183,CA-2014-158274,2014-11-19,RM-19675,Home Office,United States,South,TEC-PH-10004896,Technology,Nokia Lumia 521 (T-Mobile),149.9500,5.0
184,CA-2014-158274,2014-11-19,RM-19675,Home Office,United States,South,TEC-AC-10002345,Technology,HP Standard 104 key PS/2 Keyboard,29.0000,2.0
231,US-2017-100930,2017-04-07,CS-12400,Home Office,United States,South,FUR-TA-10001705,Furniture,Bush Advantage Collection Round Conference Table,233.8600,2.0
232,US-2017-100930,2017-04-07,CS-12400,Home Office,United States,South,FUR-TA-10003473,Furniture,Bretford Rectangular Conference Table Tops,620.6145,3.0
...,...,...,...,...,...,...,...,...,...,...,...
9805,CA-2016-136322,2016-10-21,AP-10720,Home Office,United States,South,FUR-FU-10002878,Furniture,"Seth Thomas 14"" Day/Date Wall Clock",45.5680,2.0
9806,CA-2016-136322,2016-10-21,AP-10720,Home Office,United States,South,OFF-BI-10004817,Office Supplies,GBC Personal VeloBind Strips,28.7520,8.0
9960,CA-2017-141446,2017-09-16,CL-12700,Home Office,United States,South,TEC-AC-10002305,Technology,KeyTronic E03601U1 - Keyboard - Beige,18.0000,1.0
9970,CA-2015-103772,2015-06-28,MP-17470,Home Office,United States,South,OFF-BI-10002867,Office Supplies,GBC Recycled Regency Composition Covers,119.5600,2.0


In [100]:
# Filtrando DataFrame que ocorreram para o segmento de Home Officce ou na região South
df3[(df3.Segmento == "Home Office" ) | (df3.Regiao == "South")]

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.9600,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.9400,3.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.3680,2.0
12,CA-2017-114412,2017-04-15,AA-10480,Consumer,United States,South,OFF-PA-10002365,Office Supplies,Xerox 1967,15.5520,3.0
...,...,...,...,...,...,...,...,...,...,...,...
9979,US-2016-103674,2016-12-06,AP-10720,Home Office,United States,West,OFF-BI-10002026,Office Supplies,Ibico Recycled Linen-Style Covers,437.4720,14.0
9980,US-2015-151435,2015-09-06,SW-20455,Consumer,United States,South,FUR-TA-10001029,Furniture,KI Adjustable-Height Table,85.9800,1.0
9987,CA-2017-163629,2017-11-17,RA-19885,Corporate,United States,South,TEC-AC-10001539,Technology,Logitech G430 Surround Sound Gaming Headset wi...,79.9900,1.0
9988,CA-2017-163629,2017-11-17,RA-19885,Corporate,United States,South,TEC-PH-10004006,Technology,Panasonic KX - TS880B Telephone,206.1000,5.0


In [104]:
# Filtrando DataFrame que ocorreram para o segmento não é Home Officce e não é região South
df3[(df3.Segmento != "Home Office" ) & (df3.Regiao != "South")].sample(5)

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
8013,CA-2017-120168,2017-05-25,TB-21625,Consumer,United States,East,TEC-AC-10002167,Technology,Imation 8gb Micro Traveldrive Usb 2.0 Flash Drive,120.0,8.0
8841,US-2016-112396,2016-02-09,JR-16210,Corporate,United States,West,TEC-AC-10003628,Technology,Logitech 910-002974 M325 Wireless Mouse for We...,89.97,3.0
667,CA-2017-132682,2017-06-08,TH-21235,Corporate,United States,Central,TEC-PH-10004042,Technology,ClearOne Communications CHAT 70 OC Speaker Phone,381.576,3.0
374,US-2014-119137,2014-07-23,AG-10900,Consumer,United States,West,OFF-AR-10000658,Office Supplies,Newell 324,9.24,1.0
5548,CA-2014-159800,2014-11-28,SG-20470,Consumer,United States,West,OFF-AP-10004859,Office Supplies,Acco 6 Outlet Guardian Premium Surge Suppressor,43.68,3.0


## Função Pandas Group BY

In [109]:
# Média de `Valor_Venda` por `Segmento` e `Regiao`
df3[ ["Valor_Venda", "Segmento", "Regiao"] ].groupby(["Regiao", "Segmento"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Valor_Venda
Regiao,Segmento,Unnamed: 2_level_1
Central,Consumer,207.946728
Central,Corporate,234.763466
Central,Home Office,208.248046
East,Consumer,238.875539
East,Corporate,228.516929
East,Home Office,253.911805
South,Consumer,233.39018
South,Corporate,238.992025
South,Home Office,272.996329
West,Consumer,217.033955


## Grup By com Pandas

In [114]:
# Média de `Valor_Venda` por `Segmento` e `Regiao`
df3.groupby(by = ["Regiao", "Segmento"]).agg({"Valor_Venda":"mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Valor_Venda
Regiao,Segmento,Unnamed: 2_level_1
Central,Consumer,207.946728
Central,Corporate,234.763466
Central,Home Office,208.248046
East,Consumer,238.875539
East,Corporate,228.516929
East,Home Office,253.911805
South,Consumer,233.39018
South,Corporate,238.992025
South,Home Office,272.996329
West,Consumer,217.033955


## Agregação Múltipla com Group By

In [126]:
df3.groupby(by = ["Segmento", "Regiao"]).agg({"Valor_Venda":["count", "mean", "median", "std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Valor_Venda,Valor_Venda,Valor_Venda,Valor_Venda
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,median,std
Segmento,Regiao,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Consumer,Central,1212,207.946728,46.06,587.906523
Consumer,East,1469,238.875539,51.56,633.371169
Consumer,South,838,233.39018,58.195,559.346824
Consumer,West,1672,217.033955,59.52,551.997547
Corporate,Central,673,234.763466,42.24,818.947521
Corporate,East,877,228.516929,59.9,530.001654
Corporate,South,510,238.992025,49.64,586.176947
Corporate,West,960,235.265911,69.468,471.288764
Home Office,Central,438,208.248046,50.935,371.00918
Home Office,East,502,253.911805,52.715,722.777318


In [127]:
# Resetenado-se os indíce se tem
df3.groupby(by = ["Segmento", "Regiao"]).agg({"Valor_Venda":["count", "mean", "median", "std"]}).reset_index()

Unnamed: 0_level_0,Segmento,Regiao,Valor_Venda,Valor_Venda,Valor_Venda,Valor_Venda
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,median,std
0,Consumer,Central,1212,207.946728,46.06,587.906523
1,Consumer,East,1469,238.875539,51.56,633.371169
2,Consumer,South,838,233.39018,58.195,559.346824
3,Consumer,West,1672,217.033955,59.52,551.997547
4,Corporate,Central,673,234.763466,42.24,818.947521
5,Corporate,East,877,228.516929,59.9,530.001654
6,Corporate,South,510,238.992025,49.64,586.176947
7,Corporate,West,960,235.265911,69.468,471.288764
8,Home Office,Central,438,208.248046,50.935,371.00918
9,Home Office,East,502,253.911805,52.715,722.777318


> Filtrando DataFame do Pandas com Base em Strings

In [128]:
# Primeiras 5 linhas do DataFrame
df3.head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0
2,CA-2016-138688,2016-06-12,DV-13045,Corporate,United States,West,OFF-LA-10000240,Office Supplies,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0


In [130]:
# Filtrando o DataFrame pela coluna segmento com valores que iniciam com a letra 'con'
df3[df3.Segmento.str.startswith("Con")].head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0
5,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-FU-10001487,Furniture,Eldon Expressions Wood and Plastic Desk Access...,48.86,7.0


In [131]:
#Filtrando o DataFrame pela coluna con calores que terminan com as  letras 'mer'
df3[df3.Segmento.str.endswith("mer")].head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0
5,CA-2014-115812,2014-06-09,BH-11710,Consumer,United States,West,FUR-FU-10001487,Furniture,Eldon Expressions Wood and Plastic Desk Access...,48.86,7.0


In [133]:
# Contagem do número de categoría do string
df3.Segmento.value_counts()

Consumer       5191
Corporate      3020
Home Office    1783
Name: Segmento, dtype: int64

In [134]:
# tipo de objeto criado
type(df3.Segmento.value_counts())

pandas.core.series.Series

In [137]:
# verificando o quantidade de valores do segmento
df3[df3.Segmento.str.endswith("mer")].Segmento.value_counts()

Consumer    5191
Name: Segmento, dtype: int64

In [142]:
# Procurar um string em qualquer posição no campo categórico
df3.Segmento.str.contains("f").sum()

1783

## Split de String em DataFrames Pandas

In [143]:
# Primeiras 5 linhas do DataFrame
df3.head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0
2,CA-2016-138688,2016-06-12,DV-13045,Corporate,United States,West,OFF-LA-10000240,Office Supplies,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0


In [144]:
# Primeiras 5 linhas do campo "ID_Pedido"
df3.ID_Pedido.head()

0    CA-2016-152156
1    CA-2016-152156
2    CA-2016-138688
3    US-2015-108966
4    US-2015-108966
Name: ID_Pedido, dtype: object

In [147]:
# Split da coluna pelo carater '-'
df3.ID_Pedido.str.split("-").head()

0    [CA, 2016, 152156]
1    [CA, 2016, 152156]
2    [CA, 2016, 138688]
3    [US, 2015, 108966]
4    [US, 2015, 108966]
Name: ID_Pedido, dtype: object

In [157]:
# Estrair o ano da coluna `ID_Pedido`
df3["Ano"] = df3.ID_Pedido.str.split("-").str[1]
df3.head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade,Ano
0,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0,2016
1,CA-2016-152156,2016-11-08,CG-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,2016
2,CA-2016-138688,2016-06-12,DV-13045,Corporate,United States,West,OFF-LA-10000240,Office Supplies,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,2016
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0,2015
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0,2015


## Strip de Strings em DataFrames Pandas

In [158]:
df3.Data_Pedido.head()

0    2016-11-08
1    2016-11-08
2    2016-06-12
3    2015-10-11
4    2015-10-11
Name: Data_Pedido, dtype: object

In [162]:
# Remover os digitos '2' e '0' a esquerda String
df3.Data_Pedido.str.lstrip('20')

0       16-11-08
1       16-11-08
2       16-06-12
3       15-10-11
4       15-10-11
          ...   
9989    14-01-21
9990    17-02-26
9991    17-02-26
9992    17-02-26
9993    17-05-04
Name: Data_Pedido, Length: 9994, dtype: object

In [164]:
# Remover os digitos '2' e '6' a direita do String
df3.Data_Pedido.str.rstrip('26')

0       2016-11-08
1       2016-11-08
2        2016-06-1
3       2015-10-11
4       2015-10-11
           ...    
9989    2014-01-21
9990      2017-02-
9991      2017-02-
9992      2017-02-
9993    2017-05-04
Name: Data_Pedido, Length: 9994, dtype: object

In [178]:
# Remover os digitos '2' e '6' na posição inicial ou final do String
df3.Data_Pedido.str.strip("2")

0       016-11-08
1       016-11-08
2        016-06-1
3       015-10-11
4       015-10-11
          ...    
9989    014-01-21
9990    017-02-26
9991    017-02-26
9992    017-02-26
9993    017-05-04
Name: Data_Pedido, Length: 9994, dtype: object

## Replace no Data Frame Pandas

In [181]:
# Sustitui-se os carateres 'C' e 'G' da coluna 'ID_Cliente'
df3.ID_Cliente = df3.ID_Cliente.str.replace("CG", "AX")

In [183]:
# Verificação
df3.ID_Cliente

0       AX-12520
1       AX-12520
2       DV-13045
3       SO-20335
4       SO-20335
          ...   
9989    TB-21400
9990    DB-13060
9991    DB-13060
9992    DB-13060
9993    CC-12220
Name: ID_Cliente, Length: 9994, dtype: object

## Combinação de Strings em DataFrames do Pandas

In [184]:
# Primeiras 5 linhas do DataFrame
df3.head()

Unnamed: 0,ID_Pedido,Data_Pedido,ID_Cliente,Segmento,Pais,Regiao,ID_Produto,Categoria,Nome_Produto,Valor_Venda,Quantidade,Ano
0,CA-2016-152156,2016-11-08,AX-12520,Consumer,United States,South,FUR-BO-10001798,Furniture,Bush Somerset Collection Bookcase,261.96,3.0,2016
1,CA-2016-152156,2016-11-08,AX-12520,Consumer,United States,South,FUR-CH-10000454,Furniture,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,2016
2,CA-2016-138688,2016-06-12,DV-13045,Corporate,United States,West,OFF-LA-10000240,Office Supplies,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,2016
3,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,FUR-TA-10000577,Furniture,Bretford CR4500 Series Slim Rectangular Table,957.5775,5.0,2015
4,US-2015-108966,2015-10-11,SO-20335,Consumer,United States,South,OFF-ST-10000760,Office Supplies,Eldon Fold 'N Roll Cart System,22.368,2.0,2015


In [186]:
# Concatenas `ID_pedido` e  `Segmento`
df3["Pedido_Segmento"] = df3.ID_Pedido.str.cat(df3.Segmento, "-")


In [187]:
# Verificação
df3["Pedido_Segmento"].head()

0     CA-2016-152156-Consumer
1     CA-2016-152156-Consumer
2    CA-2016-138688-Corporate
3     US-2015-108966-Consumer
4     US-2015-108966-Consumer
Name: Pedido_Segmento, dtype: object