## Modificando tipos de series no Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
arq = 'drinks.csv'
drinks = pd.read_csv(arq)
drinks.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'population', 'continent'],
      dtype='object')

## Verificando os tipos do Dataset

In [3]:
# utilizando o dtypes
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

In [4]:
drinks.head(10)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,population,continent
0,Afghanistan,0,0,0,0.0,34.660.000,Asia
1,Albania,89,132,54,4.9,2.876.000,Europe
2,Algeria,25,0,14,0.7,40.061.000,Africa
3,Andorra,245,138,312,12.4,77.281.000,Europe
4,Angola,217,57,45,5.9,28.081.000,Africa
5,Antigua & Barbuda,102,128,45,4.9,86.295,North America
6,Argentina,193,25,221,8.3,43.085.000,South America
7,Armenia,21,179,11,3.8,2.925.000,Europe
8,Australia,261,72,212,10.4,24.013.000,Oceania
9,Austria,279,75,191,9.7,8.725.111,Europe


In [5]:
# modificando o tipo em uma série existente com ASTYPE()
drinks['beer_servings'] = drinks.beer_servings.astype(float)

In [6]:
drinks.head(10)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,population,continent
0,Afghanistan,0.0,0,0,0.0,34.660.000,Asia
1,Albania,89.0,132,54,4.9,2.876.000,Europe
2,Algeria,25.0,0,14,0.7,40.061.000,Africa
3,Andorra,245.0,138,312,12.4,77.281.000,Europe
4,Angola,217.0,57,45,5.9,28.081.000,Africa
5,Antigua & Barbuda,102.0,128,45,4.9,86.295,North America
6,Argentina,193.0,25,221,8.3,43.085.000,South America
7,Armenia,21.0,179,11,3.8,2.925.000,Europe
8,Australia,261.0,72,212,10.4,24.013.000,Oceania
9,Austria,279.0,75,191,9.7,8.725.111,Europe


In [7]:
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

In [8]:
drinks['wine_servings'] = drinks.wine_servings.astype(float)

In [9]:
drinks.head(10)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,population,continent
0,Afghanistan,0.0,0,0.0,0.0,34.660.000,Asia
1,Albania,89.0,132,54.0,4.9,2.876.000,Europe
2,Algeria,25.0,0,14.0,0.7,40.061.000,Africa
3,Andorra,245.0,138,312.0,12.4,77.281.000,Europe
4,Angola,217.0,57,45.0,5.9,28.081.000,Africa
5,Antigua & Barbuda,102.0,128,45.0,4.9,86.295,North America
6,Argentina,193.0,25,221.0,8.3,43.085.000,South America
7,Armenia,21.0,179,11.0,3.8,2.925.000,Europe
8,Australia,261.0,72,212.0,10.4,24.013.000,Oceania
9,Austria,279.0,75,191.0,9.7,8.725.111,Europe


In [10]:
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                   float64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

In [11]:
# Alternativamente, podemos modificar o tipo enquanto lemos uma série.
drinks = pd.read_csv(arq, dtype={'spirit_servings': float})
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                 float64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

## Lendo outro Dataset

In [12]:
arquivo = 'Sanduiches.txt'
vendas = pd.read_table(arquivo)
vendas.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [13]:
# Observe a coluna item_price
vendas.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

In [14]:
# convertendo uma string em um float e trazendo a média
vendas.item_price.str.replace('$','').astype(float).mean()

7.464335785374397

## Mais um pouco sobre booleanos

In [15]:
# Já aprendemos a utilizar o contains, certo?
vendas.item_name.str.contains('Chicken').head(10)

0    False
1    False
2    False
3    False
4     True
5     True
6    False
7    False
8    False
9    False
Name: item_name, dtype: bool

In [16]:
# Vamos converter uma serie booleana em integer (False = 0, True = 1)
vendas.item_name.str.contains('Chicken').astype(int).head(20)

0     0
1     0
2     0
3     0
4     1
5     1
6     0
7     0
8     0
9     0
10    0
11    1
12    1
13    1
14    0
15    0
16    1
17    1
18    0
19    1
Name: item_name, dtype: int64

In [19]:
# Podemos fazer o procedimento com uma variável? 
vendas_int = vendas.item_name.str.contains('Chicken').astype(int)
vendas_int.head(20)

0     0
1     0
2     0
3     0
4     1
5     1
6     0
7     0
8     0
9     0
10    0
11    1
12    1
13    1
14    0
15    0
16    1
17    1
18    0
19    1
Name: item_name, dtype: int64

## Quantos sanduíches contém Tomato?

In [23]:
vendas.item_name.str.contains('Tomato').astype(int).sum()

111

In [24]:
# Como é trabalhada a programação acima?
# Passo01 - vendas.item_name.str.contains('Tomato').astype(int)
# Passo02 - Somente aplicamos o sum() na série inteira, que somará as ocorrências verdadeiras, ou seja, 1.

passo01 = vendas.item_name.str.contains('Tomato').astype(int)
passo02 = passo01.sum()
passo02

111

## Será que vendem Coca-Cola?

In [25]:
vendas.item_name.str.contains('Coke').astype(int).sum()

0

In [26]:
vendas

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [28]:
# Coke não está na coluna item_name, mas sim na choise_description
vendas.choice_description.str.contains('Coke').astype(int).sum()

ValueError: cannot convert float NaN to integer