# Manipulating a dataset structure
Applying some operations to change DataFrame strucutre and filtering content

In [1]:
import pandas as pd
import numpy as np
tb_veic = pd.read_csv('data/dados_veiculos.csv')

Como podemos ver abaixo, a tabela contém 15 colunas, sendo 9 numéricas e 6 de objetos. Além disso todas as linhas parecem ter todas as variáveis preenchidas.

In [2]:
tb_veic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Make                     35952 non-null  object 
 1   Model                    35952 non-null  object 
 2   Year                     35952 non-null  int64  
 3   Engine Displacement      35952 non-null  float64
 4   Cylinders                35952 non-null  float64
 5   Transmission             35952 non-null  object 
 6   Drivetrain               35952 non-null  object 
 7   Vehicle Class            35952 non-null  object 
 8   Fuel Type                35952 non-null  object 
 9   Fuel Barrels/Year        35952 non-null  float64
 10  City MPG                 35952 non-null  int64  
 11  Highway MPG              35952 non-null  int64  
 12  Combined MPG             35952 non-null  int64  
 13  CO2 Emission Grams/Mile  35952 non-null  float64
 14  Fuel Cost/Year        

Como podemos verificar o que as colunas com tipo 'object' contém? 

## Check DataFrame Column Names

Rename all columns at once:
- `data.columns` is an **attribute** of the DataFrame which results in a list-like of the column names
    - You can substitute it by another list containing the names you want 
    - Note you have to substitute the whole set of column names at once
    
- `data.rename()` is a **method** of a DataFrame, in which you can rename one column at once
    - You just need to pass a dictionary containing {'old_name':'new_name'} 
    - By default, it changes names of a **index** (`axis=0`), you can specify `axis=1` to change **column** names
    - the `inplace` argument

In [3]:
print(tb_veic.columns)

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year'],
      dtype='object')


In [16]:
print(tb_veic.columns[2])

Year


In [5]:
coluna_0 = tb_veic[tb_veic.columns[0]]

In [17]:
tipo_coluna_0 = coluna_0.dtypes.type
print(tipo_coluna_0)

<class 'numpy.object_'>


In [18]:
tipo_coluna_0 == np.object_

True

In [19]:
tb_veic.columns

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')

In [9]:
for column in tb_veic.columns:
    if tb_veic[column].dtypes.type == np.object_:
        print(tb_veic[column].describe())

count         35952
unique          127
top       Chevrolet
freq           3643
Name: Make, dtype: object
count               35952
unique               3608
top       F150 Pickup 2WD
freq                  197
Name: Model, dtype: object
count               35952
unique                 45
top       Automatic 4-spd
freq                10585
Name: Transmission, dtype: object
count                 35952
unique                    8
top       Front-Wheel Drive
freq                  13044
Name: Drivetrain, dtype: object
count            35952
unique              34
top       Compact Cars
freq              5185
Name: Vehicle Class, dtype: object
count       35952
unique         13
top       Regular
freq        23587
Name: Fuel Type, dtype: object


In [21]:
tb_veic.describe()

Unnamed: 0,Year,Engine Displacement,Cylinders,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
count,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0
mean,2000.7164,3.338493,5.765076,17.609056,17.646139,23.880646,19.929322,475.316339,1892.598465
std,10.08529,1.359395,1.755268,4.467283,4.769349,5.890876,5.112409,119.060773,506.958627
min,1984.0,0.6,2.0,0.06,6.0,9.0,7.0,37.0,600.0
25%,1991.0,2.2,4.0,14.699423,15.0,20.0,16.0,395.0,1500.0
50%,2001.0,3.0,6.0,17.347895,17.0,24.0,19.0,467.736842,1850.0
75%,2010.0,4.3,6.0,20.600625,20.0,27.0,23.0,555.4375,2200.0
max,2017.0,8.4,16.0,47.087143,58.0,61.0,56.0,1269.571429,5800.0


In [11]:
tb_veic['id'] = tb_veic['Make'] + tb_veic['Model'] + tb_veic['Year'].astype('str')

In [12]:
tb_veic['id'].value_counts()

JeepCherokee/Wagoneer1985               23
GMCC15 Pickup 2WD1984                   17
ChevroletS10 Pickup 2WD1984             17
ChevroletC10 Pickup 2WD1984             17
GMCS15 Pickup 2WD1984                   17
                                        ..
InfinitiQX4 4WD1998                      1
ChevroletVan 1500 AWD Conversion2003     1
GMCYukon XL 2500 2WD2013                 1
Mercedes-BenzC63 AMG2009                 1
VolvoS80 FWD2006                         1
Name: id, Length: 16664, dtype: int64

### Substituting `.columns` attribute

In [22]:
print(tb_veic.columns)

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')


In [23]:
old_column_names = list(tb_veic.columns)

In [24]:
print(old_column_names)

['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders', 'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type', 'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG', 'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id']


In [25]:
print([column.lower().replace(' ', '_') for column in list(tb_veic.columns)])

['make', 'model', 'year', 'engine_displacement', 'cylinders', 'transmission', 'drivetrain', 'vehicle_class', 'fuel_type', 'fuel_barrels/year', 'city_mpg', 'highway_mpg', 'combined_mpg', 'co2_emission_grams/mile', 'fuel_cost/year', 'id']


In [28]:
tb_veic.columns = [column.lower().replace(' ', '_') for column in tb_veic.columns]

In [27]:
tb_veic

Unnamed: 0,make,model,year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels/year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams/mile,fuel_cost/year,id
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,AM GeneralDJ Po Vehicle 2WD1984
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,AM GeneralFJ8c Post Office1984
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,AM GeneralPost Office DJ5 2WD1985
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,AM GeneralPost Office DJ8 2WD1985
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550,ASC IncorporatedGNX1987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,smartfortwo coupe2013
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100,smartfortwo coupe2014
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,smartfortwo coupe2015
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100,smartfortwo coupe2016


In [29]:
tb_veic.columns = old_column_names
print(tb_veic.columns)

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')


### Using `.rename() method`

`.rename({'old_column':'new_column'})`

In [30]:
dict_nomes = dict()
dict_nomes['Year'] = 'model_year'
dict_nomes['year'] = 'model_year'
tb_veic = tb_veic.rename(dict_nomes, axis = 1)
print(tb_veic.columns)

Index(['Make', 'Model', 'model_year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')


In [31]:
tb_veic.columns = old_column_names
print(tb_veic.columns)

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')


In [32]:
tb_veic = tb_veic.rename({'Year' : 'model_year'}, axis = 1)
print(tb_veic.columns)

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')


In [33]:
tb_veic.rename({'Year' : 'model_year'}, axis = 1)
print(tb_veic.columns)

Index(['Make', 'Model', 'model_year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year', 'id'],
      dtype='object')


In [36]:
dict_rename = {column : column.lower().replace(' ', '_').replace('/', '_by_') for column in tb_veic.columns}


{'Make': 'make',
 'Model': 'model',
 'model_year': 'model_year',
 'Engine Displacement': 'engine_displacement',
 'Cylinders': 'cylinders',
 'Transmission': 'transmission',
 'Drivetrain': 'drivetrain',
 'Vehicle Class': 'vehicle_class',
 'Fuel Type': 'fuel_type',
 'Fuel Barrels/Year': 'fuel_barrels_by_year',
 'City MPG': 'city_mpg',
 'Highway MPG': 'highway_mpg',
 'Combined MPG': 'combined_mpg',
 'CO2 Emission Grams/Mile': 'co2_emission_grams_by_mile',
 'Fuel Cost/Year': 'fuel_cost_by_year',
 'id': 'id'}

In [37]:
tb_veic = tb_veic.rename(dict_rename, axis = 1) #tb_veic.rename(dict_rename, axis = 1, inplace = True)
print(tb_veic.columns)

Index(['make', 'model', 'model_year', 'engine_displacement', 'cylinders',
       'transmission', 'drivetrain', 'vehicle_class', 'fuel_type',
       'fuel_barrels_by_year', 'city_mpg', 'highway_mpg', 'combined_mpg',
       'co2_emission_grams_by_mile', 'fuel_cost_by_year', 'id'],
      dtype='object')


So, we have two options:
> 1. store it again on the variable `data`: 

    data = data.rename(columns={'Make':'Manufacturer', 'Year':'ANO'})
> 2. Use the inplace argument `inplace =  True` to change the values within the dataframe automatically

    data.rename(columns={'Make':'Manufacturer', 'Year':'ANO'}, inplace=True)
    
O parâmetro 'inplace' será deprecado e seu uso é considerado má prática.

## Reorder columns in a dataframe

Remember: You always pass a list of columns to access a dataframe

In [39]:
print(tb_veic.columns)

Index(['make', 'model', 'model_year', 'engine_displacement', 'cylinders',
       'transmission', 'drivetrain', 'vehicle_class', 'fuel_type',
       'fuel_barrels_by_year', 'city_mpg', 'highway_mpg', 'combined_mpg',
       'co2_emission_grams_by_mile', 'fuel_cost_by_year', 'id'],
      dtype='object')


In [40]:
tb_veic['model']

0          DJ Po Vehicle 2WD
1           FJ8c Post Office
2        Post Office DJ5 2WD
3        Post Office DJ8 2WD
4                        GNX
                ...         
35947           fortwo coupe
35948           fortwo coupe
35949           fortwo coupe
35950           fortwo coupe
35951           fortwo coupe
Name: model, Length: 35952, dtype: object

In [42]:
tb_veic[['make', 'model']]

Unnamed: 0,make,model
0,AM General,DJ Po Vehicle 2WD
1,AM General,FJ8c Post Office
2,AM General,Post Office DJ5 2WD
3,AM General,Post Office DJ8 2WD
4,ASC Incorporated,GNX
...,...,...
35947,smart,fortwo coupe
35948,smart,fortwo coupe
35949,smart,fortwo coupe
35950,smart,fortwo coupe


In [43]:
lista_colunas = list(tb_veic.columns)
lista_colunas.sort()
print(lista_colunas)

['city_mpg', 'co2_emission_grams_by_mile', 'combined_mpg', 'cylinders', 'drivetrain', 'engine_displacement', 'fuel_barrels_by_year', 'fuel_cost_by_year', 'fuel_type', 'highway_mpg', 'id', 'make', 'model', 'model_year', 'transmission', 'vehicle_class']


In [45]:
tb_veic = tb_veic[lista_colunas]

Unnamed: 0,city_mpg,co2_emission_grams_by_mile,combined_mpg,cylinders,drivetrain,engine_displacement,fuel_barrels_by_year,fuel_cost_by_year,fuel_type,highway_mpg,id,make,model,model_year,transmission,vehicle_class
0,18,522.764706,17,4.0,2-Wheel Drive,2.5,19.388824,1950,Regular,17,AM GeneralDJ Po Vehicle 2WD1984,AM General,DJ Po Vehicle 2WD,1984,Automatic 3-spd,Special Purpose Vehicle 2WD
1,13,683.615385,13,6.0,2-Wheel Drive,4.2,25.354615,2550,Regular,13,AM GeneralFJ8c Post Office1984,AM General,FJ8c Post Office,1984,Automatic 3-spd,Special Purpose Vehicle 2WD
2,16,555.437500,16,4.0,Rear-Wheel Drive,2.5,20.600625,2100,Regular,17,AM GeneralPost Office DJ5 2WD1985,AM General,Post Office DJ5 2WD,1985,Automatic 3-spd,Special Purpose Vehicle 2WD
3,13,683.615385,13,6.0,Rear-Wheel Drive,4.2,25.354615,2550,Regular,13,AM GeneralPost Office DJ8 2WD1985,AM General,Post Office DJ8 2WD,1985,Automatic 3-spd,Special Purpose Vehicle 2WD
4,14,555.437500,16,6.0,Rear-Wheel Drive,3.8,20.600625,2550,Premium,21,ASC IncorporatedGNX1987,ASC Incorporated,GNX,1987,Automatic 4-spd,Midsize Cars
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,34,244.000000,36,3.0,Rear-Wheel Drive,1.0,9.155833,1100,Premium,38,smartfortwo coupe2013,smart,fortwo coupe,2013,Auto(AM5),Two Seaters
35948,34,243.000000,36,3.0,Rear-Wheel Drive,1.0,9.155833,1100,Premium,38,smartfortwo coupe2014,smart,fortwo coupe,2014,Auto(AM5),Two Seaters
35949,34,244.000000,36,3.0,Rear-Wheel Drive,1.0,9.155833,1100,Premium,38,smartfortwo coupe2015,smart,fortwo coupe,2015,Auto(AM5),Two Seaters
35950,34,246.000000,36,3.0,Rear-Wheel Drive,0.9,9.155833,1100,Premium,39,smartfortwo coupe2016,smart,fortwo coupe,2016,Auto(AM6),Two Seaters


## Remove column (or row)

- The `.drop()` method
- By default, `.drop()` drops a row given its index.

In [46]:
tb_veic.drop('make', axis = 1)

Unnamed: 0,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
0,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,AM GeneralDJ Po Vehicle 2WD1984
1,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,AM GeneralFJ8c Post Office1984
2,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,AM GeneralPost Office DJ5 2WD1985
3,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,AM GeneralPost Office DJ8 2WD1985
4,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550,ASC IncorporatedGNX1987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,smartfortwo coupe2013
35948,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100,smartfortwo coupe2014
35949,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,smartfortwo coupe2015
35950,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100,smartfortwo coupe2016


In [51]:
tb_veic.drop(['make', 'model_year'])


KeyError: "['make' 'model_year'] not found in axis"

## Deep vs Shallow copy on pandas

In [None]:
copia_tb_veic = tb_veic
copy_tb_veic = tb_veic.copy(deep = False)
deepcopy_tb_veic = tb_veic.copy(deep = True) # comportamento padrão do método

In [55]:
a = [1, 2, 3]
b = [1, 2, 3]

In [57]:
a is b
a == b

True

In [58]:
c = b
print(c)

[1, 2, 3]


In [59]:
c is b

True

In [62]:
a = 1
b = 1
a is b

True

In [60]:
tb_veic_2 = tb_veic
tb_veic_2 is tb_veic

True

In [61]:
tb_veic_2 = tb_veic.copy()
tb_veic_2 is tb_veic

False

## Sort Values in a DatFrame

In [63]:
tb_veic = tb_veic.sort_values('model_year')
tb_veic

Unnamed: 0,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,AM GeneralDJ Po Vehicle 2WD1984
16337,GMC,T15 (S15) Pickup 4WD,1984,2.8,6.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Standard Pickup Trucks 4WD,Regular,19.388824,15,21,17,522.764706,1950,GMCT15 (S15) Pickup 4WD1984
16336,GMC,T15 (S15) Pickup 4WD,1984,2.8,6.0,Manual 5-spd,4-Wheel or All-Wheel Drive,Standard Pickup Trucks 4WD,Regular,18.311667,15,22,18,493.722222,1850,GMCT15 (S15) Pickup 4WD1984
5636,Chevrolet,El Camino Pickup 2WD,1984,3.8,6.0,Automatic 3-spd,2-Wheel Drive,Standard Pickup Trucks 2WD,Regular,19.388824,15,19,17,522.764706,1950,ChevroletEl Camino Pickup 2WD1984
5637,Chevrolet,El Camino Pickup 2WD,1984,3.8,6.0,Automatic 4-spd,2-Wheel Drive,Standard Pickup Trucks 2WD,Regular,18.311667,15,22,18,493.722222,1850,ChevroletEl Camino Pickup 2WD1984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,BMW,340i xDrive,2017,3.0,6.0,Automatic (S8),All-Wheel Drive,Compact Cars,Premium,13.184400,21,31,25,357.000000,1600,BMW340i xDrive2017
3137,Bentley,Continental GT,2017,4.0,8.0,Automatic (S8),All-Wheel Drive,Compact Cars,Premium,17.347895,15,25,19,475.000000,2150,BentleyContinental GT2017
1965,BMW,340i,2017,3.0,6.0,Manual 6-spd,Rear-Wheel Drive,Compact Cars,Premium,14.330870,19,29,23,395.000000,1750,BMW340i2017
21091,Land Rover,Range Rover Evoque Convertible,2017,2.0,4.0,Automatic (S9),4-Wheel Drive,Small Sport Utility Vehicle 4WD,Premium,14.330870,20,28,23,391.000000,1750,Land RoverRange Rover Evoque Convertible2017


In [66]:
tb_veic = tb_veic.sort_values('model_year', ascending = False)
tb_veic

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
0,20450,Kia,Forte 5,2017,1.6,4.0,Manual 6-spd,Front-Wheel Drive,Large Cars,Regular,13.184400,23,29,25,353.000000,1350,KiaForte 52017
563,29264,Porsche,911 Targa 4S,2017,3.0,6.0,Manual 7-spd,4-Wheel Drive,Minicompact Cars,Premium,14.982273,19,28,22,396.000000,1850,Porsche911 Targa 4S2017
565,140,Acura,MDX AWD,2017,3.5,6.0,Automatic (S9),All-Wheel Drive,Small Sport Utility Vehicle 4WD,Premium,14.982273,19,26,22,404.000000,1850,AcuraMDX AWD2017
566,3895,Cadillac,CTS AWD,2017,2.0,4.0,Automatic (S8),All-Wheel Drive,Midsize Cars,Premium,13.733750,21,29,24,368.000000,1700,CadillacCTS AWD2017
567,20480,Kia,Forte Koup,2017,1.6,4.0,Auto(AM7),Front-Wheel Drive,Compact Cars,Regular,12.207778,25,32,27,328.000000,1250,KiaForte Koup2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35522,13777,Ford,Ranger Pickup 2WD,1984,2.8,6.0,Manual 5-spd,2-Wheel Drive,Small Pickup Trucks 2WD,Regular,18.311667,16,22,18,493.722222,1850,FordRanger Pickup 2WD1984
35523,31003,Subaru,Brat 4WD,1984,1.8,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,13.733750,22,27,24,370.291667,1400,SubaruBrat 4WD1984
35524,31002,Subaru,Brat 4WD,1984,1.8,4.0,Automatic 3-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,15.695714,20,23,21,423.190476,1600,SubaruBrat 4WD1984
35525,26752,Nissan,Pickup 2WD,1984,2.0,4.0,Manual 5-spd,2-Wheel Drive,Small Pickup Trucks 2WD,Regular,12.207778,24,31,27,329.148148,1250,NissanPickup 2WD1984


In [67]:
tb_veic = tb_veic.sort_values(['model_year', 'engine_displacement'], ascending = [False, True])
tb_veic

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
626,35939,smart,fortwo cabriolet,2017,0.9,3.0,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Premium,9.694412,31,38,34,258.000000,1200,smartfortwo cabriolet2017
627,35938,smart,fortwo cabriolet,2017,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.417429,33,38,35,251.000000,1150,smartfortwo cabriolet2017
336,25667,Mitsubishi,Mirage G4,2017,1.2,3.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,8.908378,35,42,37,237.000000,900,MitsubishiMirage G42017
337,25666,Mitsubishi,Mirage G4,2017,1.2,3.0,Manual 5-spd,Front-Wheel Drive,Compact Cars,Regular,9.417429,33,40,35,249.000000,950,MitsubishiMirage G42017
338,25665,Mitsubishi,Mirage,2017,1.2,3.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,8.451538,37,43,39,226.000000,850,MitsubishiMirage2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35498,14493,GMC,C25 Pickup 2WD,1984,6.2,8.0,Manual 4-spd,2-Wheel Drive,Standard Pickup Trucks 2WD,Diesel,20.115000,18,22,19,535.789474,1900,GMCC25 Pickup 2WD1984
35488,14476,GMC,C15 Suburban 2WD,1984,6.2,8.0,Automatic 4-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Diesel,21.232500,16,22,18,565.555556,2000,GMCC15 Suburban 2WD1984
35502,14490,GMC,C25 Pickup 2WD,1984,6.2,8.0,Manual 4-spd,2-Wheel Drive,Standard Pickup Trucks 2WD,Diesel,19.109250,18,22,20,509.000000,1800,GMCC25 Pickup 2WD1984
35486,14477,GMC,C15 Suburban 2WD,1984,6.2,8.0,Automatic 4-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Diesel,22.481471,15,19,17,598.823529,2150,GMCC15 Suburban 2WD1984


In [68]:
tb_veic = tb_veic.sort_values(['model_year', 'engine_displacement', 'make'], ascending = False)
tb_veic

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
670,10889,Dodge,Viper,2017,8.4,10.0,Manual 6-spd,Rear-Wheel Drive,Two Seaters,Premium,23.543571,12,19,14,623.000000,2900,DodgeViper2017
227,3191,Bentley,Mulsanne,2017,6.8,8.0,Automatic (S8),Rear-Wheel Drive,Midsize Cars,Premium,23.543571,11,18,14,652.000000,2900,BentleyMulsanne2017
464,30043,Rolls-Royce,Phantom Coupe,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,23.543571,11,19,14,638.000000,2900,Rolls-RoycePhantom Coupe2017
69,30064,Rolls-Royce,Phantom EWB,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Large Cars,Premium,23.543571,11,19,14,637.000000,2900,Rolls-RoycePhantom EWB2017
350,30053,Rolls-Royce,Phantom Drophead Coupe,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,23.543571,11,19,14,637.000000,2900,Rolls-RoycePhantom Drophead Coupe2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35363,16887,Grumman Olson,Kubvan,1984,1.6,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Diesel,14.155000,27,27,27,377.037037,1350,Grumman OlsonKubvan1984
35308,31980,Suzuki,SJ 410 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ 410 4WD1984
35349,31982,Suzuki,SJ 410V 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ 410V 4WD1984
35360,31984,Suzuki,SJ410K P/U 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Small Pickup Trucks 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ410K P/U 4WD1984


# Filter records
>    - `mask` concept
>    - `.query()` method

This is really important for data wrangling.

## Simple Example: Starting with a numpy array. How can I filter the values of a list?

In [70]:
meu_array = np.array(range(1,10,2))
print(meu_array)

[1 3 5 7 9]


The results of `my_array > 5` is what is called **a mask**. A result containing the `True` and `False` results of an operation. 

In [71]:
print(meu_array > 5)

[False False False  True  True]


Masks can be used as an index to select data!

In [72]:
print(meu_array[meu_array > 5])

[7 9]


After selecting, you can do anything with it, for example assigning it. This operation is called a `vectorial` operation. It is done all at once.

In [73]:
meu_array_filtrado = meu_array[meu_array > 5]
print(meu_array_filtrado)

[7 9]


You can also save the condition

In [74]:
num_gt5 = meu_array > 5
print(num_gt5)

[False False False  True  True]


## Bitwise logical operators - Combining conditions

To make more than one condition together, you can use 
- `&` - analogous to `and`
- `|` - analogous to `or` 

For example, get all numbers from my_array that are greater than 3 and smaller than 8

Let's do it in steps:
- get values greater than 3

In [76]:
num_gt3 = meu_array > 3
print(meu_array)
print(num_gt3)
print(meu_array[num_gt3])

[1 3 5 7 9]
[False False  True  True  True]
[5 7 9]


- get values smaller than 8

In [77]:
num_st8 = meu_array < 8
print(meu_array)
print(num_st8)
print(meu_array[num_st8])

[1 3 5 7 9]
[ True  True  True  True False]
[1 3 5 7]


- get values greater than 3 AND smaller than 8

In [79]:
num_3a8 = num_st8 & num_gt3
print(meu_array)
print(num_3a8)
print(meu_array[num_3a8])

[1 3 5 7 9]
[False False  True  True False]
[5 7]


- get values greater than 3 OR smaller than 8

In [80]:
num_3a8 = num_st8 | num_gt3
print(meu_array)
print(num_3a8)

[1 3 5 7 9]
[ True  True  True  True  True]


## Now in a DataFrame

Let's find the rows in which the Cylinders values are exactly 6.

In [81]:
cyl_6 = tb_veic['cylinders'] == 6
print(cyl_6)
sum(cyl_6)

670      False
227      False
464      False
69       False
350      False
         ...  
35363    False
35308    False
35349    False
35360    False
35359    False
Name: cylinders, Length: 35952, dtype: bool


12765

In [82]:
#tb_veic_cyl6 = tb_veic[cyl_6] 
# mais abreviado seria:
tb_veic_cyl6 = tb_veic[tb_veic['cylinders'] == 6]
tb_veic_cyl6

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
65,16088,GMC,Sierra C15 2WD,2017,4.3,6.0,Automatic 6-spd,Rear-Wheel Drive,Standard Pickup Trucks 2WD,Gasoline or E85,16.480500,18,24,20,448.000000,1650,GMCSierra C15 2WD2017
240,16168,GMC,Sierra K15 4WD,2017,4.3,6.0,Automatic 6-spd,4-Wheel Drive,Standard Pickup Trucks 4WD,Gasoline or E85,17.347895,17,22,19,474.000000,1750,GMCSierra K15 4WD2017
58,7207,Chevrolet,Silverado K15 4WD,2017,4.3,6.0,Automatic 6-spd,4-Wheel Drive,Standard Pickup Trucks 4WD,Gasoline or E85,17.347895,17,22,19,473.000000,1750,ChevroletSilverado K15 4WD2017
63,7131,Chevrolet,Silverado C15 2WD,2017,4.3,6.0,Automatic 6-spd,Rear-Wheel Drive,Standard Pickup Trucks 2WD,Gasoline or E85,16.480500,18,24,20,448.000000,1650,ChevroletSilverado C15 2WD2017
619,29301,Porsche,911 Turbo Cabriolet,2017,3.8,6.0,Auto(AM-S7),All-Wheel Drive,Minicompact Cars,Premium,15.695714,19,24,21,430.000000,1950,Porsche911 Turbo Cabriolet2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35861,7457,Chevrolet,T10 (S10) Blazer 4WD,1984,2.8,6.0,Manual 5-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,19.388824,15,22,17,522.764706,1950,ChevroletT10 (S10) Blazer 4WD1984
35858,7455,Chevrolet,T10 (S10) Blazer 4WD,1984,2.8,6.0,Automatic 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,18.311667,15,22,18,493.722222,1850,ChevroletT10 (S10) Blazer 4WD1984
35859,7456,Chevrolet,T10 (S10) Blazer 4WD,1984,2.8,6.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,19.388824,15,20,17,522.764706,1950,ChevroletT10 (S10) Blazer 4WD1984
35853,6772,Chevrolet,S10 Cab Chassis 2WD,1984,2.8,6.0,Automatic 4-spd,2-Wheel Drive,Special Purpose Vehicles,Regular,25.354615,13,13,13,683.615385,2550,ChevroletS10 Cab Chassis 2WD1984


In [83]:
tb_veic_cyl6.describe()

Unnamed: 0,index,model_year,engine_displacement,cylinders,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year
count,12765.0,12765.0,12765.0,12765.0,12765.0,12765.0,12765.0,12765.0,12765.0,12765.0
mean,16781.892754,2001.294242,3.439342,6.0,18.086572,16.328946,22.661261,18.606189,487.609906,1943.19624
std,10303.764189,9.575349,0.549034,0.0,2.579958,2.130492,3.747362,2.603443,69.240664,274.865226
min,1.0,1984.0,1.8,6.0,0.109412,9.0,10.0,10.0,206.0,1200.0
25%,7698.0,1993.0,3.0,6.0,16.4805,15.0,20.0,17.0,444.35,1750.0
50%,16366.0,2002.0,3.5,6.0,17.347895,16.0,23.0,19.0,467.736842,1950.0
75%,25901.0,2009.0,3.8,6.0,19.388824,18.0,25.0,20.0,522.764706,2150.0
max,35904.0,2017.0,5.3,6.0,32.961,32.0,38.0,31.0,888.7,3400.0


### You can combine conditions

Cars from `Ford` and 6 `Cylinders`

In [84]:
print(tb_veic.columns)

Index(['index', 'make', 'model', 'model_year', 'engine_displacement',
       'cylinders', 'transmission', 'drivetrain', 'vehicle_class', 'fuel_type',
       'fuel_barrels_by_year', 'city_mpg', 'highway_mpg', 'combined_mpg',
       'co2_emission_grams_by_mile', 'fuel_cost_by_year', 'id'],
      dtype='object')


In [85]:
print(tb_veic['make'].unique())

['Dodge' 'Bentley' 'Rolls-Royce' 'Lamborghini' 'Jeep' 'Ferrari' 'GMC'
 'Chevrolet' 'Cadillac' 'Mercedes-Benz' 'Aston Martin' 'Toyota' 'Ram'
 'Lexus' 'Nissan' 'Ford' 'Audi' 'Kia' 'Jaguar' 'Genesis' 'Porsche'
 'Maserati' 'BMW' 'McLaren Automotive' 'Lincoln' 'Infiniti' 'Volkswagen'
 'Subaru' 'Chrysler' 'Buick' 'Lotus' 'Honda' 'Acura' 'Hyundai' 'Mazda'
 'Mitsubishi' 'Volvo' 'MINI' 'Land Rover' 'Fiat' 'smart' 'Pagani'
 'Roush Performance' 'Scion' 'Alfa Romeo' 'Bugatti' 'SRT'
 'Mobility Ventures LLC' 'VPG' 'Suzuki' 'Maybach' 'Saab' 'Fisker'
 'Mercury' 'Spyker' 'Mahindra' 'Hummer' 'Saturn' 'Pontiac'
 'Saleen Performance' 'Isuzu' 'Tecstar, LP' 'Shelby' 'BMW Alpina'
 'Oldsmobile' 'Daewoo' 'Morgan' 'London Taxi' 'Plymouth' 'Qvale'
 'Quantum Technologies' 'Eagle' 'Vector' 'Geo' 'Saleen' 'Federal Coach'
 'Dabryan Coach Builders Inc' 'Panoz Auto-Development' 'Panos'
 'J.K. Motors' 'Autokraft Limited' 'PAS, Inc' 'CX Automotive'
 'Wallace Environmental' 'PAS Inc - GMC' 'Isis Imports Ltd'
 'Import Tra

In [86]:
True and False

False

In [87]:
print(tb_veic['make'] == 'Ford')

670      False
227      False
464      False
69       False
350      False
         ...  
35363    False
35308    False
35349    False
35360    False
35359    False
Name: make, Length: 35952, dtype: bool


In [88]:
cond_ford_6 = ((tb_veic['make'] == 'Ford') & (tb_veic['cylinders'] == 6)) | ((tb_veic['make'] == 'Bentley') & (tb_veic['cylinders'] == 6))

print(cond_ford_6)
sum(cond_ford_6)

670      False
227      False
464      False
69       False
350      False
         ...  
35363    False
35308    False
35349    False
35360    False
35359    False
Length: 35952, dtype: bool


1293

In [91]:
tb_ford_cyl6 = tb_veic[cond_ford_6]
tb_ford_cyl6

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id
474,14355,Ford,Transit T150 Wagon FFV,2017,3.7,6.0,Automatic (S6),Rear-Wheel Drive,"Vans, Passenger Type",Gasoline or E85,20.600625,14,18,16,570.000000,2100,FordTransit T150 Wagon FFV2017
785,13680,Ford,Mustang,2017,3.7,6.0,Automatic (S6),Rear-Wheel Drive,Subcompact Cars,Regular,15.695714,18,27,21,423.000000,1600,FordMustang2017
816,13681,Ford,Mustang,2017,3.7,6.0,Manual 6-spd,Rear-Wheel Drive,Subcompact Cars,Regular,15.695714,18,27,21,428.000000,1600,FordMustang2017
516,14352,Ford,Transit T150 Wagon,2017,3.5,6.0,Automatic (S6),Rear-Wheel Drive,"Vans, Passenger Type",Regular,20.600625,15,19,16,552.000000,2100,FordTransit T150 Wagon2017
146,12359,Ford,Expedition 4WD,2017,3.5,6.0,Automatic (S6),Part-time 4-Wheel Drive,Standard Sport Utility Vehicle 4WD,Regular,19.388824,15,20,17,537.000000,1950,FordExpedition 4WD2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35516,13771,Ford,Ranger Pickup 2WD,1984,2.8,6.0,Automatic 3-spd,2-Wheel Drive,Small Pickup Trucks 2WD,Regular,18.311667,16,21,18,493.722222,1850,FordRanger Pickup 2WD1984
35519,13772,Ford,Ranger Pickup 2WD,1984,2.8,6.0,Manual 4-spd,2-Wheel Drive,Small Pickup Trucks 2WD,Regular,16.480500,18,23,20,444.350000,1650,FordRanger Pickup 2WD1984
35520,13775,Ford,Ranger Pickup 2WD,1984,2.8,6.0,Automatic 3-spd,2-Wheel Drive,Small Pickup Trucks 2WD,Regular,19.388824,15,19,17,522.764706,1950,FordRanger Pickup 2WD1984
35521,13776,Ford,Ranger Pickup 2WD,1984,2.8,6.0,Manual 4-spd,2-Wheel Drive,Small Pickup Trucks 2WD,Regular,17.347895,17,22,19,467.736842,1750,FordRanger Pickup 2WD1984


### Using conditions to create new columns

In [92]:
tb_veic_2 = tb_veic.copy()

In [95]:
tb_veic_2.loc[tb_veic['cylinders'] == 6,'make']

array(['GMC', 'Chevrolet', 'Porsche', 'Nissan', 'Kia', 'Genesis',
       'Lincoln', 'Infiniti', 'Ford', 'Volkswagen', 'Subaru', 'Ram',
       'Jeep', 'Dodge', 'Chrysler', 'Cadillac', 'Buick', 'Toyota',
       'Mercedes-Benz', 'Lotus', 'Lexus', 'Honda', 'Acura', 'Hyundai',
       'Maserati', 'Jaguar', 'BMW', 'Audi', 'Volvo', 'Mitsubishi',
       'Land Rover', 'Mazda', 'Suzuki', 'Saab', 'Mercury', 'Saturn',
       'Pontiac', 'Isuzu', 'Oldsmobile', 'Daewoo', 'Plymouth',
       'Aston Martin', 'Eagle', 'Alfa Romeo', 'PAS, Inc', 'J.K. Motors',
       'CX Automotive', 'PAS Inc - GMC', 'Wallace Environmental',
       'Import Trade Services', 'Sterling', 'Goldacre', 'Peugeot',
       'Merkur', 'Environmental Rsch and Devp Corp',
       'Ruf Automobile Gmbh', 'JBA Motorcars, Inc.',
       'American Motors Corporation', 'Bitter Gmbh and Co. Kg',
       'ASC Incorporated', 'Lambda Control Systems',
       'TVR Engineering Ltd', 'Vixen Motor Company', 'AM General'],
      dtype=object)

In [96]:
tb_veic_2.loc[tb_veic['cylinders'] == 6, 'cyl_6'] = True
tb_veic_2

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id,cyl_6
670,10889,Dodge,Viper,2017,8.4,10.0,Manual 6-spd,Rear-Wheel Drive,Two Seaters,Premium,23.543571,12,19,14,623.000000,2900,DodgeViper2017,
227,3191,Bentley,Mulsanne,2017,6.8,8.0,Automatic (S8),Rear-Wheel Drive,Midsize Cars,Premium,23.543571,11,18,14,652.000000,2900,BentleyMulsanne2017,
464,30043,Rolls-Royce,Phantom Coupe,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,23.543571,11,19,14,638.000000,2900,Rolls-RoycePhantom Coupe2017,
69,30064,Rolls-Royce,Phantom EWB,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Large Cars,Premium,23.543571,11,19,14,637.000000,2900,Rolls-RoycePhantom EWB2017,
350,30053,Rolls-Royce,Phantom Drophead Coupe,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,23.543571,11,19,14,637.000000,2900,Rolls-RoycePhantom Drophead Coupe2017,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35363,16887,Grumman Olson,Kubvan,1984,1.6,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Diesel,14.155000,27,27,27,377.037037,1350,Grumman OlsonKubvan1984,
35308,31980,Suzuki,SJ 410 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ 410 4WD1984,
35349,31982,Suzuki,SJ 410V 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ 410V 4WD1984,
35360,31984,Suzuki,SJ410K P/U 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Small Pickup Trucks 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ410K P/U 4WD1984,


In [98]:
tb_veic_2[tb_veic['cylinders'] == 6]

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id,cyl_6
65,16088,GMC,Sierra C15 2WD,2017,4.3,6.0,Automatic 6-spd,Rear-Wheel Drive,Standard Pickup Trucks 2WD,Gasoline or E85,16.480500,18,24,20,448.000000,1650,GMCSierra C15 2WD2017,True
240,16168,GMC,Sierra K15 4WD,2017,4.3,6.0,Automatic 6-spd,4-Wheel Drive,Standard Pickup Trucks 4WD,Gasoline or E85,17.347895,17,22,19,474.000000,1750,GMCSierra K15 4WD2017,True
58,7207,Chevrolet,Silverado K15 4WD,2017,4.3,6.0,Automatic 6-spd,4-Wheel Drive,Standard Pickup Trucks 4WD,Gasoline or E85,17.347895,17,22,19,473.000000,1750,ChevroletSilverado K15 4WD2017,True
63,7131,Chevrolet,Silverado C15 2WD,2017,4.3,6.0,Automatic 6-spd,Rear-Wheel Drive,Standard Pickup Trucks 2WD,Gasoline or E85,16.480500,18,24,20,448.000000,1650,ChevroletSilverado C15 2WD2017,True
619,29301,Porsche,911 Turbo Cabriolet,2017,3.8,6.0,Auto(AM-S7),All-Wheel Drive,Minicompact Cars,Premium,15.695714,19,24,21,430.000000,1950,Porsche911 Turbo Cabriolet2017,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35861,7457,Chevrolet,T10 (S10) Blazer 4WD,1984,2.8,6.0,Manual 5-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,19.388824,15,22,17,522.764706,1950,ChevroletT10 (S10) Blazer 4WD1984,True
35858,7455,Chevrolet,T10 (S10) Blazer 4WD,1984,2.8,6.0,Automatic 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,18.311667,15,22,18,493.722222,1850,ChevroletT10 (S10) Blazer 4WD1984,True
35859,7456,Chevrolet,T10 (S10) Blazer 4WD,1984,2.8,6.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,19.388824,15,20,17,522.764706,1950,ChevroletT10 (S10) Blazer 4WD1984,True
35853,6772,Chevrolet,S10 Cab Chassis 2WD,1984,2.8,6.0,Automatic 4-spd,2-Wheel Drive,Special Purpose Vehicles,Regular,25.354615,13,13,13,683.615385,2550,ChevroletS10 Cab Chassis 2WD1984,True


In [99]:
tb_veic_2.loc[tb_veic['cylinders'] != 6, 'cyl_6'] = False
tb_veic_2

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id,cyl_6
670,10889,Dodge,Viper,2017,8.4,10.0,Manual 6-spd,Rear-Wheel Drive,Two Seaters,Premium,23.543571,12,19,14,623.000000,2900,DodgeViper2017,False
227,3191,Bentley,Mulsanne,2017,6.8,8.0,Automatic (S8),Rear-Wheel Drive,Midsize Cars,Premium,23.543571,11,18,14,652.000000,2900,BentleyMulsanne2017,False
464,30043,Rolls-Royce,Phantom Coupe,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,23.543571,11,19,14,638.000000,2900,Rolls-RoycePhantom Coupe2017,False
69,30064,Rolls-Royce,Phantom EWB,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Large Cars,Premium,23.543571,11,19,14,637.000000,2900,Rolls-RoycePhantom EWB2017,False
350,30053,Rolls-Royce,Phantom Drophead Coupe,2017,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,23.543571,11,19,14,637.000000,2900,Rolls-RoycePhantom Drophead Coupe2017,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35363,16887,Grumman Olson,Kubvan,1984,1.6,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Diesel,14.155000,27,27,27,377.037037,1350,Grumman OlsonKubvan1984,False
35308,31980,Suzuki,SJ 410 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ 410 4WD1984,False
35349,31982,Suzuki,SJ 410V 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Special Purpose Vehicle 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ 410V 4WD1984,False
35360,31984,Suzuki,SJ410K P/U 4WD,1984,1.0,4.0,Manual 4-spd,4-Wheel or All-Wheel Drive,Small Pickup Trucks 4WD,Regular,14.982273,22,22,22,403.954545,1500,SuzukiSJ410K P/U 4WD1984,False


In [100]:
tb_veic_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35952 entries, 670 to 35359
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       35952 non-null  int64  
 1   make                        35952 non-null  object 
 2   model                       35952 non-null  object 
 3   model_year                  35952 non-null  int64  
 4   engine_displacement         35952 non-null  float64
 5   cylinders                   35952 non-null  float64
 6   transmission                35952 non-null  object 
 7   drivetrain                  35952 non-null  object 
 8   vehicle_class               35952 non-null  object 
 9   fuel_type                   35952 non-null  object 
 10  fuel_barrels_by_year        35952 non-null  float64
 11  city_mpg                    35952 non-null  int64  
 12  highway_mpg                 35952 non-null  int64  
 13  combined_mpg                3

In [101]:
tb_veic_2.describe()

Unnamed: 0,index,model_year,engine_displacement,cylinders,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year
count,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0
mean,17975.5,2000.7164,3.338493,5.765076,17.609056,17.646139,23.880646,19.929322,475.316339,1892.598465
std,10378.592776,10.08529,1.359395,1.755268,4.467283,4.769349,5.890876,5.112409,119.060773,506.958627
min,0.0,1984.0,0.6,2.0,0.06,6.0,9.0,7.0,37.0,600.0
25%,8987.75,1991.0,2.2,4.0,14.699423,15.0,20.0,16.0,395.0,1500.0
50%,17975.5,2001.0,3.0,6.0,17.347895,17.0,24.0,19.0,467.736842,1850.0
75%,26963.25,2010.0,4.3,6.0,20.600625,20.0,27.0,23.0,555.4375,2200.0
max,35951.0,2017.0,8.4,16.0,47.087143,58.0,61.0,56.0,1269.571429,5800.0


In [102]:
tb_veic_2.loc[tb_veic_2['city_mpg']  < 15, 'eff_city'] = 'C'
tb_veic_2.loc[(tb_veic_2['city_mpg']  >= 15) & (tb_veic_2['city_mpg']  < 20), 'eff_city'] = 'B'
tb_veic_2.loc[tb_veic_2['city_mpg']  >= 20, 'eff_city'] = 'A'

In [103]:
tb_veic_2['eff_city'].value_counts()

B    17601
A     9879
C     8472
Name: eff_city, dtype: int64

In [105]:
p25_citympg = tb_veic_2['city_mpg']  < 15
p75_citympg = tb_veic_2['city_mpg'] < 20

In [107]:
print(p75_citympg)

670       True
227       True
464       True
69        True
350       True
         ...  
35363    False
35308    False
35349    False
35360    False
35359     True
Name: city_mpg, Length: 35952, dtype: bool


In [108]:
tb_veic_2.loc[p25_citympg, 'eff_city'] = 'C'
tb_veic_2.loc[~p25_citympg & p75_citympg, 'eff_city'] = 'B'
tb_veic_2.loc[~p75_citympg, 'eff_city'] = 'A'

In [109]:
tb_veic_2['eff_city'].value_counts()

B    17601
A     9879
C     8472
Name: eff_city, dtype: int64

In [110]:
# np.where(CONDICAO, VALOR_SE_VERDADEIRO, VALOR_SE_FALSO)
tb_veic['cyl_6'] = np.where(tb_veic['cylinders'] == 6, True, False)
print(tb_veic['cyl_6'].describe())
sum(tb_veic['cyl_6'])

count     35952
unique        2
top       False
freq      23187
Name: cyl_6, dtype: object


12765

In [111]:
tb_veic['cyl_6_ford'] = np.where((tb_veic['make'] == 'Ford') & (tb_veic['cylinders'] == 6),
                                 True,
                                 False)
print(tb_veic['cyl_6_ford'].describe())
sum(tb_veic['cyl_6_ford'])

count     35952
unique        2
top       False
freq      34659
Name: cyl_6_ford, dtype: object


1293

In [113]:
tb_veic['seila'] = tb_veic['cylinders'] * tb_veic['highway_mpg']
tb_veic.describe()

Unnamed: 0,index,model_year,engine_displacement,cylinders,fuel_barrels_by_year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,seila
count,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0
mean,17975.5,2000.7164,3.338493,5.765076,17.609056,17.646139,23.880646,19.929322,475.316339,1892.598465,130.949905
std,10378.592776,10.08529,1.359395,1.755268,4.467283,4.769349,5.890876,5.112409,119.060773,506.958627,28.964032
min,0.0,1984.0,0.6,2.0,0.06,6.0,9.0,7.0,37.0,600.0,40.0
25%,8987.75,1991.0,2.2,4.0,14.699423,15.0,20.0,16.0,395.0,1500.0,108.0
50%,17975.5,2001.0,3.0,6.0,17.347895,17.0,24.0,19.0,467.736842,1850.0,128.0
75%,26963.25,2010.0,4.3,6.0,20.600625,20.0,27.0,23.0,555.4375,2200.0,150.0
max,35951.0,2017.0,8.4,16.0,47.087143,58.0,61.0,56.0,1269.571429,5800.0,264.0


In [114]:
sum(tb_veic['city_mpg'] > tb_veic['highway_mpg'])

183

In [115]:
tb_veic['co2grams_by_gallon'] = np.where(tb_veic['city_mpg'] > tb_veic['highway_mpg'],
                               tb_veic['co2_emission_grams_by_mile']/tb_veic['highway_mpg'],
                               tb_veic['co2_emission_grams_by_mile']/tb_veic['city_mpg'],
                              )
print(tb_veic['co2grams_by_gallon'].describe())

count    35952.000000
mean        30.523495
std         16.005434
min          1.000000
25%         19.900000
50%         27.513932
75%         37.029167
max        211.595238
Name: co2grams_by_gallon, dtype: float64


In [116]:
tb_veic['eff_city'] = np.where(tb_veic['city_mpg'] < 15, 'C',
                               np.where(tb_veic['city_mpg'] < 20, 'B', 'A'))
tb_veic['eff_city'].value_counts()

B    17601
A     9879
C     8472
Name: eff_city, dtype: int64

## Another way to do the same thing

* using the method `query`

The method `query` receives a string in which you can say your condition. Important things:
- `.query()` is a method of your dataframe
- `.query()` method receives a string 
- Every word inside the string that is not `quoted` is considered a variable of your dataframe (so, for example `.query('Year == 1999')` will look for the variable `Year`. Another example: if you try to run `.query('Make == Ford')` will look both for the column name `Make` and the column named `Ford`. If you want the results of the column `Make` to match the **string** Ford, you have to run `.query('Make == "Ford"')`
- If your column has spaces, you have to call it using backticks like in **.query('\`Engine Displacement\` < 4')**:

In [117]:
tb_veic.query('model_year == 2016')

Unnamed: 0,index,make,model,model_year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,...,highway_mpg,combined_mpg,co2_emission_grams_by_mile,fuel_cost_by_year,id,cyl_6,cyl_6_ford,seila,co2grams_by_gallon,eff_city
1113,10911,Dodge,Viper SRT,2016,8.4,10.0,Manual 6-spd,Rear-Wheel Drive,Two Seaters,Premium,...,21,15,579.0,2700,DodgeViper SRT2016,False,False,210.0,48.250000,C
1952,3190,Bentley,Mulsanne,2016,6.8,8.0,Automatic (S8),Rear-Wheel Drive,Midsize Cars,Premium,...,18,13,656.0,3100,BentleyMulsanne2016,False,False,144.0,59.636364,C
1970,30052,Rolls-Royce,Phantom Drophead Coupe,2016,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,...,19,14,637.0,2900,Rolls-RoycePhantom Drophead Coupe2016,False,False,228.0,57.909091,C
1958,30063,Rolls-Royce,Phantom EWB,2016,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Large Cars,Premium,...,19,14,637.0,2900,Rolls-RoycePhantom EWB2016,False,False,228.0,57.909091,C
1904,30042,Rolls-Royce,Phantom Coupe,2016,6.7,12.0,Automatic (S8),Rear-Wheel Drive,Compact Cars,Premium,...,19,14,638.0,2900,Rolls-RoycePhantom Coupe2016,False,False,228.0,58.000000,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1251,13310,Ford,Focus FWD,2016,1.0,3.0,Manual 6-spd,Front-Wheel Drive,Compact Cars,Regular,...,42,35,255.0,950,FordFocus FWD2016,False,False,126.0,8.500000,A
1271,13314,Ford,Focus FWD,2016,1.0,3.0,Automatic (S6),Front-Wheel Drive,Compact Cars,Regular,...,40,32,272.0,1050,FordFocus FWD2016,False,False,120.0,9.714286,A
1740,35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,...,39,36,246.0,1100,smartfortwo coupe2016,False,False,117.0,7.235294,A
1005,35951,smart,fortwo coupe,2016,0.9,3.0,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Premium,...,39,35,255.0,1150,smartfortwo coupe2016,False,False,117.0,7.968750,A


In [119]:
tb_veic.\
    query('make == "Ford"').\
    query('cylinders == 6').\
    query('model_year == 2017')


SyntaxError: invalid syntax (<ipython-input-119-67027e98fa60>, line 1)