# Manipulating a dataset structure
Applying some operations to change the DataFrame structure and filter its content

In [None]:
# 1-) Import pandas (and NumPy, used later for dtype checks and masks)
# 2-) Read `data/dados_veiculos.csv` into the DataFrame `tb_veic`
# 3-) Make some quick analysis (done in the following cells)
import pandas as pd
import numpy as np
tb_veic = pd.read_csv('data/dados_veiculos.csv')

Como podemos ver abaixo, a tabela contém 15 colunas, sendo 9 numéricas e 6 de objetos. Além disso todas as linhas parecem ter todas as variáveis preenchidas.

In [None]:
# Print a summary of the table: columns, non-null counts and dtypes
tb_veic.info()

Como podemos verificar o que as colunas com tipo 'object' contêm? 

## Check DataFrame Column Names

Rename all columns at once:
- `data.columns` is an **attribute** of the DataFrame which results in a list-like of the column names
    - You can substitute it by another list containing the names you want 
    - Note you have to substitute the whole set of column names at once
    
- `data.rename()` is a **method** of a DataFrame, with which you can rename one or more columns at once
    - You just need to pass a dictionary containing {'old_name':'new_name'} 
    - By default, it changes the names of the **index** (`axis=0`); you can specify `axis=1` to change **column** names
    - By default it returns a new DataFrame; pass the `inplace=True` argument to modify the existing DataFrame instead

In [None]:
# Show the current column labels of the table
print(tb_veic.columns)

In [None]:
# The columns attribute supports positional access: label of the first column
print(tb_veic.columns[0])

In [None]:
# Select the first column of the table through its label
coluna_0 = tb_veic[tb_veic.columns[0]]

In [None]:
# Scalar type behind the dtype of the first column (e.g. np.object_ for text).
# Uses `coluna_0`, the column selected in the previous cell; `a` is not
# defined at this point of the notebook.
tipo_coluna_0 = coluna_0.dtype.type

In [None]:
# True when the first column is stored with the generic 'object' dtype
tipo_coluna_0 == np.object_

In [None]:
# Describe every column stored with the 'object' dtype; other columns are skipped.
for col_name in tb_veic.columns:
    series = tb_veic[col_name]
    if series.dtype.type != np.object_:
        continue
    print(series.describe())

### Substituting `.columns` attribute

In [None]:
# Column labels before any renaming
print(tb_veic.columns)

In [None]:
# Keep a plain-list copy of the current labels so they can be restored later
old_column_names = list(tb_veic.columns)

In [None]:
# Replace the whole set of labels at once: lower-case, spaces become underscores
tb_veic.columns = [name.lower().replace(' ', '_') for name in tb_veic.columns]

In [None]:
# Display the table to check the new column labels
tb_veic

In [None]:
# Restore the labels saved in old_column_names and confirm the result
tb_veic.columns = old_column_names
print(tb_veic.columns)

### Using the `.rename()` method

`.rename({'old_column':'new_column'})`

In [None]:
# Rename a single column and store the returned DataFrame back in tb_veic
tb_veic = tb_veic.rename(columns={'Year': 'model_year'})
print(tb_veic.columns)

In [None]:
# Put the saved labels back before the next renaming example
tb_veic.columns = old_column_names

In [None]:
# rename() returns a new DataFrame; the result is not stored here, so the
# labels printed below are still the ones tb_veic had before the call.
tb_veic.rename({'Year' : 'model_year'}, axis = 1)
print(tb_veic.columns)

In [None]:
# With inplace=True the renaming is applied to tb_veic itself
tb_veic.rename({'Year' : 'model_year'}, axis = 1, inplace = True)
print(tb_veic.columns)

In [None]:
# Map each current label to a normalised one:
# lower-case, spaces -> '_', and '/' -> '_by_'
dict_rename = {
    name: name.lower().replace(' ', '_').replace('/', '_by_')
    for name in tb_veic.columns
}
print(dict_rename)

In [None]:
# Apply the old -> new mapping to the column labels of tb_veic itself
tb_veic.rename(columns=dict_rename, inplace=True)
print(tb_veic.columns)

So, we have two options:
> 1. store it again on the variable `data`: 

    data = data.rename(columns={'Make':'Manufacturer', 'Year':'ANO'})
> 2. Use the inplace argument `inplace =  True` to change the values within the dataframe automatically

    data.rename(columns={'Make':'Manufacturer', 'Year':'ANO'}, inplace=True)
    
O parâmetro 'inplace' será deprecado e seu uso é considerado má prática.

## Reorder columns in a dataframe

Remember: You always pass a list of columns to access a dataframe

In [None]:
# Current column order
print(tb_veic.columns)

In [None]:
# Selecting with a list of labels returns those columns in the order listed
tb_veic[['model', 'make']]

In [None]:
# Column labels as a plain list in sorted order
lista_colunas = sorted(tb_veic.columns)
print(lista_colunas)

In [None]:
# Same table with the columns arranged in the sorted order built above
tb_veic[lista_colunas]

## Remove column (or row)

- The `.drop()` method
- By default, `.drop()` drops a row given its index.

In [None]:
# Returns a copy of the table without the 'make' column (tb_veic is not changed)
tb_veic.drop(columns='make')

In [None]:
# Several columns can be dropped at once by passing a list of labels
tb_veic.drop(columns=['make', 'model_year'])

## Deep vs Shallow copy on pandas

In [None]:
# Plain assignment: a second name for the very same DataFrame object
copia_tb_veic = tb_veic
# Shallow copy requested explicitly with deep=False
copy_tb_veic = tb_veic.copy(deep = False)
deepcopy_tb_veic = tb_veic.copy(deep = True) # default behaviour of the method

In [None]:
# Two lists built separately with equal contents are distinct objects,
# so the identity check below evaluates to False.
a = [1, 2, 3]
b = [1, 2, 3]
a is b

In [None]:
# Assignment binds a second name to the object already referenced by b
c = b
print(c)

In [None]:
# c was assigned from b, so both names refer to the same object
c is b

In [None]:
# Assignment does not copy the DataFrame: both names refer to one object
tb_veic_2 = tb_veic
tb_veic_2 is tb_veic

In [None]:
# copy() builds a new DataFrame object, so the identity check is False
tb_veic_2 = tb_veic.copy()
tb_veic_2 is tb_veic

## Sort Values in a DataFrame

In [None]:
# Order the rows by model year (ascending, the default) and keep the result
tb_veic = tb_veic.sort_values(by='model_year')
tb_veic

In [None]:
# Order the rows by model year, newest first
tb_veic = tb_veic.sort_values(by='model_year', ascending=False)
tb_veic

In [None]:
# Two sort keys with one direction each:
# model year descending, then engine displacement ascending
tb_veic = tb_veic.sort_values(
    by=['model_year', 'engine_displacement'],
    ascending=[False, True],
)
tb_veic

In [None]:
# A single boolean applies the same direction (descending) to both sort keys
tb_veic = tb_veic.sort_values(
    by=['model_year', 'engine_displacement'],
    ascending=False,
)
tb_veic

# Filter records
>    - `mask` concept
>    - `.query()` method

This is really important for data wrangling.

## Simple Example: Starting with a numpy array. How can I filter the values of a list?

In [None]:
# Odd numbers from 1 up to (but not including) 10
meu_array = np.arange(1, 10, 2)
print(meu_array)

The result of `meu_array > 5` is what is called **a mask**: an array containing the `True` and `False` results of applying the comparison to each element. 

In [None]:
# Element-wise comparison: one True/False per element of the array
print(meu_array > 5)

Masks can be used as an index to select data!

In [None]:
# Indexing with the mask keeps only the elements where it is True
print(meu_array[meu_array > 5])

After selecting, you can do anything with the result, for example assign it to a variable. This kind of operation is called a `vectorized` operation: it is applied to all elements at once.

In [None]:
# Store the selected elements in a new variable
meu_array_filtrado = meu_array[meu_array > 5]
print(meu_array_filtrado)

You can also save the condition

In [None]:
# The mask itself can be stored and reused later
num_gt5 = meu_array > 5
print(num_gt5)

## Bitwise logical operators - Combining conditions

To make more than one condition together, you can use 
- `&` - analogous to `and`
- `|` - analogous to `or` 

For example, get all numbers from `meu_array` that are greater than 3 and smaller than 8

Let's do it in steps:
- get values greater than 3

In [None]:
# Mask of the elements greater than 3, printed next to the array for comparison
num_gt3 = meu_array > 3
print(meu_array)
print(num_gt3)

- get values smaller than 8

In [None]:
# Mask of the elements smaller than 8, printed next to the array for comparison
num_st8 = meu_array < 8
print(meu_array)
print(num_st8)

- get values greater than 3 AND smaller than 8

In [None]:
# Element-wise AND: True only where both masks are True
num_3a8 = num_st8 & num_gt3
print(meu_array)
print(num_3a8)

- get values greater than 3 OR smaller than 8

In [None]:
# Element-wise OR: True where at least one of the two masks is True.
# NOTE: this reuses the name num_3a8, overwriting the AND result.
num_3a8 = num_st8 | num_gt3
print(meu_array)
print(num_3a8)

## Now in a DataFrame

Let's find the rows in which the Cylinders values are exactly 6.

In [None]:
# Boolean mask: True for the rows whose 'cylinders' value equals 6
cyl_6 = tb_veic['cylinders'] == 6
print(cyl_6)
# Count the True values with the vectorized Series.sum() rather than the
# builtin sum(), which iterates the Series element by element in Python.
int(cyl_6.sum())

In [None]:
# Keep only the rows where the mask is True
tb_veic_cyl6 = tb_veic[cyl_6] 
# a shorter form would be:
# tb_veic_cyl6 = tb_veic[tb_veic['cylinders'] == 6]
tb_veic_cyl6

In [None]:
# Summary statistics of the filtered table
tb_veic_cyl6.describe()

### You can combine conditions

Cars from `Ford` and 6 `Cylinders`

In [None]:
# Check the available column labels before building the conditions
print(tb_veic.columns)

In [None]:
# Distinct values found in the 'make' column
print(tb_veic['make'].unique())

In [None]:
# Build one mask per condition, then combine them with & (element-wise AND)
is_ford = tb_veic['make'] == 'Ford'
has_6_cylinders = tb_veic['cylinders'] == 6
tb_veic[is_ford & has_6_cylinders]

### Using conditions to create new columns

In [None]:
# Work on a copy so the new columns are not added to tb_veic
tb_veic_2 = tb_veic.copy()

In [None]:
# Create the 'cyl_6' flag with .loc and a mask.
# The mask is built from tb_veic_2 itself (not from tb_veic), so the condition
# always refers to the same frame that is being modified.
mask_cyl_6 = tb_veic_2['cylinders'] == 6
tb_veic_2.loc[mask_cyl_6, 'cyl_6'] = True
tb_veic_2.loc[~mask_cyl_6, 'cyl_6'] = False
tb_veic_2

In [None]:
# Summary statistics of the copy after adding the new column
tb_veic_2.describe()

In [None]:
# Label each row by its city_mpg band:
#   below 15 -> 'C', from 15 up to (not including) 20 -> 'B', 20 or more -> 'A'
tb_veic_2.loc[tb_veic_2['city_mpg']  < 15, 'eff_city'] = 'C'
tb_veic_2.loc[(tb_veic_2['city_mpg']  >= 15) & (tb_veic_2['city_mpg']  < 20), 'eff_city'] = 'B'
tb_veic_2.loc[tb_veic_2['city_mpg']  >= 20, 'eff_city'] = 'A'

In [None]:
# Number of rows in each efficiency class
tb_veic_2['eff_city'].value_counts()

In [None]:
# Same classification, written with two reusable masks and ~ (element-wise NOT).
# NOTE(review): the names suggest 15 and 20 are the 25th/75th percentiles of
# city_mpg -- confirm against the data.
p25_citympg = tb_veic_2['city_mpg']  < 15
p75_citympg = tb_veic_2['city_mpg']  < 20
tb_veic_2.loc[p25_citympg, 'eff_city'] = 'C'
tb_veic_2.loc[~p25_citympg & p75_citympg, 'eff_city'] = 'B'
tb_veic_2.loc[~p75_citympg, 'eff_city'] = 'A'

In [None]:
# Number of rows in each efficiency class after the mask-based assignment
tb_veic_2['eff_city'].value_counts()

In [None]:
# np.where(condition, value_if_true, value_if_false) builds the new column
tb_veic['cyl_6'] = np.where(tb_veic['cylinders'] == 6, True, False)
print(tb_veic['cyl_6'].describe())
# Count the flagged rows with the vectorized Series.sum() instead of the
# builtin sum(), which walks the column element by element in Python.
int(tb_veic['cyl_6'].sum())

In [None]:
# Flag the rows that satisfy both conditions (make is Ford AND 6 cylinders)
tb_veic['cyl_6_ford'] = np.where(
    (tb_veic['make'] == 'Ford') & (tb_veic['cylinders'] == 6),
    True,
    False,
)
print(tb_veic['cyl_6_ford'].describe())
# Count the flagged rows with the vectorized Series.sum() instead of the
# builtin sum(), which walks the column element by element in Python.
int(tb_veic['cyl_6_ford'].sum())

In [None]:
# Number of rows where city_mpg is greater than highway_mpg, counted with
# the vectorized Series.sum() instead of the builtin sum().
int((tb_veic['city_mpg'] > tb_veic['highway_mpg']).sum())

In [None]:
# np.where also accepts whole columns as the two alternatives: for each row,
# divide by highway_mpg when city_mpg is the larger value, otherwise by city_mpg.
# NOTE(review): if the columns are grams/mile and miles/gallon, grams per
# gallon would be their product rather than their quotient -- confirm intent.
city_above_highway = tb_veic['city_mpg'] > tb_veic['highway_mpg']
co2_per_mile = tb_veic['co2_emission_grams_by_mile']
tb_veic['co2grams_by_gallon'] = np.where(
    city_above_highway,
    co2_per_mile / tb_veic['highway_mpg'],
    co2_per_mile / tb_veic['city_mpg'],
)
print(tb_veic['co2grams_by_gallon'].describe())

In [None]:
# Nested np.where: the inner call is the "otherwise" branch of the outer one.
# below 15 -> 'C'; otherwise below 20 -> 'B'; everything else -> 'A'
below_15 = tb_veic['city_mpg'] < 15
below_20 = tb_veic['city_mpg'] < 20
b_or_a = np.where(below_20, 'B', 'A')
tb_veic['eff_city'] = np.where(below_15, 'C', b_or_a)
tb_veic['eff_city'].value_counts()

## Another way to do the same thing

* using the method `query`

The method `query` receives a string in which you can say your condition. Important things:
- `.query()` is a method of your dataframe
- `.query()` method receives a string 
- Every word inside the string that is not `quoted` is considered a variable of your dataframe (so, for example, `.query('Year == 1999')` will look for the variable `Year`). Another example: if you try to run `.query('Make == Ford')`, it will look for both the column named `Make` and the column named `Ford`. If you want the values of the column `Make` to match the **string** Ford, you have to run `.query('Make == "Ford"')`
- If your column has spaces, you have to call it using backticks like in **.query('\`Engine Displacement\` < 4')**:

In [None]:
# Rows whose model_year equals 2016, with the condition written as a string
tb_veic.query('model_year == 2016')

In [None]:
# Chained queries: each call filters the result of the previous one.
# Parentheses allow the chain to span several lines without backslashes.
(
    tb_veic
    .query('make == "Ford"')
    .query('cylinders == 6')
    .query('model_year == 2017')
)
