# Pandas: Importing and Exporting

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install openpyxl

In [None]:
import xlrd

In [None]:
import pandas as pd

## Import **delimited files**

In [None]:
# Peek at the first raw lines of the file to spot its delimiter before parsing it.
# `with` closes the handle even if reading fails; readlines(10000) is a size hint
# that stops after roughly 10 kB, so a large file is never loaded in full.
with open('data/dados_veiculos.csv', 'rb') as file:
    file_lines = file.readlines(10000)
# Slicing (instead of indexing 0..4) tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('utf-8'))

The most common format is the comma-separated values (`.csv`) file: a text file whose fields are separated (delimited) by commas or by some other character.

Examples:
    
**-- comma separated file --**

    name,year,value
    Andre,2020,100
    Fernanda,1900,1
    
**-- tab separated file --**

    name    year    value
    Andre    2020    100
    Fernanda    1900    1
    
**-- tab separated file (another way) --**

    name\tyear\tvalue
    Andre\t2020\t100
    Fernanda\t1900\t1
    
**-- hash separated file --**

    name#year#value
    Andre#2020#100
    Fernanda#1900#1

**-- pipe separated file --**

    name|year|value
    Andre|2020|100
    Fernanda|1900|1

    ...

### Comma Separated Files

In [None]:
# No assignment here: the parsed DataFrame is only shown as the cell output.
pd.read_csv('data/dados_veiculos.csv')

Called this way, the `pd.read_csv()` function just outputs the result on screen.

To keep the table for later use, we have to **assign** the result to a variable.

In [None]:
# Keep the parsed table in a variable so that later cells can use it.
tb_veic = pd.read_csv('data/dados_veiculos.csv')

In [None]:
# Summarise the columns, their dtypes and non-null counts.
tb_veic.info()

### Import tab-delimited file


In [None]:
# First attempt: read with the read_csv defaults, i.e. assuming a comma separator.
tb_oscsp = pd.read_csv('data/tb_empenho_cnpj_osc.txt')

In [None]:
# Inspect the raw lines to find out which delimiter the file really uses.
# `with` guarantees the handle is closed even if reading fails.
with open('data/tb_empenho_cnpj_osc.txt', 'rb') as file:
    file_lines = file.readlines(10000)  # size hint: stop after roughly 10 kB
# Slicing tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('utf-8'))

In [None]:
# Second attempt: declare the tab character as the field separator.
tb_oscsp = pd.read_csv(
    'data/tb_empenho_cnpj_osc.txt',
    sep='\t',
)
tb_oscsp.info()

In [None]:
# Third attempt: also tell pandas that ',' is the decimal mark, so values
# written with a decimal comma can be parsed as numbers.
tb_oscsp = pd.read_csv(
    'data/tb_empenho_cnpj_osc.txt',
    sep='\t',
    decimal=',',
)
tb_oscsp.info()

In [None]:
# Display the table to inspect the parsed values.
tb_oscsp

In [None]:
# Final version: same separator and decimal mark, decoding the bytes as Latin-1.
tb_oscsp = pd.read_csv(
    'data/tb_empenho_cnpj_osc.txt',
    sep='\t',
    decimal=',',
    encoding='latin-1',
)

In [None]:
# Display the table again to compare the text columns after setting the encoding.
tb_oscsp

### Import pipe delimited file 

In [None]:
# Inspect the raw lines to identify the delimiter (this file is decoded as Latin-1).
# `with` guarantees the handle is closed even if reading fails.
with open('data/eletricidade_india.csv', 'rb') as file:
    file_lines = file.readlines(10000)  # size hint: stop after roughly 10 kB
# Slicing tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('latin-1'))

In [None]:
# Pipe-delimited file. decimal='.' is already the pandas default; it is spelled
# out here only for contrast with the decimal-comma file above.
tb_elecindia = pd.read_csv(
    'data/eletricidade_india.csv',
    sep='|',
    decimal='.',
)
tb_elecindia

In [None]:
# Check which dtype each column received.
tb_elecindia.info()

In [None]:
# Re-read the file, this time asking pandas to parse the 'Date' column as datetimes.
# infer_datetime_format is not passed: it is deprecated since pandas 2.0, where a
# strict version of that behaviour is already the default.
tb_elecindia = pd.read_csv('data/eletricidade_india.csv', sep = '|', decimal = '.',
                          parse_dates = ['Date'])
tb_elecindia['Date'].describe()

## Import files from URLs

Example: https://datasets.imdbws.com/

In [None]:
# read_csv accepts a URL directly; the file is tab-separated and the literal
# text \N is treated as a missing value.
url_csv = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
tb_imdbratings = pd.read_csv(
    url_csv,
    sep='\t',
    encoding='utf-8',
    na_values='\\N',
)

## Import Excel Files

Source: https://www.kaggle.com/sanjeetsinghnaik/most-expensive-footballers-2021

In [None]:
# Deliberate failure: an .xlsx workbook is not delimited text, so read_csv cannot
# parse it. The error is caught and printed so that the demonstration stays
# visible while "Restart & Run All" can still reach the cells below.
try:
    pd.read_csv('data/Dados Jogadores Futebol.xlsx')
except (UnicodeDecodeError, pd.errors.ParserError) as err:
    print(type(err).__name__ + ': ' + str(err))

In [None]:
# read_excel is the reader for Excel workbooks; with no sheet_name given it
# loads the first sheet.
tb_futebol = pd.read_excel('data/Dados Jogadores Futebol.xlsx')
tb_futebol

## Import Google Drive Files

**Cool Application: Google Sheets**

Get the shareable link and replace the last piece of the url:
* replace `/edit?usp=sharing` by `/export?format=csv`
    
(https://stackoverflow.com/questions/19611729/getting-google-spreadsheet-csv-into-a-pandas-dataframe)    

In [None]:
def read_from_gsheets(spreadsheet):
    """
    Read a shared Google Sheet into a DataFrame through its CSV export URL.

    The trailing ``/edit...`` part of the shareable link (for example
    ``/edit?usp=sharing``, ``/edit#gid=0`` or a bare ``/edit``) is replaced
    by ``/export?format=csv``. When the link carries a ``gid`` (the id of a
    specific tab), it is forwarded so that the same tab is exported.

    Parameters
    ----------
    spreadsheet : str
        Shareable URL of the Google Sheet. A URL without an ``/edit`` tail
        (e.g. one that is already an export link) is passed to pandas as is.

    Returns
    -------
    pandas.DataFrame
        The sheet contents as parsed by ``pd.read_csv``.
    """
    import re  # local import keeps this cell self-contained

    # Anchor on "/edit" followed by end-of-string, "?" or "#", so that a
    # document id that merely starts with "edit" is never mistaken for it.
    edit_tail = re.search(r'/edit(?:[?#].*)?$', spreadsheet)
    if edit_tail is None:
        return pd.read_csv(spreadsheet)

    working_spreadsheet = spreadsheet[:edit_tail.start()] + '/export?format=csv'

    # Keep the tab selection, if the link pointed at a specific tab.
    gid = re.search(r'[?&#]gid=(\d+)', edit_tail.group(0))
    if gid is not None:
        working_spreadsheet += '&gid=' + gid.group(1)

    return pd.read_csv(working_spreadsheet)

In [None]:
# Load a shared Google Sheet through the helper defined above.
tb_tsunami = read_from_gsheets('https://docs.google.com/spreadsheets/d/1pfUsMP5IVvNagS-YaqpoBWQewmc0_RK9Mr1DlmQnlqE/edit?usp=sharing')

In [None]:
# Summarise the columns, their dtypes and non-null counts.
tb_tsunami.info()

## Import and export JSON files

What is a JSON file?

JSON 1:
```json
{ "name":"John", "age":30, "car":null }
```

JSON 2: 
```json
{"students":[
   {"name":"Andre", "age":23, "state":"SP"},
   {"name":"Rodrigo", "age":28, "state":"SP"},
   {"name":"Raiana", "age":32, "state":"DF"},
   {"name":"Tieko", "age":28, "state":"BA"}
]}
```

`records` orientation: the JSON is a list of dictionaries, and each dictionary in that list is one row of our table.
```python
[
    {"coluna1" : valor, "coluna2" : valor},
    {"coluna1" : valor, "coluna2" : valor},
    {"coluna1" : valor, "coluna2" : valor},
    {"coluna1" : valor, "coluna2" : valor},
    {"coluna1" : valor, "coluna2" : valor},
]
```

`index` orientation: the JSON is a dictionary. Each key of that dictionary is one row of our table and holds another dictionary with the data of that row.
```python
{
    linha_1 : {"coluna1" : valor, "coluna2" : valor},
    linha_2 : {"coluna1" : valor, "coluna2" : valor},
    linha_3 : {"coluna1" : valor, "coluna2" : valor},
    linha_4 : {"coluna1" : valor, "coluna2" : valor},
}
```

In [None]:
tb_crypto = pd.read_json('data/crypto_data_records.json', orient = 'records') # orient tells pandas how the JSON is laid out
tb_crypto

In [None]:
tb_crypto = pd.read_json('data/crypto_data_index.json', orient = 'index') # orient tells pandas how the JSON is laid out
tb_crypto

## Bonus: Read multiple CSV files using a `for` loop
* create copy with files vehicles
* read and append

In [None]:
import os
# List the entries found in the census data folder.
for file in os.listdir('data/dados_censo/'):
    print(file)

In [None]:
# Inspect the raw lines of one census file to identify delimiter and decimal mark.
# `with` guarantees the handle is closed even if reading fails.
with open('data/dados_censo/Basico_BA.csv', 'rb') as file:
    file_lines = file.readlines(10000)  # size hint: stop after roughly 10 kB
# Slicing tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('latin-1'))

In [None]:
# Read every file in the census folder and stack them into one table.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and, even before that, returned a new frame
# instead of modifying the existing one.
censo_dir = 'data/dados_censo/'
censo_frames = []
for file in sorted(os.listdir(censo_dir)):  # sorted: os.listdir order is arbitrary
    file_path = censo_dir + file
    censo_frames.append(pd.read_csv(file_path, sep=";", decimal=","))

# ignore_index=True renumbers the rows 0..n-1 instead of repeating each
# file's own index in the combined table.
tb_censo = pd.concat(censo_frames, ignore_index=True)

In [None]:
# Summarise the columns, their dtypes and non-null counts of the census table.
tb_censo.info()

----

## 6. Export to `.csv`

In [None]:
# Semicolon-separated export with decimal commas; the row index is written too.
tb_censo.to_csv(
    'data/tb_censo.csv',
    sep=';',
    decimal=',',
    encoding='utf-8',
)

**NOTE**: If you do not specify the argument `index=False`, the exported csv file will contain an extra, unnamed first column holding the *dataframe index*.

In [None]:
# Check the exported file by printing its first raw lines.
# `with` guarantees the handle is closed even if reading fails.
with open('data/tb_censo.csv', 'rb') as file:
    file_lines = file.readlines(10000)  # size hint: stop after roughly 10 kB
# Slicing tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('utf-8'))

### Export to csv using a specific separator

In [None]:
# Comma-separated export with decimal points.
tb_censo.to_csv(
    'data/tb_censo_virgula.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
)

In [None]:
# Check the comma-separated export by printing its first raw lines.
# `with` guarantees the handle is closed even if reading fails.
with open('data/tb_censo_virgula.csv', 'rb') as file:
    file_lines = file.readlines(10000)  # size hint: stop after roughly 10 kB
# Slicing tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('utf-8'))

In [None]:
# Pipe-separated export with decimal commas.
tb_censo.to_csv(
    'data/tb_censo_pipe.csv',
    sep='|',
    decimal=',',
    encoding='utf-8',
)

In [None]:
# Check the pipe-separated export by printing its first raw lines.
# `with` guarantees the handle is closed even if reading fails.
with open('data/tb_censo_pipe.csv', 'rb') as file:
    file_lines = file.readlines(10000)  # size hint: stop after roughly 10 kB
# Slicing tolerates files with fewer than five lines.
for raw_line in file_lines[:5]:
    print(raw_line.decode('utf-8'))

## 7. Export to `.xlsx`

In [None]:
# Write the table to an Excel workbook, on a sheet named 'dados'.
tb_censo.to_excel(
    'data/Tabela do Censo.xlsx',
    sheet_name='dados',
)

-------

## 8. Export to `.json`

For a JSON file, you can have different `orient` options.

`'split'`: Dictionary containing indexes, columns, and data.

`'index'`: Nested dictionaries containing {index:{column:value}}.

`'columns'`: Nested dictionaries containing {column:{index:value}}

`'values'`: Nested list where each sublist contains the values for a record.

`'records'`: List of dictionaries, one per row: [{column:value}, ..., {column:value}].

In [None]:
# Export the same table in two JSON layouts: keyed by row index, and as a list
# of row records. Both files are written to the current working directory.
tb_censo.to_json(
    'tb_censo_index.json',
    orient='index',
)
tb_censo.to_json(
    'tb_censo_records.json',
    orient='records',
)

----