# Prototype

## Introduction

### Imports

In [1]:
import quopri

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

### Parameters

In [2]:
# Path to the mbox export that is read line by line further down.
# NOTE: the name `data` is rebound to a list of parsed rows in a later cell,
# so this cell must be re-run before the file-reading cell is executed again.
data = "Takeout/E-mail/bikesampa.mbox"
# Text searched for in each line of the mbox file to find the e-mails of interest.
subject_target = "Subject: Bike Sampa - Como foi sua viagem?"

In [3]:
# Markers used to locate the first and last line of the HTML part of an e-mail.
html_open_tag = "<html>"
html_close_tag = "</html>"

## Extracting data

### Selecting emails with ride details

In [4]:
# Load the whole mbox file into memory as a list of lines (newlines kept).
with open(data, "r") as mbox_file:
    emails = list(mbox_file)

In [5]:
# Walk through the mbox lines. Every time the target subject line is found,
# scan forward for the lines containing the opening and closing HTML tags and
# keep everything between them (inclusive) as one string.
i = 0
selected_emails = []
n_lines = len(emails)
while i < n_lines:
    if subject_target in emails[i]:
        open_i = None
        close_i = None
        # The bounds check stops the scan at the end of the file; without it a
        # matching subject with no HTML part after it raises IndexError.
        while (open_i is None or close_i is None) and i < n_lines:
            if html_open_tag in emails[i]:
                open_i = i
            if html_close_tag in emails[i]:
                close_i = i
            i += 1
        # Only keep the e-mail when both tags were actually found.
        if open_i is not None and close_i is not None:
            new_email = "".join(emails[open_i:close_i + 1])
            selected_emails.append(new_email)

    i += 1

In [6]:
# Number of HTML bodies collected by the selection loop.
len(selected_emails)

235

### Parsing html

I want to extract:
- Ride source station
- Ride target station
- Ride start time
- Ride end time
- Ride duration

In [47]:
def get_info(soup):
    """Pull the ride details out of a parsed e-mail body.

    Reads the ``.text`` of the first four elements returned by
    ``soup.find_all('span')`` and interprets them, in order, as source
    station, start time, target station and end time.

    Returns:
        tuple: ``(source_station, start_time, target_station, end_time)``.

    Raises:
        IndexError: if fewer than four ``span`` elements are found.
    """
    spans = soup.find_all('span')
    source_station, start_time, target_station, end_time = (
        spans[position].text for position in range(4)
    )
    return (source_station, start_time, target_station, end_time)

In [48]:
# Parse each selected e-mail and collect one tuple of ride details per e-mail.
# E-mails for which get_info raises IndexError are reported and their
# position is stored in error_list instead.
data = []
error_list = []
for i, email in enumerate(selected_emails):
    soup = BeautifulSoup(email, 'html.parser')
    try:
        ride_record = get_info(soup)
    except IndexError:
        print(f"Error: {i}")
        error_list.append(i)
    else:
        data.append(ride_record)

In [80]:
# Build the ride table: one row per parsed e-mail, one column per extracted field.
ride_columns = ["source_station", "start_time", "target_station", "end_time"]
df = pd.DataFrame(data, columns=ride_columns)

In [81]:
def _clean_cell(raw):
    """Decode one extracted text value and strip trailing markup.

    The value is decoded as quoted-printable (soft line breaks ``=\\n`` are
    removed and ``=XX`` escapes become bytes) and the bytes are read as UTF-8,
    so every accented character is restored rather than a hand-picked few.
    Any literal ``</span>`` left in the text is then removed and only the
    first line is kept.
    """
    # Encode first: the decoder works on bytes, and the text may already
    # contain non-ASCII characters.
    decoded = quopri.decodestring(raw.encode("utf-8")).decode("utf-8", errors="replace")
    return decoded.replace("</span>", "").split("\n")[0]


# Column-wise Series.map does the same element-wise work as
# DataFrame.applymap, which is deprecated in recent pandas releases.
df = df.apply(lambda column: column.map(_clean_cell))

In [82]:
# Display the cleaned ride table.
df

Unnamed: 0,source_station,start_time,target_station,end_time
0,1 - Largo da Batata,2019-05-09 09:16:52,26 - Praça Pierre Germayel,2019-05-09 09:38:17
1,12 - Largo de Batata II,2019-09-10 08:51:46,26 - Praça Pierre Germayel,2019-09-10 09:14:08
2,11 - Praça Faria Lima,2019-07-12 09:06:00,26 - Praça Pierre Germayel,2019-07-12 09:32:29
3,12 - Largo de Batata II,2019-04-10 09:33:10,26 - Praça Pierre Germayel,2019-04-10 09:53:25
4,26 - Praça Pierre Germayel,2019-06-24 19:21:58,1 - Largo da Batata,2019-06-24 19:44:29
5,12 - Largo de Batata II,2019-06-24 09:01:13,26 - Praça Pierre Germayel,2019-06-24 09:24:10
6,26 - Praça Pierre Germayel,2019-06-19 19:07:30,1 - Largo da Batata,2019-06-19 19:33:46
7,11 - Praça Faria Lima,2019-06-19 09:02:40,26 - Praça Pierre Germayel,2019-06-19 09:26:00
8,117 - Cubo Itaú,2019-06-12 17:59:33,1 - Largo da Batata,2019-06-12 18:26:53
9,12 - Largo de Batata II,2019-06-12 09:58:13,26 - Praça Pierre Germayel,2019-06-12 10:23:27


In [87]:
# Convert both timestamp columns from text to pandas datetimes.
for time_column in ("start_time", "end_time"):
    df[time_column] = pd.to_datetime(df[time_column])

In [91]:
# Order rides chronologically by start time, then renumber the rows from 0.
rides_by_start = df.sort_values(by="start_time")
df = rides_by_start.reset_index(drop=True)

In [93]:
# Write the final table to disk without the row index.
df.to_csv(
    "bikesampa.csv",
    index=False,
)