In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

## Data Extraction

In [2]:
# Crawl business-mail.jp: index page -> topic (category/tag) pages -> individual
# example pages, collecting [link, title, email] rows in `data`.
url = "https://business-mail.jp/example"
example_prefix = "https://business-mail.jp/example/"
request_timeout = 30  # seconds; requests waits indefinitely when no timeout is given


def fetch_soup(page_url):
    """Download ``page_url`` and parse it with the built-in "html.parser".

    Raises ``requests.RequestException`` (including ``HTTPError`` for a
    non-2xx status) so an error page is never parsed as if it were content.
    """
    response = requests.get(page_url, timeout=request_timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


page = requests.get(url, timeout=request_timeout)
page.raise_for_status()
# Name the parser explicitly: the generic "html" feature resolves to whichever
# HTML parser happens to be installed, so results can differ between machines.
soup = BeautifulSoup(page.text, "html.parser")

# href=True skips anchors that have no href attribute; indexing a["href"] on
# those raises KeyError.
links = soup.find_all("a", href=True)
wants = [
    "https://business-mail.jp/cat_example/outside/",
    "https://business-mail.jp/cat_example/office/",
    "https://business-mail.jp/tag_example/"
]
# str.startswith accepts a tuple of prefixes. dict.fromkeys drops repeated links
# while keeping first-seen order, so no topic page is downloaded twice.
filtered_links = list(dict.fromkeys(
    a["href"] for a in links if a["href"].startswith(tuple(wants))
))

data = []
seen_example_links = set()  # an example can be listed under several topics

for link in filtered_links:
    # get current topic page
    try:
        topic_soup = fetch_soup(link)
    except requests.RequestException as exc:
        print(f"Skipping topic page {link}: {exc}")
        continue

    # get all example links from current topic (excluding the bare index URL)
    topic_example_links = [
        a["href"] for a in topic_soup.find_all("a", href=True)
        if a["href"].startswith(example_prefix) and a["href"] != example_prefix
    ]

    # get title and text from each example
    for example_link in topic_example_links:
        if example_link in seen_example_links:
            continue
        seen_example_links.add(example_link)

        try:
            example_soup = fetch_soup(example_link)
        except requests.RequestException as exc:
            print(f"Skipping example page {example_link}: {exc}")
            continue

        # The title is the first <h1>; the email body is the first <p>.
        heading = example_soup.find("h1")
        first_paragraph = example_soup.find("p")
        if heading is None or first_paragraph is None:
            print(f"Skipping example page {example_link}: missing <h1> or <p>")
            continue

        example_title = heading.get_text()
        example_email = first_paragraph.get_text(separator="\n")
        data.append([example_link, example_title, example_email])

In [65]:
# Assemble the scraped rows into a table, keep the first row for each distinct
# email body, renumber the index from zero, and persist the raw dataset as CSV.
columns = ["link", "title", "email"]
df = (
    pd.DataFrame(data, columns=columns)
    .drop_duplicates(subset="email")
    .reset_index(drop=True)
)

df.to_csv("japanese_business_email_examples_raw.csv", index=False)


## Data Cleaning

In [115]:
import re

# Cleaning helper methods

def remove_extra_delineations(email_body):
    """Strip surrounding whitespace and collapse blank lines in an email body.

    Any newline followed by optional whitespace and another newline (i.e. one
    or more empty or whitespace-only lines) is replaced by a single newline.
    """
    trimmed = email_body.strip()
    blank_line_run = re.compile(r'\n\s*\n')
    return blank_line_run.sub('\n', trimmed)

def truncate_signature1(text):
    """Return the part of ``text`` before the first "----"; unchanged if absent."""
    # partition yields the whole string as the head when the separator is missing.
    head, _separator, _rest = text.partition("----")
    return head

def truncate_signature2(text):
    """Return the part of ``text`` before the heavy "━" rule line; unchanged if absent."""
    signature_rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    # With maxsplit=1 the first piece is everything ahead of the first rule,
    # or the entire string when no rule is present.
    return text.split(signature_rule, 1)[0]


In [123]:
# Reload the raw scrape from disk so this cell can run without re-scraping.
# keep_default_na=False keeps empty or "NA"-like cells as plain strings; with
# the default they are parsed as float NaN, and the str-based cleaning helpers
# then fail with AttributeError.
df = pd.read_csv("japanese_business_email_examples_raw.csv", keep_default_na=False)


def clean_email(text):
    """Cut off a trailing signature block, then collapse blank lines.

    The heavy-rule signature is removed first, then the "----" one, and the
    remaining body is whitespace-normalized last.
    """
    return remove_extra_delineations(truncate_signature1(truncate_signature2(text)))


df["email_clean"] = df["email"].apply(clean_email)

# Keep only the cleaned body alongside its source link and title.
df_clean = df[['link', 'title', 'email_clean']]
df_clean.to_csv("japanese_business_email_examples_clean.csv", index=False)

## Current State
The token count of all emails, flattened, is approximately 40,000; with the instruction set included, the total should stay under 50,000.