In [None]:
import spacy
from io import StringIO 

In [None]:
def extract_named_entities(text, nlp, max_length=1_000_000):
    """Collect the distinct named entities that ``nlp`` finds in ``text``.

    The text is processed in consecutive pieces of at most ``max_length``
    characters, so a very long corpus is never handed to the pipeline in a
    single call. A piece ends right after the last line break inside its
    window when there is one, otherwise right after the last whitespace
    character, and only as a last resort in the middle of a run of
    non-whitespace characters (e.g. text written without spaces). This keeps
    words from being cut in half at a piece boundary; an entity made of
    several words that straddles a boundary can still be split.

    Args:
        text: The string to analyse.
        nlp: A callable that takes a string and returns an object exposing
            ``ents``, an iterable of items with ``text`` and ``label_``
            attributes (presumably a loaded spaCy pipeline - confirm against
            the caller).
        max_length: Maximum number of characters per piece. Defaults to
            1,000,000, the value that was previously hard-coded.

    Returns:
        A set of ``(entity_text, entity_label)`` tuples; empty when ``text``
        is empty or no entities are found.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive number of characters")

    named_entities = set()
    for chunk in _split_text(text, max_length):
        doc = nlp(chunk)
        named_entities.update((entity.text, entity.label_) for entity in doc.ents)

    return named_entities


def _split_text(text, max_length):
    """Yield consecutive pieces of ``text``, each at most ``max_length`` characters."""
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        if end < total:
            # Only a cut that leaves text behind needs a carefully chosen position.
            end = _find_break(text, start, end)
        yield text[start:end]
        start = end


def _find_break(text, start, end):
    """Return a cut position in ``(start, end]`` that avoids splitting a word if possible."""
    # Best case: cut right after the last line break inside the window.
    newline = text.rfind("\n", start, end)
    if newline != -1:
        return newline + 1

    # Next best: cut right after the last whitespace character.
    for position in range(end, start, -1):
        if text[position - 1].isspace():
            return position

    # No whitespace in the window at all: hard cut at the size limit.
    return end

In [None]:
# Names of the two pipelines passed to spacy.load, kept in one place.
EN_MODEL_NAME = "en_core_web_sm"
JA_MODEL_NAME = "ja_core_news_sm"

# Load both pipelines up front so later cells can choose between them.
nlp_en = spacy.load(EN_MODEL_NAME)
nlp_ja = spacy.load(JA_MODEL_NAME)

In [None]:
# Website data: read the entire corpus file into one UTF-8 decoded string.
file_path = "corpus_of_Honda.txt"
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    scraped_data = corpus_file.read()

In [None]:
# Choose the pipeline from the script the corpus is mostly written in.
# Checking for "any ASCII character" cannot tell the languages apart: Japanese
# text still contains ASCII spaces, digits, line breaks and Latin brand names,
# so that test is true for practically every corpus and the Japanese pipeline
# would never be selected.
japanese_char_count = sum(
    1
    for char in scraped_data
    # Hiragana + Katakana (U+3040-U+30FF) and CJK Unified Ideographs (U+4E00-U+9FFF).
    if "\u3040" <= char <= "\u30ff" or "\u4e00" <= char <= "\u9fff"
)
latin_letter_count = sum(
    1 for char in scraped_data if ord(char) < 128 and char.isalpha()
)

# Ties (including an empty corpus) fall back to the English pipeline.
is_english = latin_letter_count >= japanese_char_count

if is_english:
    entities_combined = extract_named_entities(scraped_data, nlp_en)
else:
    entities_combined = extract_named_entities(scraped_data, nlp_ja)

In [None]:
# Group the entity texts by label, then render one
# "Label: <label>, Entities: <a>, <b>, ..." line per label into an in-memory buffer.
buffer = StringIO()
label_entities_dict = {}
for entity, label in entities_combined:
    label_entities_dict.setdefault(label, set()).add(entity)

# Sort labels and entities: both come from sets of strings, whose iteration
# order can differ from one interpreter run to the next, so unsorted output
# would not be reproducible after a kernel restart.
for label in sorted(label_entities_dict):
    entities_set = label_entities_dict[label]
    entities_str = ", ".join(sorted(entities_set))
    buffer.write(f"Label: {label}, Entities: {entities_str}\n")

In [None]:
# Turn every line break into a space, then start a new line before each
# "Label:" marker so that each label ends up on its own line.
data = buffer.getvalue()
data_cleaned = data.replace("\n", " ").replace("Label:", "\nLabel:")

In [None]:
# Write the reshaped label/entity listing to disk as UTF-8 text.
output_file_path = "entities_are_here.txt"
with open(output_file_path, mode="w", encoding="utf-8") as entities_file:
    entities_file.write(data_cleaned)