In [1]:
import os
import re
import unicodedata
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Pull variables from a local .env file (if present) into the environment.
load_dotenv()

# DATA_DIR is the folder under which this notebook reads "raw/anime_data.csv".
data_dir = os.getenv("DATA_DIR")
if data_dir is None:
    # Fail here with an actionable message; otherwise os.path.join(None, ...)
    # below raises an opaque TypeError.
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a .env file."
    )
data_path = os.path.join(data_dir, "raw", "anime_data.csv")

In [3]:
# Load the raw CSV located at data_path into a DataFrame.
raw_data = pd.read_csv(data_path)
# Disabled: would export duplicated entries to interim/duplicates.csv.
# NOTE(review): `synopses` is only assigned in a later cell, so these two lines
# cannot run at this point of the notebook as written.
#duplicates = synopses[synopses.duplicated()]
#duplicates.to_csv(os.path.join(data_dir, "interim", "duplicates.csv"), index=False)

In [8]:
def clean_text(text):
    """Normalize a synopsis string and strip markup and citation notes.

    Steps, in order: NFKC Unicode normalization, removal of HTML-style tags,
    removal of parenthesised notes containing "source" (case-insensitive),
    removal of bracketed notes containing "MAL", collapsing of whitespace runs
    into single spaces, and trimming of leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize('NFKC', text)  # Unicode normalization
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags if any
    # Remove source citations. The match is confined to ONE parenthesised note
    # (allowing a single level of nested parentheses after "source"); a greedy
    # ".*" would instead swallow everything from the first "(" to the last ")"
    # and delete unrelated synopsis text.
    text = re.sub(
        r"\([^()]*source(?:[^()]|\([^()]*\))*\)", "", text, flags=re.IGNORECASE
    )
    # Remove MAL citations, likewise confined to a single [...] note.
    text = re.sub(r"\[[^\[\]]*MAL[^\[\]]*\]", "", text)
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()  # Strip whitespace from the beginning and the end

# Clean every distinct, non-missing synopsis. Exact duplicates are removed
# first so that clean_text is not run on the same text twice.
synopses = (
    raw_data["synopsis"]
    .dropna()
    .drop_duplicates()
    .apply(clean_text)
)
# Cleaning can turn different raw texts into the same string (e.g. they only
# differed in whitespace or in a citation) or into an empty string, so drop
# empty results and de-duplicate once more on the cleaned text.
synopses = synopses[synopses != ""].drop_duplicates()

In [5]:
# Write the cleaned synopses to <data_dir>/interim/synopses.csv without the
# index column.
interim_dir = os.path.join(data_dir, "interim")
# to_csv does not create missing parent folders, so make sure it exists first.
os.makedirs(interim_dir, exist_ok=True)
synopses.to_csv(os.path.join(interim_dir, "synopses.csv"), index=False)

In [9]:
# Look up a single synopsis by its index label and display it.
test = synopses.loc[9224]
test

"DVD specials. Episode 1 - 朗読少年 (Lecture Shounen): Macaron's special training by Kuu-chan. Episode 2 - 朗読兄弟 (Lecture brothers): Macaron's special training by his brother Daigorou. Episode 3 - もう一つのエンディング~囚われの安藤なつ~ (Another ending ~ Natsu Andou's imprisonment):"

In [7]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint the tokenizer is loaded from.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [8]:
# Tokenize the sample synopsis, then map the resulting ids back to their token
# strings so the segmentation can be inspected.
encoding = tokenizer(test)
input_ids = encoding['input_ids']
tokenizer.convert_ids_to_tokens(input_ids)

['[CLS]',
 'dvd',
 'specials',
 '.',
 'episode',
 '1',
 '-',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '年',
 '(',
 'lecture',
 'sho',
 '##une',
 '##n',
 ')',
 ':',
 'mac',
 '##aro',
 '##n',
 "'",
 's',
 'special',
 'training',
 'by',
 'ku',
 '##u',
 '-',
 'chan',
 '.',
 'episode',
 '2',
 '-',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '(',
 'lecture',
 'brothers',
 ')',
 ':',
 'mac',
 '##aro',
 '##n',
 "'",
 's',
 'special',
 'training',
 'by',
 'his',
 'brother',
 'dai',
 '##gor',
 '##ou',
 '.',
 'episode',
 '3',
 '-',
 'も',
 '##う',
 '一',
 'つ',
 '##の',
 '##エ',
 '##ン',
 '##テ',
 '##ィ',
 '##ン',
 '##ク',
 '~',
 '[UNK]',
 '[UNK]',
 '安',
 '藤',
 'な',
 '##つ',
 '~',
 '(',
 'another',
 'ending',
 '~',
 'nat',
 '##su',
 'and',
 '##ou',
 "'",
 's',
 'imprisonment',
 ')',
 ':',
 '[SEP]']