In [1]:
import os
import re
import ast
import unicodedata
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

In [2]:
# Load environment variables via python-dotenv, then build the path to the raw
# anime CSV underneath the directory named by DATA_DIR.
load_dotenv()
data_dir = os.environ.get("DATA_DIR")
data_path = os.path.join(data_dir, "raw", "anime_data.csv")

'id,' # Anime ID (integer)\
'title,' # Anime title (string)\
✅'synopsis,' # Anime synopsis (string or null)\
'mean,' # Mean score (float or null)\
'popularity,' # Popularity rank (integer or null)\
'num_list_users,' # Number of users who have the anime in their list (integer)\
'num_scoring_users,' # Number of users who have scored the anime (integer)\
✅'nsfw,' # NSFW classification (white=sfw, gray=partially, black=nsfw) (string or null)\
✅'genres,' # Genres (array of objects)\
✅'studios,' # Studios (array of objects)\
'num_episodes,' # Number of episodes (integer)\
'average_episode_duration,' # Average duration of an episode (integer or null)\
✅'status,' # Airing status (string)\
✅'rating,' # Age rating (string or null) (g, pg, pg_13, r, r+, rx)\
✅'source,' # Source (string or null)\
✅'media_type,' # Media type (string)\
'created_at,' # Date of creation (string <date-time>)\
'updated_at,' # Date of last update (string <date-time>)\
'start_season,' # Start season (object or null)\
'start_date,' # Start date (string or null)\
'end_date,' # End date (string or null)\
'related_anime,' # Related anime (array of objects)\
'related_manga,' # Related manga (array of objects)\
'recommendations,' # Recommendations (array of objects)\
'statistics') # Statistics (object or null)

In [3]:
# Read the raw anime CSV located at data_path into a DataFrame.
raw_data = pd.read_csv(data_path)

In [52]:
# Display the column labels of the loaded frame.
raw_data.columns

Index(['id', 'title', 'synopsis', 'mean', 'popularity', 'num_list_users',
       'num_scoring_users', 'nsfw', 'genres', 'studios', 'num_episodes',
       'average_episode_duration', 'status', 'rating', 'source', 'media_type',
       'created_at', 'updated_at', 'start_date', 'end_date', 'related_anime',
       'related_manga', 'recommendations', 'main_picture.medium',
       'main_picture.large', 'start_season.year', 'start_season.season',
       'statistics.status.watching', 'statistics.status.completed',
       'statistics.status.on_hold', 'statistics.status.dropped',
       'statistics.status.plan_to_watch', 'statistics.num_list_users'],
      dtype='object')

## Preprocess Start Season ['start_season,' (object or null)]

In [47]:
# Missing seasons receive the explicit label 'Unknown'; missing years are
# imputed with the most frequent year (entry 0 of the mode).
season_col = 'start_season.season'
year_col = 'start_season.year'
raw_data[season_col] = raw_data[season_col].fillna('Unknown')
most_common_year = raw_data[year_col].mode()[0]
raw_data[year_col] = raw_data[year_col].fillna(most_common_year)

In [48]:
# Replace the season labels with integer codes. The fitted encoder stays
# available as season_encoder.
season_encoder = LabelEncoder()
raw_data['start_season.season'] = season_encoder.fit_transform(raw_data['start_season.season'])

## Preprocess Synopsis Data ['synopsis,' (string or null)]

In [32]:
def clean_text(text):
    """Normalize a synopsis string and strip markup and citation boilerplate.

    Steps, in order: NFKC Unicode normalization, removal of HTML-style tags,
    removal of parenthesized "source" citations (case-insensitive), removal of
    bracketed "MAL" citations, whitespace collapsing, and trimming.

    Args:
        text: The raw synopsis. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or "" when text is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = unicodedata.normalize('NFKC', text)  # Unicode normalization
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags if any

    # Remove only the bracket pair that actually holds the citation. A greedy
    # ".*" here would swallow everything between the first "(" and the last
    # ")" on the line, deleting real synopsis text. One level of nested
    # parentheses after the keyword is tolerated, and the leading word
    # boundary keeps words such as "resources" from counting as a citation.
    text = re.sub(
        r"\([^()]*\bsource(?:[^()]|\([^()]*\))*\)",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # Same idea for MAL citations: stay inside a single [...] pair.
    text = re.sub(r"\[[^\[\]]*MAL[^\[\]]*\]", "", text)

    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()  # Strip whitespace from the beginning and the end

# Replace missing synopses with an empty string, then clean every entry.
raw_data['synopsis'] = raw_data['synopsis'].fillna("").apply(clean_text)

## Preprocess Genre Labels ['genres,' (array of objects, may be empty)]

In [28]:
# Missing genre cells become the literal '[]' so that every cell can be parsed
# from its string form into a Python object.
raw_data['genres'] = raw_data['genres'].fillna('[]').apply(ast.literal_eval)

In [29]:
# Accumulates every genre name seen by process().
unique_genres = set()


def process(entry):
    """Return the set of genre names in entry and record them globally.

    Args:
        entry: Iterable of mappings, each providing a 'name' key.

    Returns:
        The set of names found in entry. The same names are also merged into
        the module-level unique_genres set as a side effect.
    """
    names = {item['name'] for item in entry}
    unique_genres.update(names)
    return names

# Replace each row's parsed genre list with its set of names; process() also
# adds those names to unique_genres as a side effect.
raw_data['genres'] = raw_data['genres'].apply(process)

In [30]:
# Fit the binarizer on the complete genre vocabulary, then encode each row's
# set of names; squeezing drops the singleton batch axis added for transform().
genre_mlb = MultiLabelBinarizer().fit([unique_genres])

raw_data['genres'] = raw_data['genres'].apply(
    lambda names: genre_mlb.transform([names]).squeeze()
)

## Preprocess Studio Labels ['studios,' (array of objects, may be empty)]

In [35]:
# Missing studio cells become the literal '[]' so that every cell can be parsed
# from its string form into a Python object.
studios_text = raw_data['studios'].fillna('[]')
raw_data['studios'] = studios_text.apply(ast.literal_eval)

In [36]:
# Accumulates every studio name seen by process().
unique_studios = set()


# NOTE(review): this reuses the name `process`, shadowing the genre helper
# defined in an earlier cell; re-running the genre cell after this one would
# pick up this version.
def process(entry):
    """Return the set of studio names in entry and record them globally.

    Args:
        entry: Iterable of mappings, each providing a 'name' key.

    Returns:
        The set of names found in entry. The same names are also merged into
        the module-level unique_studios set as a side effect.
    """
    names = {item['name'] for item in entry}
    unique_studios.update(names)
    return names

# Replace each row's parsed studio list with its set of names; process() also
# adds those names to unique_studios as a side effect.
raw_data['studios'] = raw_data['studios'].apply(process)

In [37]:
# Fit the binarizer on the complete studio vocabulary, then encode each row's
# set of names; squeezing drops the singleton batch axis added for transform().
studio_mlb = MultiLabelBinarizer().fit([unique_studios])

raw_data['studios'] = raw_data['studios'].apply(
    lambda names: studio_mlb.transform([names]).squeeze()
)

## Preprocess NSFW Tag ['nsfw,' (white=sfw, gray=partially, black=nsfw) (string or null)]

In [37]:
# Missing NSFW tags become the explicit label "Unknown" before the column is
# replaced by integer codes. The fitted encoder stays available as nsfw_encoder.
nsfw_encoder = LabelEncoder()
raw_data['nsfw'] = raw_data['nsfw'].fillna("Unknown")
raw_data['nsfw'] = nsfw_encoder.fit_transform(raw_data['nsfw'])

## Preprocess Source ['source,' (string or null)]

In [7]:
# Label missing sources as "Unknown", then swap the column for integer codes.
# The fitted encoder stays available as source_encoder.
source_encoder = LabelEncoder()
raw_data['source'] = raw_data['source'].fillna("Unknown")
raw_data['source'] = source_encoder.fit_transform(raw_data['source'])

## Preprocess Status ['status,' (string)]

In [5]:
# Swap the airing-status labels for integer codes. No missing-value fill is
# applied to this column. The fitted encoder stays available as status_encoder.
status_encoder = LabelEncoder()
raw_data['status'] = status_encoder.fit_transform(raw_data['status'])

## Preprocess Media Type ['media_type,' (string)]

In [10]:
# Swap the media-type labels for integer codes. No missing-value fill is
# applied to this column. The fitted encoder stays available as
# media_type_encoder.
media_type_encoder = LabelEncoder()
raw_data['media_type'] = media_type_encoder.fit_transform(raw_data['media_type'])

## Preprocess Rating ['rating,' (string or null) (g, pg, pg_13, r, r+, rx)]

In [15]:
# Ordinal encoding of the age rating in the order g < pg < pg_13 < r < r+ < rx.
rating_map = {
    "g": 0,
    "pg": 1,
    "pg_13": 2,
    "r": 3,
    "r+": 4,
    "rx": 5
}

# Code assigned to ratings that are missing or not listed in rating_map.
# Filling with the string "Unknown" *before* mapping has no effect: Series.map
# turns any value absent from the dict back into NaN. The sentinel is therefore
# applied after the mapping, which also lets the column keep an integer dtype.
UNKNOWN_RATING = -1

raw_data['rating'] = (
    raw_data['rating']
    .map(rating_map)
    .fillna(UNKNOWN_RATING)
    .astype(int)
)

## Preprocess Numerical Columns

### Preprocess Num Episodes ['num_episodes,' (integer)]

In [16]:
# Show the distinct episode counts.
# NOTE(review): the displayed values include 0 — presumably "episode count
# unknown"; confirm before treating this column as a plain numeric feature.
raw_data['num_episodes'].unique()

array([  28,   64,   24,   51,   10,    1,  148,   13,  110,   12,  201,
         22,   25,    2,   14,   74,    7,   23,   26,   16,   75,   11,
          0,   47,    4,   43,   27,   37,   39,  101,    8,   99,  112,
          6,  120,   62,   15,   50,   20,   17,   33,   40,   79,   94,
          3,  500,   52,   78,   77,   96,   70,  291,   38,  170,  203,
        237,  104,  103,  220,   60,  366,   49,    9,   18,  145,    5,
        147,  175,  153,  102,   44,   36,  193,  167,   30,   42,   21,
         48,  224,   41,   35,  178,   45,  113,  127,  293,   46,   65,
         34,  258,  195,  161,  124,   61,  131,   97,  114,   69,  109,
         73,   76,  150, 1787,  154,   31,  373,  243,   29,  100,  276,
         58,  128,   54,  180,   92,   19,  182,   72,  160,  115,   53,
        296,   91,   55,   59,  331,  694,  358,  142,  155,  305,  496,
         32,  140,   86,  105,   85,   67,   63,  137,  192,  146,  136,
        214,   68,   56,  151,  726,  108,   95,  1

## Check Handling of Unicode Characters in Tokenizer

In [16]:
""" List the Unicode code points of the characters in a string
unicode_list = [(char, f"U+{ord(char):04X}") for char in test]

# Print the results
for char, code in unicode_list:
    print(f"'{char}' -> {code}")
"""

'\nunicode_list = [(char, f"U+{ord(char):04X}") for char in test]\n\n# Print the results\nfor char, code in unicode_list:\n    print(f"\'{char}\' -> {code}")\n'

In [9]:
from transformers import BertTokenizer

# Load the pretrained uncased BERT tokenizer used for the Unicode check below.
bert_checkpoint = "google-bert/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [11]:
# Tokenize the sample text and convert the resulting ids back into token
# strings so the tokenizer's handling of each character can be inspected.
# NOTE(review): `test` is not assigned in any visible cell — define the sample
# string before running this on a fresh kernel.
encoding = tokenizer(test)
token_ids = encoding['input_ids']
output = tokenizer.convert_ids_to_tokens(token_ids)

## Save the Processed Dataframe

In [None]:
# Currently disabled; uncomment to write the processed frame to
# <DATA_DIR>/interim/anime_data_processed.csv.
# NOTE(review): at this point 'genres' and 'studios' hold NumPy arrays per row;
# CSV stores only their string form, which NumPy abbreviates for long arrays —
# consider a binary format (e.g. parquet/pickle) or expanding them into columns.
#raw_data.to_csv(os.path.join(data_dir, "interim", "anime_data_processed.csv"), index=False)