In [1]:
import os
import re
import ast
import unicodedata
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read a local .env file (python-dotenv) so DATA_DIR can be configured there.
load_dotenv()

data_dir = os.getenv("DATA_DIR")
# os.getenv returns None for an unset variable, and os.path.join(None, ...) would then
# fail with an opaque TypeError. Fail early with a message that names the cause.
if data_dir is None:
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; "
        "define it in the environment or in a .env file."
    )
data_path = os.path.join(data_dir, "raw", "anime_data.csv")

'id,' # Anime ID (integer)\
'title,' # Anime title (string)\
✅'synopsis,' # Anime synopsis (string or null)\
'mean,' # Mean score (float or null)\
'popularity,' # Popularity rank (integer or null)\
'num_list_users,' # Number of users who have the anime in their list (integer)\
'num_scoring_users,' # Number of users who have scored the anime (integer)\
✅'nsfw,' # NSFW classification (white=sfw, gray=partially, black=nsfw) (string or null)\
✅'genres,' # Genres (array of objects)\
✅'studios,' # Studios (array of objects)\
'num_episodes,' # Number of episodes (integer)\
'average_episode_duration,' # Average duration of an episode (integer or null)\
✅'status,' # Airing status (string)\
✅'rating,' # Age rating (string or null) (g, pg, pg_13, r, r+, rx)\
✅'source,' # Source (string or null)\
✅'media_type,' # Media type (string)\
'created_at,' # Date of creation (string <date-time>)\
'updated_at,' # Date of last update (string <date-time>)\
'start_season,' # Start season (object or null)\
'start_date,' # Start date (string or null)\
'end_date,' # End date (string or null)\
'related_anime,' # Related anime (array of objects)\
'related_manga,' # Related manga (array of objects)\
'recommendations,' # Recommendations (array of objects)\
'statistics' # Statistics (object or null)

In [3]:
# `raw_data` keeps the frame exactly as it was read from disk.
raw_data = pd.read_csv(data_path)
# Work on an independent copy. A plain `train_data = raw_data` only binds a second name
# to the same object, so every later `train_data[...] = ...` would also rewrite `raw_data`.
train_data = raw_data.copy()

In [None]:
# Show the column labels of the working frame.
train_data.columns

## Preprocess Date Information

- fill/fix nan/nonsensical values
  - maybe encode start month/day and end month/day as cyclic features
  - for anime with only start year: copy year over to end_date
  - for anime with negative duration: swap start_date and end_date
  - check start_season.year for starting year information; copy this over and drop start_season.year
- encode date information
  - encode date either as cyclical feature or something like decimal year
  - decimal year: year + dayOfTheYear/365 (for leap-day use 0.5 of a day, e.g. 59.5/365)

In [7]:
# How much of the dataset is affected by missing values?
total_rows = len(raw_data)
complete_rows = len(raw_data.dropna())  # rows without a missing value in any column
synopsis_missing = raw_data['synopsis'].isna()
rows_without_synopsis = len(raw_data[synopsis_missing])

for row_count in (total_rows, complete_rows, rows_without_synopsis):
    print(row_count)

12266
10085
434


In [None]:
def cyclical_encode(data, col, max_val):
    """Add sine and cosine encodings of a periodic column.

    The values of ``data[col]`` are scaled to an angle of ``2 * pi * value / max_val``.
    Two columns, ``col + '_sin'`` and ``col + '_cos'``, are written into ``data`` itself.

    Args:
        data: Frame holding the column; it is modified in place.
        col: Name of the column to encode.
        max_val: Length of one full period (the divisor of the angle).

    Returns:
        The same ``data`` object that was passed in.
    """
    angle = 2 * np.pi * data[col] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(angle)
    return data

## Preprocess Start Season
['start_season,' (object or null)]

In [None]:
# A missing season name becomes its own explicit category.
season_column = train_data['start_season.season']
train_data['start_season.season'] = season_column.fillna('Unknown')

# A missing year is replaced by the most frequent year in the column.
year_column = train_data['start_season.year']
most_common_year = year_column.mode()[0]
train_data['start_season.year'] = year_column.fillna(most_common_year)

In [None]:
# Replace the season names with integer codes; the fitted encoder is kept so the
# codes can be mapped back later.
season_encoder = LabelEncoder()
season_labels = train_data['start_season.season'].unique()
season_encoder.fit(season_labels)
season_codes = season_encoder.transform(train_data['start_season.season'])
train_data['start_season.season'] = season_codes

## Preprocess Synopsis
['synopsis,' (string or null)]

In [None]:
def clean_text(text):
    """Normalize a synopsis string and strip markup and citation notes.

    Steps, in order:
      1. Unicode NFKC normalization.
      2. Removal of HTML-like tags (``<...>``).
      3. Removal of parenthesized notes that mention "source" (case-insensitive),
         e.g. ``(Source: ANN)``.
      4. Removal of bracketed notes that mention "MAL", e.g. ``[Written by MAL Rewrite]``.
      5. Collapsing of every whitespace run into a single space, then trimming.

    Args:
        text: The text to clean; must be a ``str``.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize('NFKC', text)  # Unicode normalization
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags if any
    # The negated character classes keep each match inside ONE pair of delimiters.
    # A greedy `.*` would instead run from the first "(" / "[" on a line to the last
    # ")" / "]" and delete the genuine synopsis text in between.
    text = re.sub(r"\([^()]*source[^()]*\)", "", text, flags=re.IGNORECASE)  # Remove source citations
    text = re.sub(r"\[[^\[\]]*MAL[^\[\]]*\]", "", text)  # Remove MAL citations
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()  # Strip whitespace from the beginning and the end

# Missing synopses become empty strings first, so clean_text only ever receives a str.
synopsis_filled = train_data['synopsis'].fillna("")
train_data['synopsis'] = synopsis_filled.apply(clean_text)

## Preprocess Genre Labels
['genres,' (array of objects, may be empty)]

In [None]:
# Each genre list is stored as text; parse it into Python objects, treating a
# missing entry as an empty list.
genres_text = train_data['genres'].fillna('[]')
train_data['genres'] = genres_text.apply(ast.literal_eval)

In [None]:
# Every genre name encountered so far; filled as a side effect of process().
unique_genres = set()


def process(entry):
    """Return the set of genre names in ``entry`` and record them in ``unique_genres``.

    Args:
        entry: Iterable of mappings, each with a ``'name'`` key.

    Returns:
        A set with the ``'name'`` value of every item in ``entry``.
    """
    names = {item['name'] for item in entry}
    unique_genres.update(names)
    return names

# Replace each parsed genre list with its set of names; `unique_genres` is filled as a side effect.
train_data['genres'] = train_data['genres'].apply(process)

In [None]:
# Fit the binarizer on a single sample that holds every genre name collected above.
genre_mlb = MultiLabelBinarizer()
genre_mlb.fit([unique_genres])

# Encode one row at a time; each set is wrapped in a list to form a one-sample batch
# and np.squeeze is applied to the result.
train_data['genres'] = train_data['genres'].apply(
    lambda names: np.squeeze(genre_mlb.transform([names]))
)

## Preprocess Studio Labels
['studios,' (array of objects, may be empty)]

In [None]:
# Each studio list is stored as text; parse it into Python objects, treating a
# missing entry as an empty list.
studios_text = train_data['studios'].fillna('[]')
train_data['studios'] = studios_text.apply(ast.literal_eval)

In [None]:
# Every studio name encountered so far; filled as a side effect of process().
unique_studios = set()


# NOTE(review): this reuses the name `process` from the genre section, so it replaces
# that earlier definition once this cell has run.
def process(entry):
    """Return the set of studio names in ``entry`` and record them in ``unique_studios``.

    Args:
        entry: Iterable of mappings, each with a ``'name'`` key.

    Returns:
        A set with the ``'name'`` value of every item in ``entry``.
    """
    names = {item['name'] for item in entry}
    unique_studios.update(names)
    return names

# Replace each parsed studio list with its set of names; `unique_studios` is filled as a side effect.
train_data['studios'] = train_data['studios'].apply(process)

In [None]:
# Fit the binarizer on a single sample that holds every studio name collected above.
studio_mlb = MultiLabelBinarizer()
studio_mlb.fit([unique_studios])

# Encode one row at a time; each set is wrapped in a list to form a one-sample batch
# and np.squeeze is applied to the result.
train_data['studios'] = train_data['studios'].apply(
    lambda names: np.squeeze(studio_mlb.transform([names]))
)

## Preprocess NSFW Tag
['nsfw,' (white=sfw, gray=partially, black=nsfw) (string or null)]

In [None]:
# Give missing NSFW flags an explicit "Unknown" label, then swap the labels for integer codes.
nsfw_labels = train_data['nsfw'].fillna("Unknown")
nsfw_encoder = LabelEncoder()
nsfw_encoder.fit(nsfw_labels.unique())
train_data['nsfw'] = nsfw_encoder.transform(nsfw_labels)

## Preprocess Source
['source,' (string or null)]

In [None]:
# Give missing sources an explicit "Unknown" label, then swap the labels for integer codes.
source_labels = train_data['source'].fillna("Unknown")
source_encoder = LabelEncoder()
source_encoder.fit(source_labels.unique())
train_data['source'] = source_encoder.transform(source_labels)

## Preprocess Status
['status,' (string)]

In [None]:
# Swap the airing-status labels for integer codes (no missing-value handling here).
status_labels = train_data['status']
status_encoder = LabelEncoder()
status_encoder.fit(status_labels.unique())
train_data['status'] = status_encoder.transform(status_labels)

## Preprocess Media Type
['media_type,' (string)]

In [None]:
# Swap the media-type labels for integer codes (no missing-value handling here).
media_type_labels = train_data['media_type']
media_type_encoder = LabelEncoder()
media_type_encoder.fit(media_type_labels.unique())
train_data['media_type'] = media_type_encoder.transform(media_type_labels)

## Preprocess Rating
['rating,' (string or null) (g, pg, pg_13, r, r+, rx)]

In [None]:
# Integer codes for the age ratings, in the order g < pg < pg_13 < r < r+ < rx.
rating_map = {
    "g": 0,
    "pg": 1,
    "pg_13": 2,
    "r": 3,
    "r+": 4,
    "rx": 5
}
# Code assigned to a missing rating, or to any label that is not a key of rating_map.
UNKNOWN_RATING = -1

# Series.map yields NaN for every value that is not a key of the dict, so filling the
# gaps with a placeholder string *before* mapping only turns them back into NaN.
# Map first, then fill what is left with the explicit unknown code.
train_data['rating'] = (
    train_data['rating']
    .map(rating_map)
    .fillna(UNKNOWN_RATING)
    .astype(int)
)

## Preprocess Numerical Columns

### Preprocess Num Episodes ['num_episodes,' (integer)]

In [None]:
# List the distinct values found in the num_episodes column.
train_data['num_episodes'].unique()

## Check Handling of Unicode Characters in Tokenizer

In [None]:
""" List the Unicode code points of the characters in a string
unicode_list = [(char, f"U+{ord(char):04X}") for char in test]

# Print the results
for char, code in unicode_list:
    print(f"'{char}' -> {code}")
"""

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from transformers import BertTokenizer

# Tokenizer for the "google-bert/bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [None]:
# Sample text for the check. No visible cell assigns `test`, so without this line the
# cell raises NameError on a fresh kernel. Replace the sample with any string of interest.
test = "Pokémon — “Fullmetal Alchemist” ☆ 進撃の巨人 ｆｕｌｌ－ｗｉｄｔｈ"

encoding = tokenizer(test)
# Map the ids back to tokens to see what the tokenizer did with each character.
output = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
output  # last expression of the cell, so the notebook displays it

## Save the Processed Dataframe

In [None]:
# Currently disabled. Every preprocessing step above assigns into `train_data`, so that is
# the frame to write out as the processed dataset.
#train_data.to_csv(os.path.join(data_dir, "interim", "anime_data_processed.csv"), index=False)