In [1]:
import os
import re
import ast
from datetime import datetime
import unicodedata
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import textacy.preprocessing as tprep
from transformers import DistilBertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Load variables from a local .env file (if any) into the process environment,
# then resolve the location of the raw CSV relative to DATA_DIR.
load_dotenv()
data_dir = os.getenv("DATA_DIR")
if not data_dir:
    # os.getenv returns None when the variable is missing; without this check
    # os.path.join would fail with an unhelpful TypeError.
    raise RuntimeError("DATA_DIR is not set; define it in the environment or in a .env file")
data_path = os.path.join(data_dir, "raw", "anime_data.csv")

✅'id,' # Anime ID (integer)\
🛑'title,' # Anime title (string)\
✅'synopsis,' # Anime synopsis (string or null)\
✅'mean,' # Mean score (float or null)\
✅'popularity,' # Popularity rank (integer or null)\
🛑'num_list_users,' # Number of users who have the anime in their list (integer)\
✅'num_scoring_users,' # Number of users who have scored the anime (integer)\
✅'nsfw,' # NSFW classification (white=sfw, gray=partially, black=nsfw) (string or null)\
✅'genres,' # Genres (array of objects)\
✅'studios,' # Studios (array of objects)\
✅'num_episodes,' # Number of episodes (integer)\
✅'average_episode_duration,' # Average duration of an episode (integer or null)\
✅'status,' # Airing status (string)\
✅'rating,' # Age rating (string or null) (g, pg, pg_13, r, r+, rx)\
✅'source,' # Source (string or null)\
✅'media_type,' # Media type (string)\
🛑'created_at,' # Date of creation (string <date-time>)\
🛑'updated_at,' # Date of last update (string <date-time>)\
✅'start_season,' # Start season (object or null)\
✅'start_date,' # Start date (string or null)\
✅'end_date,' # End date (string or null)\
🛑'related_anime,' # Related anime (array of objects)\
🛑'related_manga,' # Related manga (array of objects)\
🛑'recommendations,' # Recommendations (array of objects)\
✅'statistics' # Statistics (object or null)

In [3]:
# Columns that are not used as model features.
unused_columns = [
    'title',
    'created_at',
    'updated_at',
    'related_anime',
    'related_manga',
    'recommendations',
    'main_picture.medium',
    'main_picture.large',
]

raw_data = pd.read_csv(data_path)
# Remove the unused columns FIRST so that a missing value in one of them
# (e.g. a missing picture URL) does not cause an otherwise complete row to be discarded.
raw_data = raw_data.drop(columns=unused_columns)
# For autoencoder training we drop all rows that still contain NaNs
raw_data = raw_data.dropna()
# Work on a copy so the cleaned frame stays available for inspection.
train_data = raw_data.copy()

In [4]:
# Sanity check: total rows, rows without any NaN, and rows with a missing synopsis.
total_rows = len(raw_data)
complete_rows = len(raw_data.dropna())
missing_synopsis_rows = len(raw_data[raw_data['synopsis'].isna()])
print(total_rows)
print(complete_rows)
print(missing_synopsis_rows)

10085
10085
0


## Preprocess Date/Season Information

### Preprocess Dates
['start_season,' # Start season (object or null)]\
['start_date,' # Start date (string or null)]\
['end_date,' # End date (string or null)]\
['start_season.year,' # Start year (object or null)]

In [5]:
def safe_date_convert(date):
    """Convert a year / year-month / full-date value into a ``datetime.date``.

    Accepted inputs:
      * missing values (``None``, ``NaN``, ``NaT``) -> ``None``
      * numbers (Python or numpy int/float) -> interpreted as a year
      * strings of the form ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``
        (surrounding whitespace is ignored)

    Missing month/day components default to 1, as done by ``strptime``.

    Returns:
        A ``datetime.date`` instance, or ``None`` for missing input.

    Raises:
        ValueError: if the value has an unsupported type or a string that
            matches none of the supported formats.
    """
    if pd.isna(date):
        return None
    # bool is a subclass of int but is never a valid year, so exclude it explicitly.
    is_number = isinstance(date, (int, float, np.integer, np.floating)) and not isinstance(date, bool)
    if is_number:
        return datetime.strptime(str(int(date)), '%Y').date()
    if isinstance(date, str):
        text = date.strip()
        # fullmatch anchors both ends so the chosen format always fits the whole string.
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", text):
            return datetime.strptime(text, '%Y-%m-%d').date()
        if re.fullmatch(r"\d{4}-\d{2}", text):
            return datetime.strptime(text, '%Y-%m').date()
        # Anything else must be a bare year; strptime raises ValueError otherwise.
        return datetime.strptime(text, '%Y').date()
    raise ValueError(f"Invalid date format: {date}, {type(date)}")

# Turn the start year into a date object (January 1st of that year).
start_year_col = 'start_season.year'
train_data[start_year_col] = train_data[start_year_col].apply(safe_date_convert)

In [6]:
def time_diff(start_date, end_date):
    """Return the absolute number of days between two dates.

    The order of the arguments does not matter; the result is never negative.
    If either argument is missing (as judged by ``pd.isna``), ``None`` is returned.
    """
    if any(pd.isna(value) for value in (start_date, end_date)):
        return None
    # abs() on the difference covers both orderings in one expression.
    return abs(end_date - start_date).days

# Parse both date columns into date objects.
for date_col in ('start_date', 'end_date'):
    train_data[date_col] = train_data[date_col].apply(safe_date_convert)

# Airing duration in days, computed row by row.
train_data['time_diff'] = train_data.apply(
    lambda row: time_diff(row['start_date'], row['end_date']),
    axis=1,
)
# The raw dates are no longer needed once the duration has been derived.
train_data = train_data.drop(columns=['start_date', 'end_date'])

# TODO scale time_diff

In [7]:
# TODO handle start_season.year ==> max-min-scaling?

### Preprocess Start Season
['start_season,' (object or null)]

In [8]:
def cyclical_encode(data, col, max_val):
    """Add sine/cosine encodings of a periodic column.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    (the frame is modified in place) and the same frame is returned.

    Args:
        data: DataFrame holding the column to encode.
        col: Name of the column whose values lie on a cycle.
        max_val: Length of the cycle (the value that maps to a full turn).
    """
    angle = 2 * np.pi * data[col] / max_val
    data[f'{col}_sin'] = np.sin(angle)
    data[f'{col}_cos'] = np.cos(angle)
    return data

# Chronological position of each season within a year.
# A LabelEncoder would number the seasons alphabetically (fall, spring, summer, winter),
# which makes summer and winter neighbours on the circle; the cyclical encoding below
# is only meaningful when consecutive integers are consecutive seasons.
# NOTE(review): assumes the column holds the season names 'winter', 'spring', 'summer',
# 'fall' — any other value raises below instead of silently becoming NaN.
season_order = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}

season_names = train_data['start_season.season'].str.lower()
unknown_seasons = set(season_names) - set(season_order)
if unknown_seasons:
    raise ValueError(f"Unexpected start_season.season values: {sorted(map(str, unknown_seasons))}")
train_data['start_season.season'] = season_names.map(season_order)

# Apply the cyclical_encode function to create sine and cosine features
train_data = cyclical_encode(train_data, 'start_season.season', max_val=len(season_order))
train_data = train_data.drop(columns=['start_season.season'])

## Preprocess Synopsis
['synopsis,' (string or null)]

In [9]:
def clean_text(text):
    """Normalize a synopsis string and strip markup and citation notes.

    Steps, in order: Unicode NFKC normalization, hyphenated-word and
    quotation-mark normalization, accent removal, HTML tag removal, removal of
    "(... source ...)" and "[... MAL ...]" notes, whitespace collapsing and
    trimming.

    Args:
        text: The raw synopsis. Non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize('NFKC', text)  # Unicode normalization
    text = tprep.normalize.hyphenated_words(text)  # Normalize hyphenated words
    text = tprep.normalize.quotation_marks(text)  # Normalize quotation marks
    text = tprep.remove.accents(text)  # Remove accents
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags if any
    # Match a single, non-nested bracket pair only. A greedy ".*" would delete
    # everything from the first "(" to the last ")" on the line, taking regular
    # synopsis text with it.
    text = re.sub(r"\([^()\n]*source[^()\n]*\)", "", text, flags=re.IGNORECASE)  # Remove source citations
    text = re.sub(r"\[[^\[\]\n]*MAL[^\[\]\n]*\]", "", text)  # Remove MAL citations
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()  # Strip whitespace from the beginning and the end

# Run the text cleaner over every synopsis.
synopsis_col = 'synopsis'
train_data[synopsis_col] = train_data[synopsis_col].apply(clean_text)

In [10]:
# Tokenize each synopsis on its own with the DistilBERT tokenizer (at most 512 tokens).
# NOTE(review): each string is tokenized individually, so `padding=True` has no other
# sequence to pad against — confirm whether fixed-length padding was intended.
# NOTE(review): the resulting objects hold 'pt' tensors and are later written with
# to_csv; verify that the downstream consumer can read them back.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
train_data['tokenized_synopsis'] = train_data['synopsis'].apply(
    lambda synopsis: None if pd.isna(synopsis) else tokenizer(
        synopsis, truncation=True, padding=True, max_length=512, return_tensors='pt'
    )
)
train_data = train_data.drop(columns=['synopsis'])

In [30]:
# Inspect the first processed row.
# NOTE(review): this cell carries execution count In [30], i.e. it was executed out of
# order; the captured output shows already scaled/encoded values, which are produced by
# cells further down the notebook.
train_data.iloc[0]

id                                                                             52991
mean                                                                        3.121522
popularity                                                                 -1.341264
num_scoring_users                                                           1.685682
nsfw                                                                               1
genres                             [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
studios                            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
num_episodes                                                                1.297257
average_episode_duration                                                      0.2379
status                                                                             1
rating                                                                             2
source                                                           

## Preprocess Tag/Label Information

### Preprocess Genre Labels
['genres,' (array of objects, may be empty)]

In [12]:
# Every genre name seen so far; filled as a side effect of process().
unique_genres = set()

def process(entry):
    """Return the set of genre names in ``entry`` and record them globally.

    Args:
        entry: Iterable of mappings, each with a ``'name'`` key.

    Returns:
        The set of names found in ``entry`` (empty for an empty entry).
    """
    names = {item['name'] for item in entry}
    unique_genres.update(names)
    return names

# Parse the stringified list of genre objects, then reduce each row to a set of names.
train_data['genres'] = (
    train_data['genres']
    .apply(ast.literal_eval)
    .apply(process)
)

In [13]:
# Fit the binarizer on the full genre vocabulary collected above.
genre_mlb = MultiLabelBinarizer().fit([unique_genres])

# Replace each set of genre names by its multi-hot vector.
train_data['genres'] = train_data['genres'].apply(
    lambda genre_names: np.squeeze(genre_mlb.transform([genre_names]))
)

### Preprocess Studio Labels
['studios,' (array of objects, may be empty)]

In [14]:
# Every studio name seen so far; filled as a side effect of process().
unique_studios = set()

def process(entry):
    """Return the set of studio names in ``entry`` and record them globally.

    Args:
        entry: Iterable of mappings, each with a ``'name'`` key.

    Returns:
        The set of names found in ``entry`` (empty for an empty entry).
    """
    names = {item['name'] for item in entry}
    unique_studios.update(names)
    return names

# Parse the stringified list of studio objects, then reduce each row to a set of names.
train_data['studios'] = (
    train_data['studios']
    .apply(ast.literal_eval)
    .apply(process)
)

In [15]:
# Fit the binarizer on the full studio vocabulary collected above.
studio_mlb = MultiLabelBinarizer().fit([unique_studios])

# Replace each set of studio names by its multi-hot vector.
train_data['studios'] = train_data['studios'].apply(
    lambda studio_names: np.squeeze(studio_mlb.transform([studio_names]))
)

### Preprocess NSFW Tag
['nsfw,' (white=sfw, gray=partially, black=nsfw) (string or null)]

In [16]:
# Integer-encode the NSFW tag; the encoder is kept so codes can be mapped back later.
nsfw_encoder = LabelEncoder()
train_data['nsfw'] = nsfw_encoder.fit_transform(train_data['nsfw'])

## Preprocess Source
['source,' (string or null)]

In [17]:
# Integer-encode the source material type; the encoder is kept so codes can be mapped back later.
source_encoder = LabelEncoder()
train_data['source'] = source_encoder.fit_transform(train_data['source'])

## Preprocess Status
['status,' (string)]

In [18]:
# Integer-encode the airing status; the encoder is kept so codes can be mapped back later.
status_encoder = LabelEncoder()
train_data['status'] = status_encoder.fit_transform(train_data['status'])

## Preprocess Media Type
['media_type,' (string)]

In [19]:
# Integer-encode the media type; the encoder is kept so codes can be mapped back later.
media_type_encoder = LabelEncoder()
train_data['media_type'] = media_type_encoder.fit_transform(train_data['media_type'])

## Preprocess Rating
['rating,' (string or null) (g, pg, pg_13, r, r+, rx)]

In [20]:
# Age ratings listed from least to most restrictive; the position in this
# sequence becomes the ordinal code (g -> 0, ..., rx -> 5).
rating_map = {
    label: rank
    for rank, label in enumerate(("g", "pg", "pg_13", "r", "r+", "rx"))
}
# Values missing from the map become NaN.
train_data['rating'] = train_data['rating'].map(rating_map)

## Preprocess Numerical Columns

### Preprocess Mean
['mean,' (float or null)]

In [21]:
# The mean score looks roughly normally distributed, so standardize it
# (zero mean, unit variance).
mean_scaler = StandardScaler()
mean_values = train_data['mean'].to_numpy().reshape(-1, 1)  # scalers expect a 2-D column
train_data['mean'] = mean_scaler.fit_transform(mean_values)

### Preprocess Popularity
['popularity,' (integer or null)]

In [22]:
# The distribution of this feature seems to get distorted by anything other than a standard scaler
popularity_scaler = StandardScaler()
popularity_values = train_data['popularity'].to_numpy().reshape(-1, 1)  # scalers expect a 2-D column
train_data['popularity'] = popularity_scaler.fit_transform(popularity_values)

### Preprocess Number of Users Who Have Scored The Anime
['num_scoring_users,' (integer)]

In [23]:
# This feature exhibits a long tail distribution, we try power transformer (yeo-johnson)
# This might not be the best way to handle this feature
# Perhaps try https://arxiv.org/abs/2111.05956#:~:text=The%20visual%20world%20naturally%20exhibits,models%20based%20on%20deep%20learning.

# Use a dedicated name: rebinding `popularity_scaler` here would discard the
# StandardScaler fitted on 'popularity', which is needed to transform new data
# or to invert the scaling.
num_scoring_users_scaler = PowerTransformer()
train_data['num_scoring_users'] = num_scoring_users_scaler.fit_transform(train_data['num_scoring_users'].values.reshape(-1, 1))

### Preprocess Number of Episodes
['num_episodes,' (integer)]

In [24]:
# Episode counts are long-tailed as well, so apply a power transform (yeo-johnson) again.
num_episodes_scaler = PowerTransformer()
num_episodes_values = train_data['num_episodes'].to_numpy().reshape(-1, 1)  # 2-D column for sklearn
train_data['num_episodes'] = num_episodes_scaler.fit_transform(num_episodes_values)

### Preprocess Average Episode Duration
['average_episode_duration,' (integer or null)]

In [25]:
# Episode duration may also benefit from a power transform (yeo-johnson).
avg_ep_scaler = PowerTransformer()
avg_ep_values = train_data['average_episode_duration'].to_numpy().reshape(-1, 1)  # 2-D column for sklearn
train_data['average_episode_duration'] = avg_ep_scaler.fit_transform(avg_ep_values)

### Preprocess Statistics
'statistics' (object or null)

In [26]:
# The feature 'num_list_users' contains inconsistent data, so it is dropped here.
# NOTE(review): the replacement feature 'statistics.sum' described originally is NOT
# created — the line computing it is commented out below. The following cell scales
# 'statistics.num_list_users' instead; confirm which of the two is intended.
train_data = train_data.drop(columns=['num_list_users'])
#train_data['statistics.sum'] = train_data['statistics.status.watching'] + train_data['statistics.status.completed'] + train_data['statistics.status.on_hold'] + train_data['statistics.status.dropped'] + train_data['statistics.status.plan_to_watch']

In [27]:
# One independent power transformer per statistics column, each kept under its
# own name so it can be reused on new data.
watching_scaler = PowerTransformer()
completed_scaler = PowerTransformer()
on_hold_scaler = PowerTransformer()
dropped_scaler = PowerTransformer()
plan_to_watch_scaler = PowerTransformer()
num_list_users_scaler = PowerTransformer()

# Fit each transformer on its column and overwrite the column with the result.
for stat_column, stat_scaler in (
    ('statistics.status.watching', watching_scaler),
    ('statistics.status.completed', completed_scaler),
    ('statistics.status.on_hold', on_hold_scaler),
    ('statistics.status.dropped', dropped_scaler),
    ('statistics.status.plan_to_watch', plan_to_watch_scaler),
    ('statistics.num_list_users', num_list_users_scaler),
):
    train_data[stat_column] = stat_scaler.fit_transform(
        train_data[stat_column].values.reshape(-1, 1)
    )

## Save the Processed Dataframe

In [28]:
# Write the processed frame to <DATA_DIR>/interim, creating the directory first:
# to_csv does not create missing parent directories and would fail on a fresh checkout.
# NOTE(review): columns holding arrays or tokenizer output are written as their string
# representation — verify that the consumer of this CSV can parse them back.
interim_dir = os.path.join(data_dir, "interim")
os.makedirs(interim_dir, exist_ok=True)
train_data.to_csv(os.path.join(interim_dir, "anime_data_processed.csv"), index=False)