In [1]:
import os
import re
import ast
import pickle
from datetime import datetime
import unicodedata
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import textacy.preprocessing as tprep
from transformers import DistilBertTokenizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Read the project locations from the environment (.env is loaded first).
load_dotenv()
data_dir = os.getenv("DATA_DIR")
root_dir = os.getenv("ROOT_DIR")
# Fail fast with a readable message: without this check an unset variable only
# surfaces later as an opaque TypeError raised by os.path.join(None, ...).
for _env_name, _env_value in (("DATA_DIR", data_dir), ("ROOT_DIR", root_dir)):
    if _env_value is None:
        raise EnvironmentError(
            f"Environment variable {_env_name} is not set (checked the process environment and .env)"
        )
data_path = os.path.join(data_dir, "interim", "interim_full_anime_data.json")
raw_data = pd.read_json(data_path, orient='records')
# For autoencoder training we drop every row that contains a NaN
raw_data = raw_data.dropna()
# Optional (currently disabled): drop irrelevant columns
#raw_data = raw_data.drop(columns=['created_at', 'updated_at', 'related_manga', 'recommendations', 'main_picture.medium', 'main_picture.large'])
# Fixed seed keeps the 80/20 split reproducible between runs
train_data, test_data = train_test_split(raw_data, test_size=0.2, random_state=42)

# Anime features and their format

'id,' # Anime ID (integer)\
✅'title,' # Anime title (string)\
✅'synopsis,' # Anime synopsis (string or null)\
✅'mean,' # Mean score (float or null)\
✅'popularity,' # Popularity rank (integer or null)\
✅'num_list_users,' # Number of users who have the anime in their list (integer)\
✅'num_scoring_users,' # Number of users who have scored the anime (integer)\
✅'nsfw,' # NSFW classification (white=sfw, gray=partially, black=nsfw) (string or null)\
✅'genres,' # Genres (array of objects)\
✅'studios,' # Studios (array of objects)\
✅'num_episodes,' # Number of episodes (integer)\
✅'average_episode_duration,' # Average duration of an episode (integer or null)\
✅'status,' # Airing status (string)\
✅'rating,' # Age rating (string or null) (g, pg, pg_13, r, r+, rx)\
✅'source,' # Source (string or null)\
✅'media_type,' # Media type (string)\
🛑'created_at,' # Date of creation (string <date-time>)\
🛑'updated_at,' # Date of last update (string <date-time>)\
✅'start_season,' # Start season (object or null)\
✅'start_date,' # Start date (string or null)\
✅'end_date,' # End date (string or null)\
🚫(no longer supported)'related_anime,' # Related anime (array of objects)\
🚫(no longer supported)'related_manga,' # Related manga (array of objects)\
🚫(no longer supported)'recommendations,' # Recommendations (array of objects)\
🚫(no longer supported)'statistics' # Statistics (object or null)

In [3]:
# Registry of saved preprocessing-model paths, filled in by save_model().
# The 'simple' section is created up front for the models saved with sub_dict='simple'.
config = {'simple': {}}

In [4]:
def save_model(model, model_name, sub_dict=None, save=True):
    """Register a preprocessing object in ``config`` and optionally pickle it.

    The target path is
    ``<root_dir>/src/preprocessing/models/<sub_dict>/<model_name with '.' -> '_'>.pkl``.
    The path is recorded in the module-level ``config`` dict even when ``save``
    is False.

    Args:
        model: Object to serialise with ``pickle`` (e.g. a fitted scaler/encoder).
        model_name: Key under which the path is stored in ``config``.
        sub_dict: Optional section of ``config``; it is also used as a
            sub-directory of the models folder. Falsy values mean "top level".
        save: When False nothing is written to disk.
    """
    sub_dict = sub_dict or ''
    file_name = model_name.replace('.', '_') + '.pkl'
    model_path = os.path.join(root_dir, 'src', 'preprocessing', 'models', sub_dict, file_name)
    if sub_dict:
        # setdefault: sections other than the pre-created 'simple' one no
        # longer raise KeyError on first use.
        config.setdefault(sub_dict, {})[model_name] = model_path
    else:
        config[model_name] = model_path
    if save:
        # The models directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

## Preprocess Date/Season Information

### Preprocess Dates
['start_season,' # Start season (object or null)]\
['start_date,' # Start date (string or null)]\
['end_date,' # End date (string or null)]\
['start_season.year,' # Start year (object or null)]

In [5]:
def process_dates(data, save=True):
    """Turn the start/end dates into a scaled duration and scale the start year.

    ``start_date`` and ``end_date`` are parsed, replaced by a ``time_diff``
    column (absolute number of days between them) and dropped. ``time_diff`` is
    then transformed with a ``PowerTransformer`` and ``start_season.year`` with
    a ``MinMaxScaler``; both fitted objects are passed to ``save_model``.

    Note: the parsed ``start_date``/``end_date`` and the raw ``time_diff``
    column are written into the frame that was passed in; the returned frame
    is a new object without the two date columns.

    Args:
        data: DataFrame with ``start_date``, ``end_date`` and
            ``start_season.year`` columns.
        save: Forwarded to ``save_model``.

    Returns:
        The processed DataFrame.

    Raises:
        ValueError: If a date value is neither missing, numeric nor a string,
            or if a string does not parse as ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``.
    """
    # Raw strings avoid the invalid-escape warning for "\d"; compiling once
    # here avoids recompiling the patterns for every row.
    full_date_re = re.compile(r"\d{4}-\d{2}-\d{2}")
    year_month_re = re.compile(r"\d{4}-\d{2}")

    def safe_date_convert(date):
        """Parse one raw value into a ``datetime.date`` (``None`` if missing)."""
        if pd.isna(date):
            return None
        # Years may arrive as numbers (e.g. 2005.0); accept ints and numpy
        # scalars as well as plain floats. bool is excluded on purpose.
        if isinstance(date, (int, float, np.integer, np.floating)) and not isinstance(date, bool):
            return datetime.strptime(str(int(date)), '%Y').date()
        if isinstance(date, str):
            # fullmatch keeps the format selection consistent with what
            # strptime will accept (it rejects leading/trailing extras anyway).
            if full_date_re.fullmatch(date):
                return datetime.strptime(date, '%Y-%m-%d').date()
            if year_month_re.fullmatch(date):
                return datetime.strptime(date, '%Y-%m').date()
            return datetime.strptime(date, '%Y').date()
        raise ValueError(f"Invalid date format: {date}, {type(date)}")

    def time_diff(start_date, end_date):
        """Absolute number of days between two dates (``None`` if either is missing)."""
        if pd.isna(start_date) or pd.isna(end_date):
            return None
        return abs((end_date - start_date).days)

    # Convert dates to date objects
    data['start_date'] = data['start_date'].apply(safe_date_convert)
    data['end_date'] = data['end_date'].apply(safe_date_convert)
    # Calculate time difference
    data['time_diff'] = data.apply(lambda x: time_diff(x['start_date'], x['end_date']), axis=1)
    data = data.drop(columns=['start_date', 'end_date'])
    # Scale time_diff
    td_scaler = PowerTransformer()
    data['time_diff'] = td_scaler.fit_transform(data['time_diff'].values.reshape(-1, 1))
    save_model(td_scaler, 'time_diff', save=save)
    # Scale start_season.year
    year_scaler = MinMaxScaler()
    data['start_season.year'] = year_scaler.fit_transform(data['start_season.year'].values.reshape(-1, 1))
    save_model(year_scaler, 'start_season_year', save=save)
    return data

### Preprocess Start Season
['start_season,' (object or null)]

In [6]:
def preprocess_season(data, save=True):
    """Encode ``start_season.season`` as a sine/cosine pair.

    The season labels are first turned into integer codes with a
    ``LabelEncoder`` (handed to ``save_model``); the codes are then projected
    onto the unit circle and the integer column is dropped from the returned
    frame.

    Args:
        data: DataFrame with a ``start_season.season`` column.
        save: Forwarded to ``save_model``.

    Returns:
        DataFrame with ``start_season.season_sin`` / ``start_season.season_cos``
        instead of ``start_season.season``.
    """
    season_col = 'start_season.season'

    season_encoder = LabelEncoder()
    data[season_col] = season_encoder.fit_transform(data[season_col])
    save_model(season_encoder, 'start_season_season', save=save)

    # NOTE(review): LabelEncoder numbers its classes in sorted order, so the
    # position on the circle follows the alphabetical order of the season
    # names rather than calendar order - confirm this is intended.
    period = len(season_encoder.classes_)
    data[season_col + '_sin'] = np.sin(2 * np.pi * data[season_col]/period)
    data[season_col + '_cos'] = np.cos(2 * np.pi * data[season_col]/period)

    return data.drop(columns=[season_col])

## Preprocess Text
['synopsis,' (string or null)]\
`related` column (a one-element list holding the cleaned `title`; the `title` column itself is dropped)

In [7]:
def preprocess_text(data):
    """Clean the ``synopsis`` text and derive the ``related`` column from ``title``.

    ``synopsis`` is cleaned in place. ``related`` becomes a one-element list
    holding the cleaned title, and ``title`` is dropped from the returned frame.

    Args:
        data: DataFrame with string ``synopsis`` and ``title`` columns.

    Returns:
        DataFrame without ``title`` and with the new ``related`` column.
    """
    # textacy normalisers, applied in this order after the manual replacements
    normalizers = (
        tprep.normalize.hyphenated_words,
        tprep.normalize.quotation_marks,
        tprep.normalize.bullet_points,
        tprep.normalize.whitespace,
        tprep.remove.accents,
    )
    # (pattern, replacement, flags) triples, applied in this order
    substitutions = (
        (r'<.*?>', '', 0),                                   # HTML tags, if any
        (r"\([\s+]?source.*?\)+", "", re.IGNORECASE),        # "(Source: ...)" citations
        (r"\[Writ.*?by.*?\]", "", 0),                        # "[Writ...by...]" credits
        (r"\s+", " ", 0),                                    # collapse whitespace runs
    )

    def clean_text(text):
        """Return a normalised, citation-free, stripped copy of ``text``."""
        text = unicodedata.normalize('NFKC', text)
        # en dash -> hyphen, multiplication sign -> x
        text = text.replace('\u2013', '\u002d').replace('\u00d7', '\u0078')
        for normalize in normalizers:
            text = normalize(text)
        for pattern, replacement, flags in substitutions:
            text = re.sub(pattern, replacement, text, flags=flags)
        return text.strip()

    data['synopsis'] = data['synopsis'].apply(clean_text)

    # 'related' currently only contains the anime's own cleaned title
    # (related_anime is no longer part of the input).
    data['related'] = data['title'].apply(lambda title: [clean_text(title)])
    data = data.drop(columns=['title'])
    return data

## Preprocess Tag & Label Information

### Preprocess Genre Labels
['genres,' (array of objects, may be empty)]

In [8]:
def preprocess_genres(data, save=True):
    """Multi-hot encode the ``genres`` column against a fixed genre vocabulary.

    Each entry of ``genres`` (an iterable of objects with a ``'name'`` key) is
    reduced to the set of its names and encoded with a ``MultiLabelBinarizer``
    fitted on the fixed vocabulary below, so the vector layout does not depend
    on the rows present in ``data``. Every cell ends up holding an array of
    shape ``(1, n_genres)``. The column is replaced in place.

    Args:
        data: DataFrame with a ``genres`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the encoded ``genres`` column.
    """
    genres = {
        # Genres
        'Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love', 'Comedy',
        'Drama', 'Fantasy', 'Girls Love', 'Gourmet', 'Horror', 'Mystery', 'Romance',
        'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural', 'Suspense',
        # Explicit genres
        'Ecchi', 'Erotica', 'Hentai',
        # Themes
        'Adult Cast', 'Anthropomorphic', 'CGDCT', 'Childcare', 'Combat Sports',
        'Crossdressing', 'Delinquents', 'Detective', 'Educational', 'Gag Humor',
        'Gore', 'Harem', 'High Stakes Game', 'Historical', 'Idols (Female)',
        'Idols (Male)', 'Isekai', 'Iyashikei', 'Love Polygon', 'Magical Sex Shift',
        'Mahou Shoujo', 'Martial Arts', 'Mecha', 'Medical', 'Military', 'Music',
        'Mythology', 'Organized Crime', 'Otaku Culture', 'Parody', 'Performing Arts',
        'Pets', 'Psychological', 'Racing', 'Reincarnation', 'Reverse Harem',
        'Romantic Subtext', 'Samurai', 'School', 'Showbiz', 'Space', 'Strategy Game',
        'Super Power', 'Survival', 'Team Sports', 'Time Travel', 'Vampire',
        'Video Game', 'Visual Arts', 'Workplace',
        # Demographics
        'Josei', 'Kids', 'Seinen', 'Shoujo', 'Shounen',
    }

    def process(entry):
        """Collect the genre names of one row into a set."""
        return {genre['name'] for genre in entry}

    # data['genres'] = data['genres'].apply(ast.literal_eval)
    data['genres'] = data['genres'].apply(process)

    genre_mlb = MultiLabelBinarizer()
    genre_mlb.fit([genres])
    # Encode every row in one transform call (as preprocess_studios does)
    # instead of calling transform once per row, then store each row as a
    # (1, n_genres) array as before.
    encoded = genre_mlb.transform(data['genres'])
    data['genres'] = [row.reshape(1, -1) for row in encoded]
    save_model(genre_mlb, 'genres', save=save)
    return data

### Preprocess Studio Labels
['studios,' (array of objects, may be empty)]

In [9]:
def preprocess_studios(data, save=True):
    """Hash the studio names of every row into a fixed-width vector.

    Each entry of ``studios`` (an iterable of objects with a ``'name'`` key) is
    reduced to the list of its names and hashed into 75 features with a
    ``FeatureHasher``. Every cell ends up holding an array of shape ``(1, 75)``.
    The column is replaced in place and the hasher is handed to ``save_model``.

    Args:
        data: DataFrame with a ``studios`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the hashed ``studios`` column.
    """
    # data['studios'] = data['studios'].apply(ast.literal_eval)
    data['studios'] = data['studios'].apply(
        lambda entry: [studio['name'] for studio in entry]
    )

    # Use FeatureHasher to encode studios
    studio_hasher = FeatureHasher(n_features=75, input_type='string')
    hashed_matrix = studio_hasher.transform(data['studios']).toarray()
    # One (1, 75) row vector per anime
    data['studios'] = [hashed_row.reshape(1, -1) for hashed_row in hashed_matrix]
    save_model(studio_hasher, 'studios', save=save)
    return data

### Preprocess NSFW Tag
['nsfw,' (white=sfw, gray=partially, black=nsfw) (string or null)]

In [10]:
def preprocess_nsfw(data, save=True):
    """Label-encode the ``nsfw`` column in place and register the encoder.

    Args:
        data: DataFrame with an ``nsfw`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with integer ``nsfw`` codes.
    """
    encoder = LabelEncoder()
    nsfw_codes = encoder.fit_transform(data['nsfw'])
    data['nsfw'] = nsfw_codes
    save_model(encoder, 'nsfw', sub_dict='simple', save=save)
    return data

## Preprocess Source
['source,' (string or null)]

In [11]:
def preprocess_source(data, save=True):
    """Label-encode the ``source`` column in place and register the encoder.

    The encoder is fitted on the union of a fixed list of known sources and the
    values actually present in ``data``.

    Args:
        data: DataFrame with a ``source`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with integer ``source`` codes.
    """
    known_sources = [
        'other', 'original',
        'manga', '4_koma_manga', 'web_manga', 'digital_manga',
        'novel', 'light_novel', 'visual_novel',
        'game', 'card_game',
        'book', 'picture_book',
        'radio', 'music',
    ]
    # Known vocabulary plus whatever additional values occur in this frame
    vocabulary = set(known_sources) | set(data['source'].unique())

    encoder = LabelEncoder()
    encoder.fit(list(vocabulary))
    data['source'] = encoder.transform(data['source'])
    save_model(encoder, 'source', sub_dict='simple', save=save)
    return data

## Preprocess Status
['status,' (string)]

In [12]:
def preprocess_status(data, save=True):
    """Label-encode the ``status`` column in place and register the encoder.

    Args:
        data: DataFrame with a ``status`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with integer ``status`` codes.
    """
    encoder = LabelEncoder()
    status_codes = encoder.fit_transform(data['status'])
    data['status'] = status_codes
    save_model(encoder, 'status', sub_dict='simple', save=save)
    return data

## Preprocess Media Type
['media_type,' (string)]

In [13]:
def preprocess_media_type(data, save=True):
    """Merge the special-like media types and label-encode ``media_type`` in place.

    ``'ona'``, ``'ova'`` and ``'tv_special'`` are collapsed into ``'special'``
    before the column is label-encoded. The encoder is handed to ``save_model``.

    Args:
        data: DataFrame with a ``media_type`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with integer ``media_type`` codes.
    """
    # Idea not pursued for now: re-derive media_type from the data itself, e.g.
    #   movie   = 'avg_ep_dur' > 1800
    #   tv      = 'avg_ep_dur' <= 1800 and 'num_episodes' >= 6
    #   special = ('avg_ep_dur' <= 1800 and 'num_episodes' < 6) or 'avg_ep_dur' < 240
    # These rules cover all cases, but the duration / episode thresholds looked
    # suboptimal in testing.
    merged_into_special = {'ona', 'ova', 'tv_special'}

    def collapse(media_type):
        """Map the merged types to 'special'; leave every other value as is."""
        if media_type in merged_into_special:
            return 'special'
        return media_type

    data['media_type'] = data['media_type'].apply(collapse)

    encoder = LabelEncoder()
    data['media_type'] = encoder.fit_transform(data['media_type'])
    save_model(encoder, 'media_type', save=save)
    return data

## Preprocess Rating
['rating,' (string or null) (g, pg, pg_13, r, r+, rx)]

In [14]:
def preprocess_rating(data):
    """Replace the ``rating`` labels by their ordinal rank, in place.

    The ranks are g=0, pg=1, pg_13=2, r=3, r+=4, rx=5. Values outside this list
    are mapped to NaN by ``Series.map``.

    Args:
        data: DataFrame with a ``rating`` column.

    Returns:
        The same DataFrame with numeric ``rating`` values.
    """
    # Ordered from least to most restrictive; the position is the encoded value
    ordered_ratings = ("g", "pg", "pg_13", "r", "r+", "rx")
    rank_by_rating = {label: rank for rank, label in enumerate(ordered_ratings)}
    data['rating'] = data['rating'].map(rank_by_rating)
    return data

## Preprocess Numerical Columns

### Preprocess Mean
['mean,' (float or null)]

In [15]:
def preprocess_mean(data, save=True):
    """Standardise the ``mean`` column in place and register the fitted scaler.

    Args:
        data: DataFrame with a ``mean`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the scaled ``mean`` column.
    """
    # The feature looks roughly normally distributed, hence a StandardScaler
    scaler = StandardScaler()
    column = data['mean'].values.reshape(-1, 1)
    data['mean'] = scaler.fit_transform(column)
    save_model(scaler, 'mean', sub_dict='simple', save=save)
    return data

### Preprocess Popularity
['popularity,' (integer or null)]

In [16]:
def preprocess_popularity(data, save=True):
    """Standardise the ``popularity`` column in place and register the fitted scaler.

    Args:
        data: DataFrame with a ``popularity`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the scaled ``popularity`` column.
    """
    # Any scaler other than StandardScaler seemed to distort this feature's distribution
    scaler = StandardScaler()
    column = data['popularity'].values.reshape(-1, 1)
    data['popularity'] = scaler.fit_transform(column)
    save_model(scaler, 'popularity', sub_dict='simple', save=save)
    return data

### Preprocess Number of Users Who Have Scored The Anime
['num_scoring_users,' (integer)]

In [17]:
def preprocess_num_scoring_users(data, save=True):
    """Power-transform the ``num_scoring_users`` column in place and register the transformer.

    Args:
        data: DataFrame with a ``num_scoring_users`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the transformed ``num_scoring_users`` column.
    """
    # Long-tailed feature, so a power transform (yeo-johnson) is tried.
    # This may not be the best way to handle it; an alternative to look at:
    # https://arxiv.org/abs/2111.05956#:~:text=The%20visual%20world%20naturally%20exhibits,models%20based%20on%20deep%20learning.
    transformer = PowerTransformer()
    column = data['num_scoring_users'].values.reshape(-1, 1)
    data['num_scoring_users'] = transformer.fit_transform(column)
    save_model(transformer, 'num_scoring_users', sub_dict='simple', save=save)
    return data

### Preprocess Number of Episodes
['num_episodes,' (integer)]

In [18]:
def preprocess_num_episodes(data, save=True):
    """Power-transform the ``num_episodes`` column in place and register the transformer.

    Args:
        data: DataFrame with a ``num_episodes`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the transformed ``num_episodes`` column.
    """
    # Another long-tailed feature, so again a power transform (yeo-johnson)
    transformer = PowerTransformer()
    column = data['num_episodes'].values.reshape(-1, 1)
    data['num_episodes'] = transformer.fit_transform(column)
    save_model(transformer, 'num_episodes', sub_dict='simple', save=save)
    return data

### Preprocess Average Episode Duration
['average_episode_duration,' (integer or null)]

In [19]:
def preprocess_average_episode_duration(data, save=True):
    """Power-transform ``average_episode_duration`` in place and register the transformer.

    Args:
        data: DataFrame with an ``average_episode_duration`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the transformed ``average_episode_duration`` column.
    """
    # Expected to benefit from a power transform (yeo-johnson) as well
    transformer = PowerTransformer()
    column = data['average_episode_duration'].values.reshape(-1, 1)
    data['average_episode_duration'] = transformer.fit_transform(column)
    save_model(transformer, 'average_episode_duration', sub_dict='simple', save=save)
    return data

### Preprocess Statistics
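['num_list_users,' (integer)] — the 'statistics' object (object or null) is no longer supported, so this step only scales `num_list_users`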

In [20]:
def preprocess_stats(data, save=True):
    """Power-transform the ``num_list_users`` column in place and register the transformer.

    Args:
        data: DataFrame with a ``num_list_users`` column.
        save: Forwarded to ``save_model``.

    Returns:
        The same DataFrame with the transformed ``num_list_users`` column.
    """
    # NOTE(review): 'num_list_users' was flagged as containing inconsistent
    # data, with a plan to drop it in favour of a new 'statistics.sum' feature.
    # That drop is disabled; the column is currently kept and power-transformed.
    #data = data.drop(columns=['num_list_users'])
    transformer = PowerTransformer()
    column = data['num_list_users'].values.reshape(-1, 1)
    data['num_list_users'] = transformer.fit_transform(column)
    save_model(transformer, 'num_list_users', sub_dict='simple', save=save)
    return data

## Run Preprocessing

In [21]:
def main(data, save=True, full=False):
    """Run the preprocessing pipeline on ``data``.

    Text, genre and studio preprocessing always run. The remaining steps
    (dates, season, categorical encoders, rating and the numerical scalers)
    only run when ``full`` is False.

    Args:
        data: Raw anime DataFrame.
        save: Forwarded to every step that accepts it.
        full: When True, stop after the text/genre/studio steps.

    Returns:
        The preprocessed DataFrame.
    """
    # Text data
    data = preprocess_text(data)

    # Special handling
    for step in (preprocess_genres, preprocess_studios):
        data = step(data, save=save)

    if full:
        return data

    steps_before_rating = (
        # Special handling
        process_dates,
        preprocess_season,
        # Simple features
        preprocess_nsfw,
        preprocess_source,
        preprocess_status,
        # Special handling
        preprocess_media_type,
    )
    for step in steps_before_rating:
        data = step(data, save=save)

    # The rating step takes no `save` argument
    data = preprocess_rating(data)

    # Simple (numerical) features
    numeric_steps = (
        preprocess_mean,
        preprocess_popularity,
        preprocess_num_scoring_users,
        preprocess_num_episodes,
        preprocess_average_episode_duration,
        preprocess_stats,
    )
    for step in numeric_steps:
        data = step(data, save=save)
    return data

In [22]:
import json

# full=True : run the text/genre/studio steps on the whole dataset, nothing is pickled.
# full=False: fit and save all scalers/encoders on the training split.
full = True
if full:
    # main() returns a new frame (columns are dropped/replaced on copies), so
    # its return value has to be written out. Writing `raw_data` itself would
    # only contain the changes that happened to be made in place before the
    # first copy.
    full_data = main(raw_data, save=False, full=True)
    full_data.to_json(os.path.join(data_dir, "processed", "full_data_processed.json"), orient='records')
else:
    # To generate the scalers for training:
    train = True
    train_data = main(train_data)
    train_data.to_json(os.path.join(data_dir, "processed", "train_data_processed.json"), orient='records')
    # Persist the model-path registry; the context manager closes the file handle.
    with open(os.path.join(root_dir, 'configs', 'config_preprocessing.json'), 'w') as config_file:
        json.dump(config, config_file)