## Dependencies

In [54]:
import nltk
import os
from concurrent.futures import ProcessPoolExecutor
import multiprocessing
import pandas as pd
import re
import contractions
import spacy
from spellchecker import SpellChecker
pd.options.mode.chained_assignment = None

nltk_data_path = "../data/libs/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
nltk.download('wordnet', quiet=True)

nlp = spacy.load('en_core_web_sm')

### Load dataset

In [62]:
df = pd.read_csv(r"../data/raw/top_150_fantasy_reviews.csv")
df["review"] = df["review"].astype(str)
df.head()

Unnamed: 0,review_id,anime_title,review_url,date,username,user_review_count,is_preliminary,episodes_watched,recommendation,rating,review,total_reactions,nice_count,love_it_count,funny_count,confusing_count,informative_count,well_written_count,creative_count
0,503754,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503754,"Oct 13, 2023 8:38 AM",Czekaj,5,True,5/28,Recommended,10,"With lives so short, why do we even bother? To...",1347,281,833,44,58,5,124,2
1,519189,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519189,"Mar 22, 2024 12:40 PM",chekkit,25,False,,Recommended,10,I feel so catered to.\n\r\nIt feels like an et...,1211,248,789,43,50,8,70,3
2,519472,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519472,"Mar 24, 2024 2:03 AM",Trikkiez,3,False,,Not Recommended,4,Style-\r\nFrieren doesn't have its own unique ...,4219,630,105,1966,1355,29,123,11
3,512466,Sousou no Frieren,https://myanimelist.net/reviews.php?id=512466,"Jan 12, 2024 11:25 AM",ShabbaRico,12,True,18/28,Not Recommended,5,"TL;DR: 5/10, I don't recommend this for anyone...",931,183,28,400,267,9,42,2
4,503760,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503760,"Oct 13, 2023 9:10 AM",TheRealist68,16,True,6/28,Mixed Feelings,9,"Through 3 episodes, Frieren appears to be a un...",953,412,60,31,314,10,122,4


In [63]:
print(f"Dataset length: {len(df)}")

Dataset length: 2404


#### Remove NA/Duplicate Reviews

In [65]:
# Drop missing content
df = df.dropna(subset=["review"])
print(f"After dropping missing review: {len(df)}")

# Drop duplicates based on review
df = df.drop_duplicates(subset=["review"])
print(f"After dropping duplicates: {len(df)}")

# Keep only rows where 'review' is an actual string
df = df[df["review"].apply(lambda x: isinstance(x, str))]
print(f"After keeping only string-type reviews: {len(df)}")

After dropping missing review: 2403
After dropping duplicates: 2403
After keeping only string-type reviews: 2403


### A. Sentiment Preprocessing

In [66]:
# Copy review col to review_sentiment
df["review_sentiment"] = df["review"].copy()

#### Remove Escape Characters and Normalize Whitespace

In [67]:
# Remove \n, \r, \t
# Collapse multiple spaces
def remove_escape_chars_and_normalize_whitespaces(text):
    text = re.sub(r'\\[nrt]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["review_sentiment"] = df["review_sentiment"].apply(remove_escape_chars_and_normalize_whitespaces)
df["review_sentiment"].head()

0    With lives so short, why do we even bother? To...
1    I feel so catered to. It feels like an eternit...
2    Style- Frieren doesn't have its own unique sty...
3    TL;DR: 5/10, I don't recommend this for anyone...
4    Through 3 episodes, Frieren appears to be a un...
Name: review_sentiment, dtype: object

#### Expand contractions

In [68]:
# "can't" → "cannot", "I’m" → "I am"
def expand_contractions(text):
    try:
        return contractions.fix(text)
    except Exception:
        return text

df["review_sentiment"] = df["review_sentiment"].apply(expand_contractions)
df["review_sentiment"].head()

0    With lives so short, why do we even bother? To...
1    I feel so catered to. It feels like an eternit...
2    Style- Frieren does not have its own unique st...
3    TL;DR: 5/10, I do not recommend this for anyon...
4    Through 3 episodes, Frieren appears to be a un...
Name: review_sentiment, dtype: object

#### Remove URLs, mentions, symbols

In [69]:
# Clean web-specific junk: @username, URLs, repeated punctuation
def remove_urls_mentions_symbols(text):
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^\w\s.,!?]", "", text) # Remove non-standard characters (but keep . , ! ? for VADER)
    return text

df["review_sentiment"] = df["review_sentiment"].apply(remove_urls_mentions_symbols)
df["review_sentiment"].head()

0    With lives so short, why do we even bother? To...
1    I feel so catered to. It feels like an eternit...
2    Style Frieren does not have its own unique sty...
3    TLDR 510, I do not recommend this for anyone t...
4    Through 3 episodes, Frieren appears to be a un...
Name: review_sentiment, dtype: object

#### Lowercase text

In [70]:
def lowercase_text(text):
    text = text.lower()
    return text

df["review_sentiment"] = df["review_sentiment"].apply(lowercase_text)
df["review_sentiment"].head()

0    with lives so short, why do we even bother? to...
1    i feel so catered to. it feels like an eternit...
2    style frieren does not have its own unique sty...
3    tldr 510, i do not recommend this for anyone t...
4    through 3 episodes, frieren appears to be a un...
Name: review_sentiment, dtype: object

#### Correct Spellings

In [73]:
# Function must be defined at top level for multiprocessing to work
def correct_text(text):
    spell = SpellChecker()
    correction_cache = {}
    try:
        words = text.split()
        corrected = []
        for word in words:
            if word.lower() in correction_cache:
                corrected_word = correction_cache[word.lower()]
            else:
                corrected_word = spell.correction(word) or word
                correction_cache[word.lower()] = corrected_word
            corrected.append(corrected_word)
        return " ".join(corrected)
    except Exception:
        return text

# Use all CPU cores
num_workers = int(multiprocessing.cpu_count() / 2)

# Apply spellcheck in parallel
with ProcessPoolExecutor(max_workers=num_workers) as executor:
    results = list(executor.map(correct_text, df["review_sentiment"].tolist()))

# Assign corrected text back to DataFrame
df["review_sentiment"] = results

In [74]:
df.to_csv(r'../data/processed/top_150_fantasy_reviews_cleaned.csv', index=False)

In [75]:
df["review_sentiment"]

0       with lives so short why do we even bother to s...
1       i feel so catered to it feels like an eternity...
2       style firemen does not have its own unique sty...
3       told 510, i do not recommend this for anyone t...
4       through 3 episodes firemen appears to be a uni...
                              ...                        
2399    magi is certainly not the best shone anime eve...
2400    i just finished this anime and i am breathless...
2401    when i think magic i think action and comedy p...
2402    initial review for newcomers magi the labyrint...
2403    only saw the first season of magic which is th...
Name: review_sentiment, Length: 2403, dtype: object