## Dependencies

In [120]:
import html
import re
from pathlib import Path

import contractions
import pandas as pd

### Load dataset

In [121]:
# Load the raw scraped reviews.
df = pd.read_csv(r"../data/raw/top_150_fantasy_reviews.csv")

# Cast review text to str while leaving missing values missing.
# A blanket astype(str) can turn NaN into the literal text "nan", which
# the dropna(subset=["review"]) step further down can no longer detect.
df["review"] = df["review"].astype(str).where(df["review"].notna())

print(df.head())
print(f"Dataset length: {len(df)}")

   review_id        anime_title  \
0     503754  Sousou no Frieren   
1     519189  Sousou no Frieren   
2     519472  Sousou no Frieren   
3     512466  Sousou no Frieren   
4     503760  Sousou no Frieren   

                                      review_url                   date  \
0  https://myanimelist.net/reviews.php?id=503754   Oct 13, 2023 8:38 AM   
1  https://myanimelist.net/reviews.php?id=519189  Mar 22, 2024 12:40 PM   
2  https://myanimelist.net/reviews.php?id=519472   Mar 24, 2024 2:03 AM   
3  https://myanimelist.net/reviews.php?id=512466  Jan 12, 2024 11:25 AM   
4  https://myanimelist.net/reviews.php?id=503760   Oct 13, 2023 9:10 AM   

       username  user_review_count  is_preliminary episodes_watched  \
0        Czekaj                  5            True             5/28   
1       chekkit                 25           False              NaN   
2      Trikkiez                  3           False              NaN   
3    ShabbaRico                 12            True    

#### Remove NA/Duplicate Reviews

In [122]:
# Discard rows that have no review text at all.
df = df.loc[df["review"].notna()]
print(f"After dropping missing review: {len(df)}")

# Keep only the first occurrence of each distinct review text.
df = df.loc[~df.duplicated(subset=["review"])]
print(f"After dropping duplicates: {len(df)}")

# Guard against non-string cells so the text-cleaning functions only ever see str.
df = df[df["review"].map(lambda value: isinstance(value, str))]
print(f"After keeping only string-type reviews: {len(df)}")

After dropping missing review: 2404
After dropping duplicates: 2403
After keeping only string-type reviews: 2403


### Basic Cleaning

In [123]:
def basic_clean(text: str) -> str:
    """Apply task-agnostic normalization to a single raw review.

    The steps run in a fixed order: decode HTML entities, straighten curly
    quotes, collapse doubled quotes, drop literal escape sequences, remove
    URLs and @mentions, expand contractions, and finally collapse every run
    of whitespace (including real line breaks) into one space.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text on a single line, without leading or trailing
        whitespace.
    """
    # 1. Decode HTML entities: &amp; → &, etc.
    text = html.unescape(text)

    # 2. Normalize curly quotes to straight quotes
    text = re.sub(r'[“”]', '"', text)        # curly double quotes
    text = re.sub(r"[‘’]", "'", text)        # curly single quotes

    # 3. Collapse duplicate quotes ("" → ")
    text = re.sub(r'""', '"', text)
    text = re.sub(r"''", "'", text)

    # 4. Remove literal (backslash-escaped) \n, \t, \r sequences.
    #    The quantifier applies to the whole "\x" pair: with the quantifier on
    #    the character class alone, "\nnot" would also lose the leading "n" of
    #    "not" because the class would keep consuming n/r/t letters.
    text = re.sub(r'(?:\\[nrt])+', ' ', text)

    # 5. Remove URLs and mentions
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)

    # 6. Expand contractions (e.g. doesn't → does not)
    text = contractions.fix(text)

    # 7. Normalize whitespace (this also flattens real newlines/tabs)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Run the shared cleaning pass over every review and store it as a new column.
df["review_basic"] = df["review"].map(basic_clean)

In [124]:
# Side-by-side preview of the raw and basic-cleaned text for the first rows.
df.loc[:, ["review", "review_basic"]].head()

Unnamed: 0,review,review_basic
0,"With lives so short, why do we even bother? To...","With lives so short, why do we even bother? To..."
1,I feel so catered to.\n\r\nIt feels like an et...,I feel so catered to. It feels like an eternit...
2,Style-\r\nFrieren doesn't have its own unique ...,Style- Frieren does not have its own unique st...
3,"TL;DR: 5/10, I don't recommend this for anyone...","TL;DR: 5/10, I do not recommend this for anyon..."
4,"Through 3 episodes, Frieren appears to be a un...","Through 3 episodes, Frieren appears to be a un..."


### Sentiment Preprocessing

In [125]:
def for_sentiment(text: str) -> str:
    """Prepare text for sentiment analysis.

    Lowercases the input, replaces every character that is not a word
    character, whitespace, or one of ``! ? % '`` with a space, and then
    collapses whitespace runs to a single space.

    Args:
        text: Text to normalize.

    Returns:
        The lowercased, punctuation-reduced text with no leading or
        trailing whitespace.
    """
    lowered = text.lower()

    # Everything outside word chars, whitespace and ! ? % ' becomes a space
    # (so ". , : ; ( )" and similar are dropped).
    reduced = re.sub(r"[^\w\s!?%']", " ", lowered)

    # Squeeze the gaps left behind by the removed punctuation.
    return re.sub(r"\s+", " ", reduced).strip()

# Derive the sentiment-ready column from the basic-cleaned text.
df["review_sentiment"] = df["review_basic"].map(for_sentiment)

In [126]:
# Side-by-side preview of the raw and sentiment-ready text for the first rows.
df.loc[:, ["review", "review_sentiment"]].head()

Unnamed: 0,review,review_sentiment
0,"With lives so short, why do we even bother? To...",with lives so short why do we even bother? to ...
1,I feel so catered to.\n\r\nIt feels like an et...,i feel so catered to it feels like an eternity...
2,Style-\r\nFrieren doesn't have its own unique ...,style frieren does not have its own unique sty...
3,"TL;DR: 5/10, I don't recommend this for anyone...",tl dr 5 10 i do not recommend this for anyone ...
4,"Through 3 episodes, Frieren appears to be a un...",through 3 episodes frieren appears to be a uni...


### NER and POS Preprocessing

In [127]:
def for_ner_pos(text: str) -> str:
    """Prepare text for NER / POS tagging while keeping line breaks.

    Runs of spaces, tabs and carriage returns are collapsed to one space,
    spaces directly around a newline are removed, and the result is
    stripped. Case and punctuation are left untouched.

    Args:
        text: Text to normalize.

    Returns:
        The text with horizontal whitespace normalized and newline
        characters preserved.
    """
    # Horizontal whitespace (and stray \r) shrink to a single space;
    # \n is deliberately not part of the class so line breaks survive.
    single_spaced = re.sub(r'[ \t\r]+', ' ', text)

    # Avoid "word \n word": drop the spaces hugging each line break.
    tight_breaks = re.sub(r' *\n *', '\n', single_spaced)

    return tight_breaks.strip()

# Derive the NER/POS column from the basic-cleaned text.
# NOTE(review): basic_clean already collapses every whitespace run, newlines
# included, into a single space, so no line breaks reach for_ner_pos here.
df["review_nerpos"] = df["review_basic"].map(for_ner_pos)

In [128]:
# Side-by-side preview of the raw and NER/POS-ready text for the first rows.
df.loc[:, ["review", "review_nerpos"]].head()

Unnamed: 0,review,review_nerpos
0,"With lives so short, why do we even bother? To...","With lives so short, why do we even bother? To..."
1,I feel so catered to.\n\r\nIt feels like an et...,I feel so catered to. It feels like an eternit...
2,Style-\r\nFrieren doesn't have its own unique ...,Style- Frieren does not have its own unique st...
3,"TL;DR: 5/10, I don't recommend this for anyone...","TL;DR: 5/10, I do not recommend this for anyon..."
4,"Through 3 episodes, Frieren appears to be a un...","Through 3 episodes, Frieren appears to be a un..."


### Save Cleaned Dataset

In [130]:
# Write the cleaned frame (original columns plus the three derived text
# columns) to disk. to_csv does not create missing folders, so make sure the
# target directory exists first; otherwise a fresh checkout fails here.
output_path = Path(r'../data/processed/top_150_fantasy_reviews_cleaned.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,review_id,anime_title,review_url,date,username,user_review_count,is_preliminary,episodes_watched,recommendation,rating,...,nice_count,love_it_count,funny_count,confusing_count,informative_count,well_written_count,creative_count,review_basic,review_sentiment,review_nerpos
0,503754,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503754,"Oct 13, 2023 8:38 AM",Czekaj,5,True,5/28,Recommended,10,...,281,833,44,58,5,124,2,"With lives so short, why do we even bother? To...",with lives so short why do we even bother? to ...,"With lives so short, why do we even bother? To..."
1,519189,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519189,"Mar 22, 2024 12:40 PM",chekkit,25,False,,Recommended,10,...,248,789,43,50,8,70,3,I feel so catered to. It feels like an eternit...,i feel so catered to it feels like an eternity...,I feel so catered to. It feels like an eternit...
2,519472,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519472,"Mar 24, 2024 2:03 AM",Trikkiez,3,False,,Not Recommended,4,...,630,105,1966,1355,29,123,11,Style- Frieren does not have its own unique st...,style frieren does not have its own unique sty...,Style- Frieren does not have its own unique st...
3,512466,Sousou no Frieren,https://myanimelist.net/reviews.php?id=512466,"Jan 12, 2024 11:25 AM",ShabbaRico,12,True,18/28,Not Recommended,5,...,183,28,400,267,9,42,2,"TL;DR: 5/10, I do not recommend this for anyon...",tl dr 5 10 i do not recommend this for anyone ...,"TL;DR: 5/10, I do not recommend this for anyon..."
4,503760,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503760,"Oct 13, 2023 9:10 AM",TheRealist68,16,True,6/28,Mixed Feelings,9,...,412,60,31,314,10,122,4,"Through 3 episodes, Frieren appears to be a un...",through 3 episodes frieren appears to be a uni...,"Through 3 episodes, Frieren appears to be a un..."
