## Dependencies

In [1]:
import html
import re
from pathlib import Path

import contractions
import pandas as pd

### Load Dataset

In [3]:
# Load the balanced review dataset from the raw data folder.
df = pd.read_csv(r"../data/raw/top_150_fantasy_reviews_balanced.csv")

# Cast the review column to pandas' nullable string dtype instead of plain
# `str`: `astype(str)` converts missing values into the literal text "nan",
# which makes them invisible to any later `dropna` on this column. The
# "string" dtype still stringifies real values but keeps missing ones as NA.
df["review"] = df["review"].astype("string")

print(df.head())
print(f"Dataset length: {len(df)}")

   review_id        anime_title  \
0     503754  Sousou no Frieren   
1     519189  Sousou no Frieren   
2     506314  Sousou no Frieren   
3     506301  Sousou no Frieren   
4     507559  Sousou no Frieren   

                                      review_url                   date  \
0  https://myanimelist.net/reviews.php?id=503754   Oct 13, 2023 8:38 AM   
1  https://myanimelist.net/reviews.php?id=519189  Mar 22, 2024 12:40 PM   
2  https://myanimelist.net/reviews.php?id=506314   Nov 10, 2023 3:29 PM   
3  https://myanimelist.net/reviews.php?id=506301   Nov 10, 2023 9:52 AM   
4  https://myanimelist.net/reviews.php?id=507559   Nov 25, 2023 3:17 PM   

      username  user_review_count  is_preliminary episodes_watched  \
0       Czekaj                  5            True             5/28   
1      chekkit                 25           False              NaN   
2    Hallideus                  3            True            10/28   
3     Gamer651                  9            True         

#### Remove NA/Duplicate Reviews

In [4]:
# Drop missing content
df = df.dropna(subset=["review"])
print(f"After dropping missing review: {len(df)}")

# Drop duplicates based on review
df = df.drop_duplicates(subset=["review"])
print(f"After dropping duplicates: {len(df)}")

# Keep only rows where 'review' is an actual string
df = df[df["review"].apply(lambda x: isinstance(x, str))]
print(f"After keeping only string-type reviews: {len(df)}")

After dropping missing review: 9717
After dropping duplicates: 4860
After keeping only string-type reviews: 4860


### Sentiment Preprocessing

In [5]:
def sentiment_preprocessing(text: str) -> str:
    """Normalise a raw review into lowercase text for sentiment analysis.

    The steps run in a fixed order: HTML entities are decoded, curly quotes
    are straightened, repeated quotes are collapsed, literal escape sequences
    (a backslash followed by n, r or t), URLs and @mentions are replaced by
    spaces, contractions are expanded via ``contractions.fix``, the text is
    lowercased, every character other than word characters, whitespace and
    ``! ? % '`` is replaced by a space, and finally all whitespace runs are
    collapsed to a single space.

    Args:
        text: Raw review text. Must be a ``str``.

    Returns:
        The cleaned, lowercased, single-line review text.
    """
    # 1. Decode HTML entities: &amp; → &, etc.
    text = html.unescape(text)

    # 2. Normalize curly quotes to straight quotes
    text = re.sub(r'[“”]', '"', text)        # curly double quotes
    text = re.sub(r"[‘’]", "'", text)        # curly single quotes

    # 3. Collapse any run of repeated quotes to a single quote.
    #    `{2,}` handles runs of three or more, which a plain pair
    #    replacement would leave partially duplicated (""" → "").
    text = re.sub(r'"{2,}', '"', text)
    text = re.sub(r"'{2,}", "'", text)

    # 4. Remove literal \n, \t, \r escape sequences left in the string.
    #    The repetition applies to the whole backslash+letter pair; repeating
    #    only the letter class would also swallow the first letters of the
    #    following word (e.g. a literal "\nthe" would become " he").
    text = re.sub(r'(?:\\[nrt])+', ' ', text)

    # 5. Remove URLs and mentions
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)

    # 6. Expand contractions (e.g. it's → it is)
    text = contractions.fix(text)

    # 7. Lowercase the text
    text = text.lower()

    # 8. Keep only useful punctuation: ! ? % '
    #    Remove: . , : ; ( ) etc. (\w is Unicode-aware, so non-ASCII
    #    letters and digits are kept.)
    text = re.sub(r"[^\w\s!?%']", " ", text)

    # 9. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Derive the sentiment-ready text column from the raw reviews
df["review_sentiment"] = df["review"].map(sentiment_preprocessing)

In [6]:
# Preview the raw review next to its sentiment-cleaned version
df.head()[["review", "review_sentiment"]]

Unnamed: 0,review,review_sentiment
0,"With lives so short, why do we even bother? To...",with lives so short why do we even bother? to ...
1,I feel so catered to.\n\r\nIt feels like an et...,i feel so catered to it feels like an eternity...
2,I feel some of the other reviews say it all to...,i feel some of the other reviews say it all to...
3,It's been 3 years since I last wrote up a revi...,it is been 3 years since i last wrote up a rev...
4,First time I felt compelled to write a review ...,first time i felt compelled to write a review ...


### NER & POS Preprocessing

In [7]:
def ner_pos_preprocessing(text: str) -> str:
    """Lightly clean a raw review for NER / POS tagging.

    Casing and punctuation are left untouched. The steps run in a fixed
    order: HTML entities are decoded, curly quotes are straightened, repeated
    quotes are collapsed, literal escape sequences (a backslash followed by
    n, r or t), URLs and @mentions are replaced by spaces, contractions are
    expanded via ``contractions.fix``, and finally every run of whitespace
    is collapsed to a single space.

    Note that line breaks are NOT preserved: the result is always a single
    line of text.

    Args:
        text: Raw review text. Must be a ``str``.

    Returns:
        The cleaned, single-line review text with original casing and
        punctuation.
    """
    # 1. Decode HTML entities: &amp; → &, etc.
    text = html.unescape(text)

    # 2. Normalize curly quotes to straight quotes
    text = re.sub(r'[“”]', '"', text)        # curly double quotes
    text = re.sub(r"[‘’]", "'", text)        # curly single quotes

    # 3. Collapse any run of repeated quotes to a single quote.
    #    `{2,}` handles runs of three or more, which a plain pair
    #    replacement would leave partially duplicated (""" → "").
    text = re.sub(r'"{2,}', '"', text)
    text = re.sub(r"'{2,}", "'", text)

    # 4. Remove literal \n, \t, \r escape sequences left in the string.
    #    The repetition applies to the whole backslash+letter pair; repeating
    #    only the letter class would also swallow the first letters of the
    #    following word (e.g. a literal "\nthe" would become " he").
    text = re.sub(r'(?:\\[nrt])+', ' ', text)

    # 5. Remove URLs and mentions
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)

    # 6. Expand contractions (e.g. It's → It is)
    text = contractions.fix(text)

    # 7. Collapse every whitespace run (spaces, tabs, carriage returns and
    #    line breaks) to one space and strip the ends. This single pass makes
    #    separate space/tab and around-newline clean-ups unnecessary, since
    #    their results would be collapsed here anyway.
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Derive the NER/POS-ready text column from the raw reviews
df["review_nerpos"] = df["review"].map(ner_pos_preprocessing)

In [8]:
# Preview the raw review next to its NER/POS-cleaned version
df.head()[["review", "review_nerpos"]]

Unnamed: 0,review,review_nerpos
0,"With lives so short, why do we even bother? To...","With lives so short, why do we even bother? To..."
1,I feel so catered to.\n\r\nIt feels like an et...,I feel so catered to. It feels like an eternit...
2,I feel some of the other reviews say it all to...,I feel some of the other reviews say it all to...
3,It's been 3 years since I last wrote up a revi...,It is been 3 years since I last wrote up a rev...
4,First time I felt compelled to write a review ...,First time I felt compelled to write a review ...


### Save Cleaned Dataset

In [9]:
# Make sure the output folder exists before writing: `to_csv` does not create
# missing parent directories and would otherwise fail with an OSError on a
# fresh checkout where ../data/processed has not been created yet.
output_path = Path(r'../data/processed/top_150_fantasy_reviews_cleaned_balanced.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the cleaned frame (without the index column) and preview it.
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,review_id,anime_title,review_url,date,username,user_review_count,is_preliminary,episodes_watched,recommendation,rating,...,total_reactions,nice_count,love_it_count,funny_count,confusing_count,informative_count,well_written_count,creative_count,review_sentiment,review_nerpos
0,503754,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503754,"Oct 13, 2023 8:38 AM",Czekaj,5,True,5/28,Recommended,10,...,1352,282,837,44,58,5,124,2,with lives so short why do we even bother? to ...,"With lives so short, why do we even bother? To..."
1,519189,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519189,"Mar 22, 2024 12:40 PM",chekkit,25,False,,Recommended,10,...,1223,252,794,43,52,8,70,4,i feel so catered to it feels like an eternity...,I feel so catered to. It feels like an eternit...
2,506314,Sousou no Frieren,https://myanimelist.net/reviews.php?id=506314,"Nov 10, 2023 3:29 PM",Hallideus,3,True,10/28,Recommended,10,...,222,71,107,23,13,0,8,0,i feel some of the other reviews say it all to...,I feel some of the other reviews say it all to...
3,506301,Sousou no Frieren,https://myanimelist.net/reviews.php?id=506301,"Nov 10, 2023 9:52 AM",Gamer651,9,True,10/28,Recommended,10,...,183,43,88,29,15,0,7,1,it is been 3 years since i last wrote up a rev...,It is been 3 years since I last wrote up a rev...
4,507559,Sousou no Frieren,https://myanimelist.net/reviews.php?id=507559,"Nov 25, 2023 3:17 PM",alexspace38,2,True,12/28,Recommended,10,...,157,30,79,34,11,0,3,0,first time i felt compelled to write a review ...,First time I felt compelled to write a review ...
