In [6]:
import nltk
import os
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords
import datetime
import contractions

# Project-local directory where the NLTK resources used by this notebook are
# downloaded.
nltk_lib_path = "../libs/nltk_data"

os.makedirs(nltk_lib_path, exist_ok=True)

# Register the directory with NLTK only once. An unconditional append adds a
# duplicate search-path entry every time this cell is re-run.
if nltk_lib_path not in nltk.data.path:
    nltk.data.path.append(nltk_lib_path)

nltk.download('stopwords', download_dir=nltk_lib_path)
nltk.download('vader_lexicon', download_dir=nltk_lib_path)
# Disabled downloads, kept for reference (they now point at the directory
# variable that is actually defined in this cell):
# nltk.download('wordnet', download_dir=nltk_lib_path)
# nltk.download('omw-1.4', download_dir=nltk_lib_path)
# nltk.download('punkt_tab', download_dir=nltk_lib_path)

print(f"NLTK Data Path: {nltk.data.path}")

[nltk_data] Downloading package stopwords to ../libs/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to ../libs/nltk_data...


NLTK Data Path: ['/home/miellilas/nltk_data', '/home/miellilas/Documents/pba/myanimelist/venv/nltk_data', '/home/miellilas/Documents/pba/myanimelist/venv/share/nltk_data', '/home/miellilas/Documents/pba/myanimelist/venv/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '../libs/nltk_data']


In [7]:
# Load spaCy's small English pipeline; preprocess_text() calls `nlp` to get
# tokens with lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')

# Project-local NLTK data directory. The append is guarded so that re-running
# this cell (or having registered the same directory earlier) does not pile up
# duplicate entries in NLTK's search path.
nltk_data_path = "../libs/nltk_data"
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

In [10]:
# Negation cues that must survive stop-word filtering so preprocess_text()
# can still see them.
negation_words = {"no", "not", "nor", "never", "n't", "dont"}

# NLTK's English stop-word list with the negation cues taken back out.
stop_words = set(stopwords.words("english")).difference(negation_words)

# Lookup applied to each lemma after lemmatization: repairs known lemmatizer
# mistakes and rewrites apostrophe-less contractions as explicit negations.
post_lemmatization_corrections = dict(
    datum="data",
    cannot="can_not",
    dont="do_not",
    doesnt="does_not",
    wont="will_not",
    cant="can_not",
    isnt="is_not",
    wasnt="was_not",
    arent="are_not",
)

def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Pipeline: expand contractions, lowercase, strip URLs / e-mail addresses /
    @mentions, replace every "!" with an ``exclamation`` token, delete every
    character that is not a lowercase ASCII letter, apostrophe or whitespace,
    then run spaCy and keep the lemmas. A negation word directly followed by
    an adjective, verb, adverb or noun is fused with it (``not_good``).

    Args:
        text: Raw review text. Any value that is not a non-blank ``str``
            (for example a missing value) produces "".

    Returns:
        str: The kept lemmas joined by single spaces. "" is returned for
        blank / non-string input or when spaCy fails on the text.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""

    # Expand contractions on a best-effort basis. If the library raises, keep
    # the unexpanded text instead of returning "" — returning "" silently
    # discards an otherwise valid review.
    try:
        text = contractions.fix(text)
    except Exception as e:
        print(f"[Warning] Contractions expansion failed for: {text}\nError: {e}")

    # Lowercase
    text = text.lower()

    # Remove URLs, emails
    text = re.sub(r"http\S+|www\S+|\S+@\S+", "", text)

    # Remove mentions
    text = re.sub(r"@\w+", "", text)

    # Replace exclamations with a tag (optional emphasis cue). The tag must be
    # lowercase: the filter below deletes everything outside [a-z'\s], so an
    # uppercase tag would be erased before it ever reached the tokenizer.
    text = re.sub(r"!", " exclamation ", text)

    # Remove non-alpha characters except apostrophes
    text = re.sub(r"[^a-z'\s]", "", text)

    # Tokenize & lemmatize
    try:
        doc = nlp(text)
    except Exception as e:
        print(f"[Warning] spaCy failed to process: {text}\nError: {e}")
        return ""

    tokens = []
    skip_next = False  # set when the next token was already fused into a negation

    for i, token in enumerate(doc):
        if skip_next:
            skip_next = False
            continue

        lemma = token.lemma_.lower()
        lemma = post_lemmatization_corrections.get(lemma, lemma)

        # Preserve negation + meaningful word (negation tagging).
        # NOTE(review): the membership test uses the *corrected* lemma, so a
        # negation word that the corrections table rewrites (e.g. "dont")
        # never takes this branch — confirm that is intended.
        if lemma in negation_words and i + 1 < len(doc):
            next_token = doc[i + 1]
            if next_token.pos_ in {"ADJ", "VERB", "ADV", "NOUN"}:
                next_lemma = next_token.lemma_.lower()
                next_lemma = post_lemmatization_corrections.get(next_lemma, next_lemma)
                tokens.append(f"{lemma}_{next_lemma}")
                skip_next = True
                continue
            else:
                # Negation not followed by a content word: keep it on its own.
                tokens.append(lemma)
        elif lemma not in stop_words and token.is_alpha and len(lemma) > 1:
            tokens.append(lemma)

    return " ".join(tokens)


In [11]:
# Load the raw review data.
df = pd.read_csv("../data/raw/top_150_fantasy_reviews.csv")

# Clean every review; preprocess_text() returns "" for non-string or blank cells.
df["processed_review"] = df["Review"].apply(preprocess_text)

# Keep only rows whose cleaned text is non-empty.
df = df[df["processed_review"].str.strip().astype(bool)]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

# Timestamp the file name so successive runs do not overwrite each other.
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")
df.to_csv(f"{processed_dir}/{date_today}_df_cleaned.csv", index=False)


Story 9      Art 10     Sound  9     Character 8     Enjoyment 8     Overall 8.5


But , not the other one. This Show has charmed me To it's plot. May Spoilers...

A girl with red hair,  A mysterious air, A smooth scenario, and a lot to say about a little show. But unfortunately i
                  ...
don't have much time for this review. İf you interested in these cases, then check the Ova version. I think this is the way worth if we compare the other one. And again i think This one is more original than the other. So that was a review Have a time can be called good for you
don't have much time for this review. İf you interested in these cases, then check the Ova version. I think this is the way worth if we compare the other one. And again i think This one is more original than the other. So that was a review Have a time can be called good for you
Error: string index out of range


In [12]:
# Peek at the first three cleaned reviews to sanity-check the preprocessing.
df.loc[:, "processed_review"].head(3)

0    life short even bother someone live thousand y...
1    feel catered feel like eternity since give phe...
2    style frieren not_have unique style way feel l...
Name: processed_review, dtype: object