<a href="https://colab.research.google.com/github/paul-66666/CS497-Capstone/blob/main/Root_cause_analysis_system_for_negative_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the lightweight tool yake (keyword extraction); the other libraries ship with Colab, so only yake is installed here.
!pip -q install yake


In [None]:
# Run this cell only if the dataset has not been downloaded yet in this session.
# The value returned by dataset_download is stored in the global `path`, which
# the data-loading cell further down requires to be defined.
import kagglehub
path = kagglehub.dataset_download("beaglelee/amazon-reviews-us-beauty-v1-00-tsv-zip")
print("Path to dataset files:", path)


In [None]:
import os, re, zipfile, json, random, math, itertools, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Layer
from tensorflow.keras.callbacks import EarlyStopping

import yake  # 关键词抽取

# ---------- Path handling: picks up `path` from the download cell ----------
# `path` is the kagglehub download directory produced by an earlier cell; stop
# right away with a readable message if that cell has not been run.
assert 'path' in globals(), "未找到变量 `path`，请先运行你下载数据的单元格。"

DOWNLOAD_DIR = path  # e.g. /root/.cache/kagglehub/datasets/.../versions/1
# The dataset may ship a .zip and/or a .tsv, so look for both at the top level
# of the download directory.
tsv_file = None
zip_file = None
for fname in os.listdir(DOWNLOAD_DIR):
    lowered = fname.lower()
    if lowered.endswith(".tsv"):
        tsv_file = os.path.join(DOWNLOAD_DIR, fname)
    if lowered.endswith(".zip"):
        zip_file = os.path.join(DOWNLOAD_DIR, fname)

# Only a zip is present: extract its first .tsv member into the download dir.
if tsv_file is None and zip_file is not None:
    with zipfile.ZipFile(zip_file, 'r') as zf:
        members = [m for m in zf.namelist() if m.lower().endswith(".tsv")]
        assert len(members) >= 1, "zip里没找到tsv文件"
        # extract() returns the path it actually created; use that rather than
        # re-joining the member name, which can differ once the archive name
        # has been sanitised (leading slashes, ".." components).
        tsv_file = zf.extract(members[0], DOWNLOAD_DIR)

print("TSV file:", tsv_file)
# os.path.exists(None) raises TypeError, so check for None first; otherwise the
# message below would never be shown when no .tsv/.zip was found at all.
assert tsv_file is not None and os.path.exists(tsv_file), "TSV 文件不存在，请检查下载/解压是否成功"

# ---------- Load the data ----------
# Only these columns are read from the TSV; everything is loaded as text first
# and converted explicitly afterwards.
usecols = [
    "star_rating",
    "review_body",
    "review_date",
    "product_title",
    "verified_purchase",
    "total_votes",
    "helpful_votes",
]
df = pd.read_csv(
    tsv_file,
    sep='\t',
    usecols=usecols,
    quoting=3,
    on_bad_lines='skip',
    dtype=str,
)

# Type conversion: values that cannot be parsed as numbers become NaN.
for numeric_col in ("star_rating", "total_votes", "helpful_votes"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")
# Missing review text becomes an empty string so the string ops below are safe.
df["review_body"] = df["review_body"].fillna("").astype(str)

# Keep only rows that have both a star rating and non-blank review text.
has_rating = df["star_rating"].notna()
has_text = df["review_body"].str.strip() != ""
df = df[has_rating & has_text].reset_index(drop=True)
print("Total Row：", len(df))
# Preview the first three rows (shown when this is the cell's last expression).
df.head(3)
