<a href="https://colab.research.google.com/github/paul-66666/CS497-Capstone/blob/main/Root_cause_analysis_system_for_negative_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 安装轻量工具：yake(关键词抽取)；其余库Colab自带，保险起见再装一遍
!pip -q install yake


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/356.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m348.2/356.9 kB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m356.9/356.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Run this cell only if the dataset has not been downloaded yet.
import kagglehub
# `path` is kept as a notebook-level variable: the data-loading cell below
# asserts that it exists and lists this directory to find the TSV/zip file.
path = kagglehub.dataset_download("beaglelee/amazon-reviews-us-beauty-v1-00-tsv-zip")
print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/amazon-reviews-us-beauty-v1-00-tsv-zip


In [4]:
import os, re, zipfile, json, random, math, itertools, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Layer
from tensorflow.keras.callbacks import EarlyStopping

import yake  # 关键词抽取

# ---------- Path handling: picks up `path` from the download cell ----------
# `path` (the kagglehub download directory) must already exist in the notebook
# namespace; fail early with a clear message if the download cell was skipped.
assert 'path' in globals(), "The variable 'path' was not found. Please run the cell where the data was downloaded first"

DOWNLOAD_DIR = path  # e.g. /root/.cache/kagglehub/datasets/.../versions/1

# The dataset may ship a .zip and/or a .tsv, so look for both.
# os.listdir returns entries in arbitrary order; sorting makes the choice
# deterministic when more than one file matches.
dir_entries = sorted(os.listdir(DOWNLOAD_DIR))
tsv_matches = [name for name in dir_entries if name.lower().endswith(".tsv")]
zip_matches = [name for name in dir_entries if name.lower().endswith(".zip")]
tsv_file = os.path.join(DOWNLOAD_DIR, tsv_matches[0]) if tsv_matches else None
zip_file = os.path.join(DOWNLOAD_DIR, zip_matches[0]) if zip_matches else None

# If only a zip is present, extract its first .tsv member into DOWNLOAD_DIR.
# NOTE(review): this writes into DOWNLOAD_DIR and will fail if that directory
# is read-only — confirm for the environment in use.
if tsv_file is None and zip_file is not None:
    with zipfile.ZipFile(zip_file, 'r') as zf:
        members = [m for m in zf.namelist() if m.lower().endswith(".tsv")]
        assert len(members) >= 1, "The tsv file was not found in the zip"
        # extract() returns the path it actually created, so use that instead
        # of re-joining the member name by hand.
        tsv_file = zf.extract(members[0], DOWNLOAD_DIR)

print("TSV file:", tsv_file)
# Check for None explicitly: os.path.exists(None) raises TypeError, which would
# hide the intended message when neither a .tsv nor a .zip was found.
assert tsv_file is not None and os.path.exists(tsv_file), "The TSV file does not exist. Please check if the download/decompression was successful"

# ---------- Load the data ----------
# Only the columns needed downstream are read.
usecols = [
    "star_rating",
    "review_body",
    "review_date",
    "product_title",
    "verified_purchase",
    "total_votes",
    "helpful_votes",
]
# Everything is read as text first; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside reviews are treated as ordinary text.
df = pd.read_csv(
    tsv_file,
    sep='\t',
    usecols=usecols,
    quoting=3,
    on_bad_lines='skip',
    dtype=str,
)

# Type conversion: values that cannot be parsed as numbers become NaN.
for numeric_col in ("star_rating", "total_votes", "helpful_votes"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")
df["review_body"] = df["review_body"].fillna("").astype(str)

# Keep only rows that have both a star rating and non-blank review text.
has_rating = df["star_rating"].notna()
has_text = df["review_body"].str.strip().str.len() > 0
df = df[has_rating & has_text].reset_index(drop=True)
print("Total Row：", len(df))
df.head(3)


TSV file: /kaggle/input/amazon-reviews-us-beauty-v1-00-tsv-zip/amazon_reviews_us_Beauty_v1_00.tsv
Total Row： 5115218


Unnamed: 0,product_title,star_rating,helpful_votes,total_votes,verified_purchase,review_body,review_date
0,The Naked Bee Vitmin C Moisturizing Sunscreen ...,5,0,0,Y,"Love this, excellent sun block!!",2015-08-31
1,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",5,0,0,Y,The great thing about this cream is that it do...,2015-08-31
2,"Elysee Infusion Skin Therapy Elixir, 2oz.",5,0,0,Y,"Great Product, I'm 65 years old and this is al...",2015-08-31


In [5]:
# Project goal: root-cause analysis of negative reviews. As a first step, label
# each review for a binary "negative (<= 2 stars) vs. everything else" classifier.
df["label_negative"] = (df["star_rating"] <= 2).astype(int)

# Keep the sample moderate so the notebook runs quickly on Colab
# (can be raised to 80k or 150k).
MAX_SAMPLES = 30000
if len(df) > MAX_SAMPLES:
    # Balanced down-sampling: draw at most MAX_SAMPLES // 2 rows from each class.
    # Iterating over the groups (instead of groupby.apply) keeps the label column
    # in the result and avoids the pandas deprecation warning about apply
    # operating on the grouping columns.
    per_class_cap = MAX_SAMPLES // 2
    df_small = pd.concat([
        class_rows.sample(min(len(class_rows), per_class_cap), random_state=42)
        for _, class_rows in df.groupby("label_negative")
    ])
else:
    # Copy so that later changes to df_small cannot silently modify df.
    df_small = df.copy()

print(df_small["label_negative"].value_counts())


label_negative
0    15000
1    15000
Name: count, dtype: int64


  .apply(lambda x: x.sample(min(len(x), MAX_SAMPLES//2), random_state=42)))
