# 06 — Sentiment Pipeline (Thai, lexicon v1, dummy-ready)

In [None]:
%pip -q install pandas numpy matplotlib pythainlp


In [None]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
from pythainlp.tokenize import word_tokenize

# Load the main dataset. Fail fast with a clear message when the CSV is
# missing (the message tells the user to run 02_feature_label.ipynb first).
BASE_FILE = "dataset_features_labels.csv"
if os.path.exists(BASE_FILE):
    # The first CSV column becomes the index and is parsed as dates.
    df = pd.read_csv(BASE_FILE, parse_dates=True, index_col=0)
else:
    raise FileNotFoundError(
        "❌ ไม่พบ dataset_features_labels.csv — โปรดรัน 02_feature_label.ipynb ก่อน"
    )

print("Loaded base dataset:", df.shape)


Loaded base dataset: (2565, 11)


In [None]:
# Lexicon (v1): hand-picked Thai terms used by the lexicon scorer.
# A token found in POS counts +1, a token found in NEG counts -1.
POS = {
    "บวก",
    "พุ่ง",
    "กำไร",
    "เติบโต",
    "ฟื้นตัว",
    "ดี",
    "ทะลุ",
    "สูงขึ้น",
    "แข็งแกร่ง",
    "สดใส",
}
NEG = {
    "ลบ",
    "ร่วง",
    "ขาดทุน",
    "ชะลอ",
    "ถดถอย",
    "แย่",
    "ดิ่ง",
    "ต่ำลง",
    "วิกฤต",
    "ซบเซา",
    "กังวล",
}

def sentiment_score_th(text: str) -> float:
    """Return a lexicon-based sentiment score for a piece of Thai text.

    The text is coerced with ``str()`` and tokenized with
    ``word_tokenize`` (whitespace tokens dropped). Each token found in
    ``POS`` adds 1 and each token found in ``NEG`` subtracts 1; the net
    count is divided by the square root of the token count.

    Returns 0.0 when tokenization yields no tokens.
    """
    tokens = word_tokenize(str(text), keep_whitespace=False)
    if len(tokens) == 0:
        return 0.0

    net_hits = 0
    for token in tokens:
        if token in POS:
            net_hits += 1
        if token in NEG:
            net_hits -= 1

    # Dividing by sqrt(n) rather than n dampens the length penalty.
    return net_hits / np.sqrt(len(tokens))


In [None]:
# Load the news headlines, or fall back to a small dummy set so the rest of
# the notebook can still run when no real news file is available.
NEWS_FILE = "news_th.csv"
if not os.path.exists(NEWS_FILE):
    # Dummy news: five weekly headlines starting at the first date of the
    # base dataset, all tagged with the same symbol.
    news = pd.DataFrame(
        {
            "date": pd.date_range(start=df.index.min(), periods=5, freq="7D"),
            "symbol": ["^SET50"] * 5,
            "text": [
                "หุ้นไทยพุ่งแรงหลังเศรษฐกิจฟื้นตัว",
                "ตลาดปรับตัวลบจากความกังวลเศรษฐกิจถดถอย",
                "ผลประกอบการกำไรเติบโต",
                "ความเชื่อมั่นชะลอตัว หุ้นร่วง",
                "แนวโน้มดีขึ้นอย่างต่อเนื่อง",
            ],
        }
    )
    # Write the dummy rows out as a template the user can fill in.
    news.to_csv("news_th_template.csv", index=False)
    print("⚠️ ไม่มี news_th.csv → สร้าง news_th_template.csv ให้แล้ว")
else:
    news = pd.read_csv(NEWS_FILE)
    # Convert the date column to datetimes so it can be grouped by day.
    news["date"] = pd.to_datetime(news["date"])


In [None]:
# Sentiment
# Score every headline, then average the scores per calendar day.
news["score"] = news["text"].apply(sentiment_score_th)
sent_daily = (
    news.groupby(pd.Grouper(key="date", freq="D"))["score"]
    .mean()
    .rename("Sentiment_Daily")
    .to_frame()
)
sent_daily.to_csv("sent_daily_preview.csv")

# Left join keeps every row of the base dataset. Rows with no sentiment
# value get 0.0 in the sentiment column only: a blanket
# DataFrame.fillna(0.0) would also overwrite missing values in the
# existing feature/label columns, silently turning them into zeros.
df_out = df.join(sent_daily, how="left")
df_out["Sentiment_Daily"] = df_out["Sentiment_Daily"].fillna(0.0)
df_out.to_csv("dataset_features_labels_with_sentiment.csv")
print("✅ Saved dataset_features_labels_with_sentiment.csv", df_out.shape)


✅ Saved dataset_features_labels_with_sentiment.csv (2565, 12)
