# 2024BDA - Midterm Project

## Import Packages

In [2]:
import os
import pickle
import re
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from collections import Counter
from tqdm import tqdm

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix

## Import Dataset

In [3]:


dataset_dir = "./dataset"

# Collect the names of the regular files (sub-directories are skipped)
# that live directly inside the dataset directory.
files = []
for entry in os.listdir(dataset_dir):
    if os.path.isfile(os.path.join(dataset_dir, entry)):
        files.append(entry)

print("View all datasets:")
print(files)

View all datasets:
['train_data.csv', 'bda2024_202203-202402_內容數據_新聞2-4.csv', 'bda2024_微股力_社群PKTD-2年-1.csv', 'test_data.csv', 'bda2024_202203-202402_內容數據_新聞3-3.csv', 'bda2024_微股力_社群PKTD-2年-6.csv', 'bda2024_202203-202402_討論數據_dcard.csv', 'bda2024_202203-202402_內容數據_新聞3-1.csv', 'bda2024_202203-202402_內容數據_新聞3-9.csv', '.DS_Store', 'bda2024_202203-202402_內容數據_新聞1-7.csv', 'bda2024_202203-202402_討論數據_mobile01-2.csv', 'bda2024_202203-202402_內容數據_新聞2-6.csv', 'bda2024_微股力_社群PKTD-2年-5.csv', 'bda2024_微股力_社群PKTD-2年-7.csv', 'bda2024_202203-202402_內容數據_新聞1-0.csv', 'bda2024_微股力_社群PKTD-2年-4.csv', 'bda2024_202203-202402_內容數據_新聞3-4.csv', 'bda2024_202203-202402_內容數據_新聞1-3.csv', 'bda2024_202203-202402_討論數據_mobile01-1.csv', 'bda2024_微股力_社群PKTD-2年-9.csv', 'bda2024_202203-202402_內容數據_新聞1-4.csv', 'bda2024_202203-202402_內容數據_新聞2-5.csv', 'bda2024_202203-202402_內容數據_新聞1-2.csv', 'bda2024_微股力_社群PKTD-2年-0.csv', 'bda2024_202203-202402_內容數據_新聞3-0.csv', 'bda2024_微股力_個股交易數據-2年.csv', 'bda2024_202203-202402_內容數據_新聞1-9.cs

# Data Preprocess

In [4]:
def load_df(filepath, preview=True, **read_csv_kwargs):
    """Read a CSV file into a DataFrame and print a short summary of it.

    Args:
        filepath: Path of the CSV file to read.
        preview: When True, also print the first rows of the frame.
        **read_csv_kwargs: Extra keyword arguments forwarded to
            ``pd.read_csv`` (for example ``dtype`` or ``usecols``).

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # With the C parser, pandas infers dtypes chunk by chunk by default, which
    # can leave a column with mixed types (and emit a DtypeWarning). Parsing
    # the file in one pass avoids that. The option is only valid for the C
    # engine, so leave the kwargs alone when another engine was requested.
    if read_csv_kwargs.get("engine") in (None, "c"):
        read_csv_kwargs.setdefault("low_memory", False)

    print(f"\n----- Loading {filepath}... -----")
    df = pd.read_csv(filepath, **read_csv_kwargs)
    print(f"Size of dataframe: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    if preview:
        print(df.head())
    return df

因為要討論 ptt / 各種論壇的情緒指標、討論聲量與產業之間的關係，因此沒有使用新聞內容的 data


In [5]:
# Forum discussion exports: Dcard, Mobile01 (two parts) and PTT.
disc_dcard_df = load_df("./dataset/bda2024_202203-202402_討論數據_dcard.csv", preview=False)
# The Dcard export calls this column 'forum'; rename it to the 'p_type'
# used by the other exports so the frames line up when concatenated.
disc_dcard_df = disc_dcard_df.rename(columns={'forum': 'p_type'})
disc_m1_df = load_df("./dataset/bda2024_202203-202402_討論數據_mobile01-1.csv", preview=False)
disc_m2_df = load_df("./dataset/bda2024_202203-202402_討論數據_mobile01-2.csv", preview=False)
disc_ptt_df = load_df("./dataset/bda2024_202203-202402_討論數據_ptt.csv", preview=False)

# Stack all discussion sources into one frame with a fresh 0..N-1 index.
disc_frames = [disc_dcard_df, disc_m1_df, disc_m2_df, disc_ptt_df]
disc_df = pd.concat(disc_frames, ignore_index=True)

# Per-stock trading data (previewed on load).
transaction_df = load_df("./dataset/bda2024_微股力_個股交易數據-2年.csv")

# Store stock symbols as strings.
transaction_df = transaction_df.assign(
    stock_symbol=transaction_df['stock_symbol'].astype(str)
)


----- Loading ./dataset/bda2024_202203-202402_討論數據_dcard.csv... -----
Size of dataframe: (231320, 10)
Columns: ['id', 'forum', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url', 'content_type']

----- Loading ./dataset/bda2024_202203-202402_討論數據_mobile01-1.csv... -----
Size of dataframe: (48725, 10)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url', 'content_type']

----- Loading ./dataset/bda2024_202203-202402_討論數據_mobile01-2.csv... -----
Size of dataframe: (157939, 10)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url', 'content_type']

----- Loading ./dataset/bda2024_202203-202402_討論數據_ptt.csv... -----
Size of dataframe: (50805, 9)
Columns: ['id', 'p_type', 's_name', 's_area_name', 'post_time', 'title', 'author', 'content', 'page_url']

----- Loading ./dataset/bda2024_微股力_個股交易數據-2年.csv... -----
Size of dataframe: (1154225, 8)
Columns: ['stock_nam

  df = pd.read_csv(filepath)


In [6]:
# Keep only the columns used downstream. The explicit .copy() makes disc_df an
# independent frame, so the column assignment below (and later ones) operates
# on real data instead of a slice and does not raise SettingWithCopyWarning.
disc_df = disc_df[["id", "post_time", "content"]].copy()

# Force every post body to str; note that missing values become the text "nan".
disc_df["content"] = disc_df["content"].astype(str)

# Document Labeling

AI 概念股：

https://www.sinotrade.com.tw/richclub/hotstock/-65af4cb1880d9e29902a677f

https://www.wantgoo.com/index/%5E435/stocks


利用「全部 AI 概念股 n 天後的股價」減掉「全部 AI 概念股今天的股價」來當作判斷，若漲幅超過 m% 則判斷為漲。

n = 7, m = 5 (暫定)


In [7]:
# Target stocks: two hand-collected lists of AI-concept stock names.
target_stock_name_1 = [
    '廣達', '緯創', '台積電', '創意', '世芯-KY', '智原', '智邦', '信驊',
    '譜瑞-KY', '日月光投控', '台達電', '光寶科', '群光', '奇鋐', '金像電', '台燿',
    '嘉澤', '技嘉', '緯穎', '英業達', '鴻海', '聯發科', '聯茂', 'M31',
]
target_stock_name_2 = [
    'AMAX-KY', '緯創', '凌群', '創意', '東元', '英業達', '原相', '長佳智能',
    '精誠', '鴻海', '華碩', '微星', '金寶', '聯發科', '世芯-KY', '京元電子',
    '美律', '亞信', '研華', '台積電', '宏碁', '走著瞧-創', '鈺創', '廣達',
    '凌華', '零壹', '台達電', '樺漢', '群電',
]

# Merge both lists and drop the names that appear in both
# (the resulting order is arbitrary).
target_stock_name = list(set(target_stock_name_1).union(target_stock_name_2))

In [8]:
# Reduce both timestamp columns to plain datetime.date values so that posts
# and trading rows can be matched by calendar day.
transaction_df['date'] = pd.to_datetime(transaction_df['date']).dt.date
disc_df['post_time'] = pd.to_datetime(disc_df['post_time']).dt.date

# Add an empty "label" column to disc_df; it is filled in later with the
# rise / fall / flat label of each post's date.
disc_df["label"] = np.nan

# Every distinct date that has trading data, in ascending order.
dates_list = sorted(set(transaction_df["date"]))
disc_df.head()

Unnamed: 0,id,post_time,content,label
0,1646109801927_F0DCU,2022-03-01,定股美股ETF長期去抓報酬也是適合的投資工具方式,
1,1646109801940_F0DCU,2022-03-01,最近剛申辦覺得定期定額投資美股很方便,
2,1646115341451_F0DCU,2022-03-01,我三百買的 給你參考,
3,1646113689192_F0DCU,2022-03-01,中鋼呢,
4,1646068286032_F0DCU,2022-03-01,有100時候怎麼沒有選擇減碼落袋為安\n現在用什麼心態在做當沖呢？？,


In [9]:
# Label every trading day by how the basket of target stocks moves over the
# next n trading days, then copy that label onto the posts written that day:
#    1 -> the basket rises by more than m
#   -1 -> the basket falls by more than m
#    0 -> the change stays within +/- m
# The basket value is the sum of today's "open" prices compared with the sum of
# the "close" prices n trading days later, taken over the target stocks that
# are quoted on both days. Posts on days that cannot be labeled keep NaN.
# NOTE(review): open (today) is compared against close (later) -- confirm this
# asymmetry is intended.

n = 7      # look-ahead horizon, counted in entries of dates_list (trading days)
m = 0.05   # relative change threshold (5%)

# Restrict the quotes to the target stocks once and split them by date, so the
# loop below no longer re-scans the full transaction table for every day.
target_quotes = transaction_df[transaction_df["stock_name"].isin(set(target_stock_name))]
quotes_by_date = dict(iter(target_quotes.groupby("date")))

label_by_date = {}
# The bound uses n (it used to be a hard-coded 7), so changing the horizon can
# neither index past the end of dates_list nor silently skip labelable days.
for i in range(len(dates_list) - n):
    today, later = dates_list[i], dates_list[i + n]
    quotes_today = quotes_by_date.get(today)
    quotes_later = quotes_by_date.get(later)
    if quotes_today is None or quotes_later is None:
        # No target stock was traded on one of the two days.
        continue

    # First quote per stock on each day, indexed by stock name.
    open_today = quotes_today.drop_duplicates("stock_name").set_index("stock_name")["open"]
    close_later = quotes_later.drop_duplicates("stock_name").set_index("stock_name")["close"]

    # Keep only the stocks that have a usable price on both days.
    basket = pd.concat([open_today, close_later], axis=1, join="inner").dropna()
    if basket.empty:
        continue

    stock_price_today = basket["open"].sum()
    stock_price_later = basket["close"].sum()
    if stock_price_today == 0:
        # A relative change is undefined for a zero base value.
        continue

    percentage_change = (stock_price_later - stock_price_today) / stock_price_today
    if percentage_change > m:
        label_by_date[today] = 1
    elif percentage_change < -m:
        label_by_date[today] = -1
    else:
        label_by_date[today] = 0

# One vectorized lookup instead of one boolean scan of disc_df per date;
# dates without a label map to NaN.
disc_df["label"] = disc_df["post_time"].map(label_by_date).astype(float)

In [10]:
# Drop the posts whose date could not be labeled (label is still null).
# .copy() turns the filtered result into an independent frame, so adding the
# post_id column below does not raise SettingWithCopyWarning.
disc_df = disc_df[disc_df["label"].notnull()].copy()

# Give the remaining posts a fresh consecutive id (0 .. N-1).
disc_df["post_id"] = range(len(disc_df))

# Export the labeled posts to CSV (without the pandas index).
disc_df.to_csv("dataset.csv", index=False)

# Document feature extraction
找出具鑑別力 (扣除共通字詞) 的關鍵字列表，合起來建構向量空間

1000維度（？），把所有資料轉換成以這個向量空間為主的向量（嗎）

先取 5000 筆當作 training data 來建構具鑑別力 (扣除共通字詞) 的關鍵字列表，合起來建構向量空間
1000 筆當作 testing data


## Data Cleaning
把文章內的空白、奇怪的字元去掉

In [20]:
import re


# Patterns are compiled once at import time instead of on every call.
# HTML tags and HTML entities (named, decimal and hexadecimal).
_HTML_RE = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
# A URL is the scheme followed by a run of printable, non-space ASCII, so the
# match stops at whitespace or at the first CJK character after the link.
_URL_RE = re.compile(r'https?://[\x21-\x7E]+')
# Any whitespace, including spaces, tabs and line terminators.
_WHITESPACE_RE = re.compile(r'\s+')
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+")


def clean_text(document: str) -> str:
    """Strip markup, links, whitespace and emoji from a post.

    Steps, in order: remove HTML tags/entities, remove URLs anywhere in the
    text, remove all whitespace (which also covers line terminators), and
    remove emoji in the ranges listed in ``_EMOJI_RE``.

    Args:
        document: Raw post text.

    Returns:
        The cleaned text; an empty string stays an empty string.
    """
    clean_document = _HTML_RE.sub('', document)
    # URLs are removed wherever they occur; the previous pattern was anchored
    # to the start of a line and therefore missed links embedded in a sentence.
    clean_document = _URL_RE.sub('', clean_document)
    # Removing \s+ already drops "\n" and "\r\n", so no extra replace is needed.
    # The former r"/[^\x20-\x7E]/gmi" substitution was a JavaScript-style regex
    # literal that matched nothing useful in Python and has been dropped.
    clean_document = _WHITESPACE_RE.sub('', clean_document)
    clean_document = _EMOJI_RE.sub('', clean_document)
    return clean_document

In [21]:
import pandas as pd
# Reload the labeled posts, clean every post body and save the result.
dataset = pd.read_csv("./dataset.csv")
dataset["content"] = dataset["content"].astype(str).map(clean_text)
dataset.to_csv("./clean_dataset.csv", index=False)
dataset.head()


Unnamed: 0,id,post_time,content,label,post_id
0,1646109801927_F0DCU,2022-03-01,定股美股ETF長期去抓報酬也是適合的投資工具方式,0.0,0
1,1646109801940_F0DCU,2022-03-01,最近剛申辦覺得定期定額投資美股很方便,0.0,1
2,1646115341451_F0DCU,2022-03-01,我三百買的給你參考,0.0,2
3,1646113689192_F0DCU,2022-03-01,中鋼呢,0.0,3
4,1646068286032_F0DCU,2022-03-01,有100時候怎麼沒有選擇減碼落袋為安現在用什麼心態在做當沖呢？？,0.0,4


## Document Vectorization

In [22]:
# Size of the TF-IDF vocabulary (passed as max_features to the vectorizer).
feature_dim = 500


In [29]:
import monpa
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
# NOTE(review): presumably switches monpa segmentation to the GPU -- confirm
# against the monpa documentation.
monpa.use_gpu(True)

# Reload the cleaned posts; missing bodies are read back as NaN, so replace
# them with empty strings before tokenizing.
dataset = pd.read_csv("./clean_dataset.csv")
dataset["content"] = dataset["content"].fillna("")

# One stopword per line. The context manager closes the file handle, which the
# previous bare open() inside a comprehension never did.
with open('./stopwords.txt', encoding='utf8') as stopwords_file:
    stopwords = [line.rstrip() for line in stopwords_file]

# TF-IDF over monpa tokens, keeping at most feature_dim terms.
vectorizer = TfidfVectorizer(
    use_idf=True, stop_words=stopwords, tokenizer=monpa.cut, max_features=feature_dim)
X = vectorizer.fit_transform(tqdm(dataset["content"]))

100%|██████████| 369694/369694 [1:07:04<00:00, 91.86it/s]


In [1]:
# Densify the TF-IDF matrix into a frame with one column per vocabulary term,
# append each post's label as the last column, and save it.
feature_names = vectorizer.get_feature_names_out()
document_vector = pd.DataFrame(X.toarray(), columns=feature_names)
document_vector["label"] = dataset["label"]
document_vector.to_csv("document_vector.csv", index=False)
document_vector.head()

NameError: name 'pd' is not defined

## Document Classification

In [6]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

In [7]:
# Load the saved document vectors and drop any leftover "Unnamed" index
# columns written by an earlier export.
df = pd.read_csv("./document_vector.csv")
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

# Class balance of the labels.
df['label'].value_counts()

label
 0.0    221157
 1.0     83263
-1.0     65274
Name: count, dtype: int64

In [8]:
# The label is the last column; every column before it is a feature.
label_position = df.shape[1] - 1
features = df.iloc[:, :label_position].to_numpy()
labels = df.iloc[:, label_position].to_numpy()
print(features.shape , labels.shape)

(369694, 500) (369694,)


In [9]:
# Work on the first 20000 rows only and hold out 10% of them for testing.
# NOTE(review): this is a head slice, not a random sample, so it only covers
# the rows that come first in the file -- confirm this is representative.
sample_size = 20000
X_train, X_test, y_train, y_test = train_test_split(
    features[:sample_size],
    labels[:sample_size],
    test_size=0.1,
    random_state=42,
)


In [10]:
# Polynomial-kernel SVM with one-vs-one decision function; verbose=True prints
# the LibSVM optimization log while fitting.
cls = svm.SVC(
    kernel='poly',
    gamma=0.5,
    C=10,
    verbose=True,
    decision_function_shape='ovo',
)
cls.fit(X_train, y_train)

[LibSVM]..................*......*
optimization finished, #iter = 24909
obj = -78145.925479, rho = 1.000543
nSV = 14358, nBSV = 7606
.........*....*
optimization finished, #iter = 13379
obj = -25412.323271, rho = -0.999431
nSV = 5648, nBSV = 2490
............*.....*
optimization finished, #iter = 17960
obj = -27089.437300, rho = -0.999444
nSV = 7948, nBSV = 2365
Total nSV = 16517


In [11]:
# Predict the label of every held-out document with the fitted SVM.
pred = cls.predict(X_test)

In [12]:
# Per-class precision / recall / F1 of the SVM on the held-out split.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

        -1.0       0.39      0.07      0.12       556
         0.0       0.63      0.96      0.76      1249
         1.0       0.70      0.04      0.07       195

    accuracy                           0.62      2000
   macro avg       0.57      0.36      0.32      2000
weighted avg       0.57      0.62      0.52      2000



## Document Classification with Deep Learning

Use `yiyanghkust/finbert-tone-chinese` to classify the text. The model is fine-tuned with financial-domain knowledge; for background, see: [link](https://arxiv.org/abs/1908.10063)

In [55]:
from transformers import TextClassificationPipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import BertTokenizerFast

import numpy as np
# Load the pretrained model and its tokenizer from the Hugging Face hub.
model_path = "yiyanghkust/finbert-tone-chinese"
new_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    output_attentions=True,
)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# return_all_scores=True makes the pipeline return the score of every class
# rather than only the best one; convert() relies on that.
PipelineInterface = TextClassificationPipeline(
    model=new_model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

# Smoke test on a single headline.
label = PipelineInterface("改裝車燈廠巨鎧精密新廠啟用 宣示今年起進快速成長期")


def convert(nlp_result: list) -> int:
    """Turn the pipeline output for one text into a label in {-1, 0, 1}.

    Args:
        nlp_result: Pipeline output whose first element is a list of dicts,
            one per class, each holding a "score" entry.

    Returns:
        A built-in ``int``: class index 0 maps to 0, index 1 to 1 and index 2
        to -1, where the index is the position of the highest score.
        NOTE(review): this assumes the model orders its classes as
        neutral / positive / negative -- confirm against the model config.
    """
    scores = [entry["score"] for entry in nlp_result[0]]
    # np.argmax returns a NumPy integer; coerce it so the function really
    # returns the plain int promised by its annotation.
    best_index = int(np.argmax(scores))
    return (best_index + 1) % 3 - 1

# Show the converted label of the smoke-test headline.
print(convert(label))

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

-1


In [31]:
import pandas as pd
# Reload the cleaned, labeled posts (raw text rather than TF-IDF vectors).
doc_df = pd.read_csv("./clean_dataset.csv")
doc_df.head()

Unnamed: 0.1,Unnamed: 0,id,post_time,content,label,post_id
0,0,1646109801927_F0DCU,2022-03-01,定股美股ETF長期去抓報酬也是適合的投資工具方式,0.0,0
1,1,1646109801940_F0DCU,2022-03-01,最近剛申辦覺得定期定額投資美股很方便,0.0,1
2,2,1646115341451_F0DCU,2022-03-01,我三百買的給你參考,0.0,2
3,3,1646113689192_F0DCU,2022-03-01,中鋼呢,0.0,3
4,4,1646068286032_F0DCU,2022-03-01,有100時候怎麼沒有選擇減碼落袋為安現在用什麼心態在做當沖呢？？,0.0,4


In [32]:
# Post bodies; bodies that are missing in the CSV are read back as NaN, so
# replace them with "" to avoid feeding the literal text "nan" to the model.
contents = doc_df["content"].fillna("").to_list()

# Take the labels from doc_df itself rather than from the unrelated `df` of
# the SVM section, so this cell does not depend on that frame still being in
# memory or on its rows lining up with doc_df.
labels = doc_df["label"].to_numpy()

# Same slice, test size and seed as the SVM experiment.
X_train, X_test, y_train, y_test = train_test_split(
    contents[:20000], labels[:20000], test_size=0.1, random_state=42)

In [51]:
# Classify each held-out post with the pipeline, one text at a time, and
# convert the class scores into a -1 / 0 / 1 label.
nlp_pred = []
for text in X_test:
    class_scores = PipelineInterface(str(text), padding=True, truncation=True)
    nlp_pred.append(convert(class_scores))


In [52]:
# Per-class precision / recall / F1 of the pipeline predictions on the held-out split.
print(classification_report(y_test, nlp_pred))

              precision    recall  f1-score   support

        -1.0       0.19      0.02      0.03       556
         0.0       0.63      0.93      0.75      1249
         1.0       0.12      0.06      0.08       195

    accuracy                           0.59      2000
   macro avg       0.31      0.33      0.29      2000
weighted avg       0.46      0.59      0.48      2000

