In [1]:
from cassandra.cluster import Cluster
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Open a connection to the Cassandra node at 127.0.0.1 and make
# 'data_stock' the session's active keyspace for the queries below.
cluster = Cluster(contact_points=['127.0.0.1'])
session = cluster.connect()
session.set_keyspace(keyspace='data_stock')

In [3]:
# Read every row of candlestick_data, parse the 'time' column as datetimes,
# and order the frame by symbol and then by time.
rows_price = session.execute("""
    SELECT * FROM candlestick_data ALLOW FILTERING
""")
df_cand = (
    pd.DataFrame(rows_price)
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values(by=['symbol', 'time'])
)

In [4]:
# Read the symbol, eps, pe, pbv and percentYield columns of the fundamentals
# table into a DataFrame and preview the first ten rows.
rows_fin2 = session.execute("""
    SELECT symbol,eps,pe,pbv,percentYield FROM financal_data_fromsettradeAPI ALLOW FILTERING
""")
df_fun = pd.DataFrame(data=rows_fin2)
df_fun.head(n=10)

Unnamed: 0,symbol,eps,pe,pbv,percentyield
0,PPPM,0.02,0.0,0.7,0.0
1,TPCH,0.24112,7.9,0.34,13.91
2,KPNREIT,,,0.23,0.0
3,POLY,0.27632,12.89,2.42,6.38
4,QHBREIT,,,0.41,0.0
5,VCOM,0.21428,7.85,1.48,10.79
6,KDH,4.12186,10.62,1.93,3.01
7,NVD,-0.00942,38.87,0.32,0.0
8,JDF,0.04224,12.19,1.35,4.85
9,SVR,-0.03402,0.0,0.34,0.0


In [5]:
# Number of missing values in each column of df_fun.
df_fun.isna().sum()

symbol           0
eps             65
pe              75
pbv             12
percentyield    11
dtype: int64

In [6]:
# Number of missing values in each column of df_cand.
df_cand.isna().sum()

symbol         0
time           0
close_price    0
high_price     0
low_price      0
open_price     0
value          0
volume         0
dtype: int64

In [8]:
# Keep the rows of df_fun that have at least one missing value and preview
# the first ten. The row mask is computed once, directly in the selection;
# a bare mask expression above it would be evaluated and thrown away because
# only a cell's last expression is displayed.
null_rows = df_fun[df_fun.isnull().any(axis=1)]
null_rows.head(10)

Unnamed: 0,symbol,eps,pe,pbv,percentyield
2,KPNREIT,,,0.23,0.0
4,QHBREIT,,,0.41,0.0
39,NWR,0.09585,,,
45,GSTEEL,-0.02979,,,
49,HYDROGEN,,,0.96,0.0
51,KKC,-0.8258,,,
76,BLISS,-0.00047,,,
109,MJLF,,,0.37,14.54
116,M-PAT,,,0.66,0.0
134,MNIT,,,0.37,6.74


In [9]:
# (rows, columns) of the fundamentals frame.
df_fun.shape

(913, 5)

In [10]:
# (rows, columns) of the candlestick frame.
df_cand.shape

(597057, 8)

In [11]:
# Model-based filling of NaN values with a KNN imputer.
import pandas as pd
from sklearn.impute import KNNImputer

# df_fun is the main DataFrame. Remove the "symbol" column so that only the
# remaining columns are handed to the imputer.
# NOTE(review): the columns go to the imputer unscaled and appear to be on
# different scales — confirm that this is intended.
df_numeric = df_fun.drop("symbol", axis=1)

# Fit the imputer (k = 5, weights="distance") and fill the gaps in one call.
imputer = KNNImputer(weights="distance", n_neighbors=5)
df_imputed_values = imputer.fit_transform(df_numeric)

# Wrap the filled values in a new DataFrame that reuses df_fun's index and
# the column labels of df_numeric.
df_imputed = pd.DataFrame(
    data=df_imputed_values,
    index=df_fun.index,
    columns=df_numeric.columns,
)

# Put "symbol" back in front of the imputed columns.
df_final = pd.concat([df_fun["symbol"], df_imputed], axis=1)

# Report the missing values left in each column, then preview the result.
print("✅ Missing values หลังเติม:")
print(df_final.isna().sum())
df_final.head()


✅ Missing values หลังเติม:
symbol          0
eps             0
pe              0
pbv             0
percentyield    0
dtype: int64


Unnamed: 0,symbol,eps,pe,pbv,percentyield
0,PPPM,0.02,0.0,0.7,0.0
1,TPCH,0.24112,7.9,0.34,13.91
2,KPNREIT,-0.01065,5.722026e-07,0.23,0.0
3,POLY,0.27632,12.89,2.42,6.38
4,QHBREIT,0.587335,2.945,0.41,0.0


In [None]:
# Alternative fill of the same columns using IterativeImputer.
# enable_iterative_imputer is imported first, ahead of IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# NOTE(review): this cell reads df_numeric from the KNN cell and rebinds
# imputer, df_imputed_values, df_imputed and df_final, which that cell also
# assigns — whichever of the two cells ran last determines df_final.
imputer = IterativeImputer(max_iter=20, random_state=42)
df_imputed_values = imputer.fit_transform(df_numeric)

# Rebuild a labelled frame on df_fun's index and re-attach the "symbol" column.
df_imputed = pd.DataFrame(
    data=df_imputed_values,
    index=df_fun.index,
    columns=df_numeric.columns,
)
df_final = pd.concat([df_fun["symbol"], df_imputed], axis=1)
