In [None]:
import pandas as pd
from sklearn.model_selection import ParameterGrid
from neuralprophet import NeuralProphet, set_log_level, set_random_seed

from src.model.features import add_stock_price_feature

# Disable logging messages unless there is an error
set_log_level("ERROR")

# Load the 2330 stock CSV, parsing the `ds` column as datetimes, then pass it
# through the project feature helper.
# NOTE(review): add_stock_price_feature is defined outside this notebook; it
# presumably adds the derived price columns seen later (e.g. high_low_diff, MA)
# -- confirm against src/model/features.
df = pd.read_csv('data/stocks/2330_stock_data_0317.csv', parse_dates=['ds'])
df = add_stock_price_feature(df)

# DataFrame.plot returns a matplotlib Axes. Bind it to `ax` rather than `plt`,
# which would shadow the conventional `matplotlib.pyplot` alias and mislead
# anyone who later imports pyplot in this notebook.
ax = df.plot(x="ds", y="y", figsize=(15, 5))
df.info()

: 

In [7]:
# # First: fill missing dates
# df_fill = df.set_index("ds").asfreq("D").reset_index()

# # Create a mask BEFORE filling
# filled_mask = df_fill.isna().any(axis=1)

# # Then forward-fill all columns
# df_fill = df_fill.fillna(method='ffill')

# # Add a column to indicate if the row was filled
# df_fill["was_filled"] = filled_mask.astype(int)  # 1 = was filled, 0 = not

# df_fill.head(10)

In [14]:
# Read the LLM factor table. The first CSV column becomes the index and is
# parsed as dates, giving a DatetimeIndex.
llm_factor = pd.read_csv(
    'data/factors/result_台積電.csv',
    index_col=0,
    parse_dates=True,
)
llm_factor.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 725 entries, 2022-11-01 to 2025-03-17
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   factor        725 non-null    float64
 1   explanation   725 non-null    object 
 2   updated_time  725 non-null    object 
 3   news_count    725 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 28.3+ KB


In [15]:
# Keep only the first row for each index value, and narrow the frame to the
# single 'factor' column (still a DataFrame, not a Series).
is_repeated_date = llm_factor.index.duplicated(keep='first')
llm_factor = llm_factor.loc[~is_repeated_date, ['factor']]
llm_factor.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 725 entries, 2022-11-01 to 2025-03-17
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   factor  725 non-null    float64
dtypes: float64(1)
memory usage: 27.5 KB


In [17]:
# Left-join the factor onto the price rows: `ds` on the left is matched against
# the factor frame's index, so every price row is kept.
df_merged = pd.merge(df, llm_factor, how='left', left_on='ds', right_index=True)

# Show the price rows that found no matching factor.
df_merged.loc[df_merged['factor'].isna()]

Unnamed: 0.1,Unnamed: 0,ds,open_price,high_price,low_price,y,volume,foreign,investment,dealer,ratio_over_400_shares,shareholders_400_to_600,shareholders_600_to_800,shareholders_800_to_1000,ratio_over_1000_shares,high_low_diff,MA,factor
157,157,2023-07-03,578.0,580.0,576.0,579.0,15118041,2359214,-305135,1401051,89.61,487,287,206,87.22,4.0,574.8,
158,158,2023-07-04,585.0,585.0,580.0,585.0,17777363,4795086,-1347467,-13098,89.61,487,287,206,87.22,5.0,577.4,
159,159,2023-07-05,589.0,589.0,579.0,582.0,15553503,-840152,-503000,-1092463,89.61,487,287,206,87.22,10.0,579.0,
160,160,2023-07-06,573.0,574.0,565.0,565.0,32069711,-16293697,-574000,-602775,89.61,487,287,206,87.22,9.0,577.4,
161,161,2023-07-07,565.0,572.0,563.0,565.0,19858943,-3669293,-185468,-100394,89.61,487,287,206,87.22,9.0,575.2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,257,2023-11-24,577.0,578.0,574.0,575.0,12503334,-1133584,69706,-117686,89.47,478,285,213,87.06,4.0,578.4,
258,258,2023-11-27,573.0,577.0,568.0,568.0,20321872,-2133745,59033,-56020,89.43,479,285,210,87.03,9.0,576.6,
259,259,2023-11-28,565.0,576.0,565.0,575.0,26932029,3322359,-98500,686784,89.43,479,285,210,87.03,11.0,574.6,
260,260,2023-11-29,578.0,579.0,570.0,574.0,27786565,366570,55000,-553460,89.43,479,285,210,87.03,9.0,574.0,


In [None]:
# Rows without a matching factor get 0 in place of NaN.
df_merged = df_merged.assign(factor=df_merged['factor'].fillna(0))
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 566 entries, 4 to 569
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0                566 non-null    int64         
 1   ds                        566 non-null    datetime64[ns]
 2   open_price                566 non-null    float64       
 3   high_price                566 non-null    float64       
 4   low_price                 566 non-null    float64       
 5   y                         566 non-null    float64       
 6   volume                    566 non-null    int64         
 7   foreign                   566 non-null    int64         
 8   investment                566 non-null    int64         
 9   dealer                    566 non-null    int64         
 10  ratio_over_400_shares     566 non-null    float64       
 11  shareholders_400_to_600   566 non-null    int64         
 12  shareholders_600_to_800   5

In [7]:
from src.model.utils import val_mape

# Hyperparameter grid. 'factor' toggles whether the 'factor' column is fed to
# the model as a future regressor. The commented-out keys are lag counts for
# optional lagged regressors.
param_grid = {
    'factor': [True]
    # 'volume': [0, 5, 10, 20],
    # 'high_low_diff': [0, 5, 10, 20],
    # 'MA': [0, 5, 10, 20]
}

results = []
# Iterate over each combination of hyperparameters
for params in ParameterGrid(param_grid):
    print([params])

    # Build a fresh model for this grid point.
    m = NeuralProphet(
        yearly_seasonality=False,
        weekly_seasonality=True,
        n_lags=3,
    )
    m = m.add_country_holidays("TW")

    # Honour the 'factor' flag instead of always adding the regressor, so a
    # grid containing False really trains a model without it. The default of
    # True keeps the previous behaviour if the key is removed from the grid.
    columns = ['ds', 'y']
    if params.get('factor', True):
        m.add_future_regressor('factor')
        columns.append('factor')

    # lag_regs = (
    #     ('volume', params['volume']),
    #     ('high_low_diff', params['high_low_diff']),
    #     ('MA', params['MA']),
    # )
    # for col, lag in lag_regs:
    #     if lag > 0:
    #         m.add_lagged_regressor(col, n_lags=lag)
    #         columns.append(col)

    # Slice the modelling columns once and reuse the frame for split and predict.
    df_model = df_merged[columns]

    df_train, df_val = m.split_df(df_model, valid_p=0.2)
    # Re-seed right before fitting so every grid point is seeded identically.
    set_random_seed(0)
    metrics = m.fit(df_train, validation_df=df_val, freq="D")

    # Create a new dataframe reaching 7 periods into the future for our forecast, n_historic_predictions also shows historic data
    # df_future = m.make_future_dataframe(df_model, n_historic_predictions=True, periods=7)

    # Predict over the full merged frame (training and validation rows).
    forecast = m.predict(df_model)

    # Validation RMSE is taken from the last row of the metrics frame.
    rmse = metrics.iloc[-1]['RMSE_val']
    # NOTE(review): the last validation row is dropped before scoring; the
    # reason is not visible here -- confirm against src.model.utils.val_mape.
    mape = val_mape(df_val[:-1], forecast)
    results.append({**params, 'RMSE': rmse, 'MAPE': mape})

[{'factor': True}]
Training: |          | 0/? [00:00<?, ?it/s]



Finding best initial lr: 100%|██████████| 222/222 [00:00<00:00, 244.97it/s]


Training: |          | 0/? [00:13<?, ?it/s, v_num=2098, MAE_val=10.90, RMSE_val=13.80, Loss_val=0.000955, RegLoss_val=0.000, train_loss=0.000401, reg_loss=0.000, MAE=6.530, RMSE=9.290, Loss=0.000396, RegLoss=0.000]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 133.66it/s]


In [11]:
# One row per evaluated grid point: the hyperparameters plus RMSE and MAPE.
results_df = pd.DataFrame(data=results)

# Persist the table, then display it as the cell output.
results_df.to_csv('reports/add_future_2330.csv')
results_df

Unnamed: 0,factor,RMSE,MAPE
0,True,13.810317,0.011872


In [12]:
# Up to ten configurations with the lowest validation RMSE.
results_df.sort_values("RMSE").iloc[:10]

Unnamed: 0,factor,RMSE,MAPE
0,True,13.810317,0.011872


In [13]:
# Up to ten configurations with the lowest validation MAPE.
results_df.sort_values("MAPE").iloc[:10]

Unnamed: 0,factor,RMSE,MAPE
0,True,13.810317,0.011872
