In [1]:
import pandas as pd
from models.scripts import utils
import optuna
from sklearn.metrics import silhouette_score
import os
import warnings
from dotenv import load_dotenv

2023-12-15 12:55:39.723552: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
warnings.filterwarnings(action="ignore", category=FutureWarning, module="sklearn.*")
warnings.filterwarnings(
    action="ignore", category=FutureWarning, module="category_encoders.*"
)

In [3]:
# Read in the CSV data
load_dotenv()
APPLE_PATH = str(os.getenv("APPLE_PATH"))

df = pd.read_csv(APPLE_PATH)

In [4]:
df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
df = df.sort_values(by="Date")

# Extract the year, month, and day as separate columns
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df = df.drop(columns=["Adjusted Close"])

In [5]:
df = utils.add_seasonality(df)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10590 entries, 0 to 10589
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    10590 non-null  datetime64[ns]
 1   Low                     10590 non-null  float64       
 2   Open                    10590 non-null  float64       
 3   Volume                  10590 non-null  int64         
 4   High                    10590 non-null  float64       
 5   Close                   10590 non-null  float64       
 6   Year                    10590 non-null  int32         
 7   Month                   10590 non-null  int32         
 8   Day                     10590 non-null  int32         
 9   Month_Category_Bearish  10590 non-null  float64       
 10  Month_Category_Bullish  10590 non-null  float64       
 11  Month_Category_Normal   10590 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int32(3), in

In [7]:
# Split the data into training and testing sets
train_size = int(len(df) * 0.8)
val_size = int(len(df) * 0.1)
train = df.iloc[:train_size, :]
val = df.iloc[train_size : train_size + val_size, :]
test = df.iloc[train_size + val_size :, :]

In [8]:
train = utils.trigonometric_date_encoding(train)
val = utils.trigonometric_date_encoding(val)
test = utils.trigonometric_date_encoding(test)
train = train.drop(["Date"], axis=1)
val = val.drop(["Date"], axis=1)
test = test.drop(["Date"], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = pd.to_datetime(df[column], format="%d-%m-%Y")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = pd.to_datetime(df[column], format="%d-%m-%Y")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = pd.to_datetime(df[column], format="%d-%m-%Y")


In [9]:
train.head()

Unnamed: 0,Low,Open,Volume,High,Close,Year,Month,Day,Month_Category_Bearish,Month_Category_Bullish,Month_Category_Normal,sin_date,cos_date
0,0.128348,0.128348,469033600,0.128906,0.128348,1980,12,12,0.0,0.0,1.0,-0.433884,-0.900969
1,0.121652,0.12221,175884800,0.12221,0.121652,1980,12,15,0.0,0.0,1.0,0.0,1.0
2,0.112723,0.113281,105728000,0.113281,0.112723,1980,12,16,0.0,0.0,1.0,0.781831,0.62349
3,0.115513,0.115513,86441600,0.116071,0.115513,1980,12,17,0.0,0.0,1.0,0.974928,-0.222521
4,0.118862,0.118862,73449600,0.11942,0.118862,1980,12,18,0.0,0.0,1.0,0.433884,-0.900969


In [10]:
excluded_columns = ["sin_date", "cos_date", "Month_Category_Bearish", "Month_Category_Bullish", "Month_Category_Normal"]
features_to_standardize = [column for column in train.columns if column not in excluded_columns]


# Calculate the mean and standard deviation only for the required columns
train_mean = train[features_to_standardize].mean()
train_std = train[features_to_standardize].std()

# Standardize only the required columns in the train, validation, and test sets
train[features_to_standardize] = (train[features_to_standardize] - train_mean) / train_std
val[features_to_standardize] = (val[features_to_standardize] - train_mean) / train_std
test[features_to_standardize] = (test[features_to_standardize] - train_mean) / train_std

In [11]:
train

Unnamed: 0,Low,Open,Volume,High,Close,Year,Month,Day,Month_Category_Bearish,Month_Category_Bullish,Month_Category_Normal,sin_date,cos_date
0,-0.504567,-0.505994,0.258566,-0.507303,-0.506040,-1.775713,1.601112,-0.427309,0.0,0.0,1.0,-0.433884,-0.900969
1,-0.505802,-0.507115,-0.554355,-0.508515,-0.507263,-1.775713,1.601112,-0.084198,0.0,0.0,1.0,0.000000,1.000000
2,-0.507450,-0.508745,-0.748904,-0.510132,-0.508895,-1.775713,1.601112,0.030172,0.0,0.0,1.0,0.781831,0.623490
3,-0.506935,-0.508338,-0.802387,-0.509627,-0.508385,-1.775713,1.601112,0.144542,0.0,0.0,1.0,0.974928,-0.222521
4,-0.506317,-0.507726,-0.838414,-0.509020,-0.507773,-1.775713,1.601112,0.258913,0.0,0.0,1.0,0.433884,-0.900969
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8467,3.785582,3.750854,-0.601887,3.793458,3.812037,1.729400,0.145368,-0.656050,0.0,0.0,1.0,0.433884,-0.900969
8468,3.847393,3.823897,-0.664758,3.808845,3.820260,1.729400,0.145368,-0.541680,0.0,0.0,1.0,-0.433884,-0.900969
8469,3.883833,3.846723,-0.567235,3.854099,3.876447,1.729400,0.145368,-0.198569,0.0,0.0,1.0,0.000000,1.000000
8470,3.855234,3.889635,-0.537642,3.852289,3.824828,1.729400,0.145368,-0.084198,0.0,0.0,1.0,0.781831,0.623490


In [14]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import optuna
import numpy as np

def objective_DBSCAN(trial: optuna.Trial, df_fit: pd.DataFrame) -> float:
    CPU_COUNT = os.cpu_count() - 4
    if CPU_COUNT is None:
        CPU_COUNT = 8

   # DBSCAN parameters
    eps = trial.suggest_float("eps", 0.05, 2.0)  # Expanded range
    min_samples = trial.suggest_int("min_samples", 10, 150)  # Expanded range
    metric = trial.suggest_categorical("metric", ["euclidean", "manhattan", "chebyshev"])
    algorithm = trial.suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree"])

    # Only relevant for 'ball_tree' or 'kd_tree'
    if algorithm in ['ball_tree', 'kd_tree']:
        leaf_size = trial.suggest_int("leaf_size", 10, 100)
    else:
        leaf_size = 30  # Default value
    

    model = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        algorithm=algorithm,
        leaf_size=leaf_size,
        n_jobs=CPU_COUNT,
)

    # Fit the model and predict clusters
    clusters = model.fit_predict(df_fit)

    # Parameters for penalty
    base_penalty = 0.05  # Base penalty value

    # Filter out noise points (label -1) before calculating the silhouette score
    valid_clusters = clusters != -1
    num_noise_points = np.sum(~valid_clusters)  # Count the number of noise points

    if len(set(clusters[valid_clusters])) > 5:
        sil_score = silhouette_score(df_fit[valid_clusters], clusters[valid_clusters])
        
        # Apply logarithmic penalty for noise points
        if num_noise_points > 0:
            penalty = base_penalty * np.log1p(num_noise_points)
            sil_score -= penalty
    else:
        sil_score = float('-inf')  # Assign a low score if only one cluster or all are noise



    return sil_score


In [15]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import optuna
import numpy as np
import os

def objective_KMeans(trial: optuna.Trial, df_fit: pd.DataFrame) -> float:
    # KMeans parameters
    n_clusters = trial.suggest_int("n_clusters", 2, 10)  # Range for number of clusters
    init = trial.suggest_categorical("init", ["k-means++", "random"])
    n_init = trial.suggest_int("n_init", 10, 20)  # Number of time the k-means algorithm will be run with different centroid seeds

    model = KMeans(
        n_clusters=n_clusters,
        init=init,
        n_init=n_init,
        random_state=trial.suggest_int("random_state", 1, 42),  # For reproducibility
    )

    # Fit the model and predict clusters
    clusters = model.fit_predict(df_fit)

    # Silhouette score calculation
    if len(set(clusters)) > 1:  # Check if there is more than one cluster
        sil_score = silhouette_score(df_fit, clusters)
    else:
        sil_score = float('-inf')  # Assign a low score if only one cluster

    return sil_score


In [17]:
from datetime import datetime
# Create an Optuna study
study_DBSCAN = optuna.create_study(
    direction="maximize",  
    study_name="DBSCAN_optimization_" + datetime.now().strftime("%Y%m%d%H%M%S"),
    load_if_exists=True,
)

# Start the optimization
study_DBSCAN.optimize(
    lambda trial: objective_DBSCAN(trial, train),
    n_trials=200,
    show_progress_bar=False,
)

[I 2023-12-15 12:59:29,256] A new study created in memory with name: DBSCAN_optimization_20231215125929


[I 2023-12-15 12:59:29,396] Trial 0 finished with value: -inf and parameters: {'eps': 0.26631503972754184, 'min_samples': 148, 'metric': 'euclidean', 'algorithm': 'ball_tree', 'leaf_size': 14}. Best is trial 0 with value: -inf.
[I 2023-12-15 12:59:29,516] Trial 1 finished with value: -inf and parameters: {'eps': 0.6442417134350586, 'min_samples': 82, 'metric': 'chebyshev', 'algorithm': 'auto'}. Best is trial 0 with value: -inf.
[I 2023-12-15 12:59:29,858] Trial 2 finished with value: -0.28768974398938013 and parameters: {'eps': 0.7254656063553245, 'min_samples': 31, 'metric': 'euclidean', 'algorithm': 'auto'}. Best is trial 2 with value: -0.28768974398938013.
[I 2023-12-15 12:59:30,080] Trial 3 finished with value: -inf and parameters: {'eps': 1.131391593163572, 'min_samples': 90, 'metric': 'euclidean', 'algorithm': 'auto'}. Best is trial 2 with value: -0.28768974398938013.
[I 2023-12-15 12:59:31,358] Trial 4 finished with value: -inf and parameters: {'eps': 1.5949908498985876, 'min_sa

In [18]:
from datetime import datetime
# Create an Optuna study
study_KMeans = optuna.create_study(
    direction="maximize",  
    study_name="Kmeans_optimization_" + datetime.now().strftime("%Y%m%d%H%M%S"),
    load_if_exists=True,
)

# Start the optimization
study_KMeans.optimize(
    lambda trial: objective_KMeans(trial, train),
    n_trials=200,
    show_progress_bar=False,
)

[I 2023-12-15 13:00:58,078] A new study created in memory with name: Kmeans_optimization_20231215130058
[I 2023-12-15 13:01:05,314] Trial 0 finished with value: 0.1864085589688209 and parameters: {'n_clusters': 7, 'init': 'random', 'n_init': 20, 'random_state': 5}. Best is trial 0 with value: 0.1864085589688209.
[I 2023-12-15 13:01:06,753] Trial 1 finished with value: 0.28907700689033367 and parameters: {'n_clusters': 3, 'init': 'random', 'n_init': 12, 'random_state': 17}. Best is trial 1 with value: 0.28907700689033367.
[I 2023-12-15 13:01:08,139] Trial 2 finished with value: 0.18416742624908436 and parameters: {'n_clusters': 10, 'init': 'random', 'n_init': 16, 'random_state': 29}. Best is trial 1 with value: 0.28907700689033367.
[I 2023-12-15 13:01:09,421] Trial 3 finished with value: 0.18641017461664933 and parameters: {'n_clusters': 7, 'init': 'k-means++', 'n_init': 19, 'random_state': 37}. Best is trial 1 with value: 0.28907700689033367.
[I 2023-12-15 13:01:10,760] Trial 4 finishe

In [20]:
print("Best trial:", study_DBSCAN.best_trial)
print("Best trial:", study_KMeans.best_trial)

Best trial: FrozenTrial(number=11, state=TrialState.COMPLETE, values=[0.007242899178280582], datetime_start=datetime.datetime(2023, 12, 15, 12, 59, 34, 490659), datetime_complete=datetime.datetime(2023, 12, 15, 12, 59, 34, 605083), params={'eps': 0.7361466164185523, 'min_samples': 10, 'metric': 'manhattan', 'algorithm': 'auto'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'eps': FloatDistribution(high=2.0, log=False, low=0.05, step=None), 'min_samples': IntDistribution(high=150, log=False, low=10, step=1), 'metric': CategoricalDistribution(choices=('euclidean', 'manhattan', 'chebyshev')), 'algorithm': CategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree'))}, trial_id=11, value=None)
Best trial: FrozenTrial(number=8, state=TrialState.COMPLETE, values=[0.4944635004085972], datetime_start=datetime.datetime(2023, 12, 15, 13, 1, 14, 291378), datetime_complete=datetime.datetime(2023, 12, 15, 13, 1, 15, 499320), params={'n_clusters': 2, 'init': 'random', '

In [21]:
model = KMeans(
    n_clusters=study_KMeans.best_trial.params["n_clusters"],
    init=study_KMeans.best_trial.params["init"],
    n_init=study_KMeans.best_trial.params["n_init"],
    random_state=study_KMeans.best_trial.params["random_state"],
)
clusters = model.fit_predict(train)

In [22]:
#print for each cluster how many elements are in it
print("Cluster analysis:")
print(pd.Series(clusters).value_counts())

Cluster analysis:
0    7371
1    1101
Name: count, dtype: int64


In [23]:
clusters = model.predict(val)

In [24]:
#print for each cluster how many elements are in it
print("Cluster analysis:")
print(pd.Series(clusters).value_counts())

Cluster analysis:
1    1059
Name: count, dtype: int64


In [25]:
clusters = model.predict(test)

In [26]:
#print for each cluster how many elements are in it
print("Cluster analysis:")
print(pd.Series(clusters).value_counts())

Cluster analysis:
1    1059
Name: count, dtype: int64
