## Load the libraries

In [42]:
import pandas as pd
import pandas_ta as ta
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline 
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

<h2 id="load_dataset">Load the data</h2>

In [22]:
#Here we will load from github or do API or something

## Load Data From CSV File


<span style="color:red;">Welcome to Alpha Vantage! Here is your API key: 9NNSMISKUYVUTO0M. Please record this API key at a safe place for future data access.</span>
from tvDatafeed import TvDatafeed, Interval

username = 'pkongdan01'
password = 'Secu@3545'

tv = TvDatafeed(username, password)

In [23]:
# load the data
xau23 = pd.read_csv("./data/xau23.csv", header=None)
xau22 = pd.read_csv("./data/xau22.csv", header=None)
xau21 = pd.read_csv("./data/xau21.csv", header=None)

# create a columns name
cname= ["date","sec","open", "high", "low", "close","volume"]
xau23.columns = cname
xau22.columns = cname
xau21.columns = cname

# union the data set 
df_import = pd.concat([xau23,xau22,xau21])
print(df_import.head())

# Check for N/A
print(df_import.isna().sum())
# Check for types
print(df_import.dtypes)


         date    sec      open      high       low     close  volume
0  2023.01.02  18:00  1826.837  1827.337  1826.617  1826.637       0
1  2023.01.02  18:01  1826.537  1827.357  1826.137  1826.537       0
2  2023.01.02  18:02  1826.137  1826.737  1826.137  1826.737       0
3  2023.01.02  18:05  1827.187  1828.867  1827.187  1828.738       0
4  2023.01.02  18:06  1828.758  1829.958  1828.758  1829.497       0
date      0
sec       0
open      0
high      0
low       0
close     0
volume    0
dtype: int64
date       object
sec        object
open      float64
high      float64
low       float64
close     float64
volume      int64
dtype: object


## Data Processing

#### Adding datetime format

In [24]:
# combine the date and time columns into a single string
df_import['datetime'] = df_import['date'] + df_import['sec']

# convert the combined string to a Pandas datetime object
df_import['datetime'] = pd.to_datetime(df_import['datetime'], format = '%Y.%m.%d%H:%M')

# reselect the data
df_import = df_import[['datetime','open','high','low','close']]

# sort the data ascending
df_import = df_import.sort_values(by='datetime',ascending=True)

print(df_import.dtypes)

# set the datetime column as the index
df_import.set_index('datetime', inplace=True)

# resample data into 1hour interval
df_import = df_import.resample('1H').agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'})
print(df_import.isnull().sum())

# drop null value (Market Close on those days)
df_import = df_import.dropna()

datetime    datetime64[ns]
open               float64
high               float64
low                float64
close              float64
dtype: object
open     6170
high     6170
low      6170
close    6170
dtype: int64


#### Adding features (lagged candles)

In [25]:
num_lags = 5
for col in df_import.columns:
    for i in range(1, num_lags+1):
        col_name = col + '_lag' + str(i)
        df_import[col_name] = df_import[col].shift(i)
df_import = df_import.dropna()
df_import

Unnamed: 0_level_0,open,high,low,close,open_lag1,open_lag2,open_lag3,open_lag4,open_lag5,high_lag1,...,low_lag1,low_lag2,low_lag3,low_lag4,low_lag5,close_lag1,close_lag2,close_lag3,close_lag4,close_lag5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 23:00:00,1923.119,1923.634,1920.108,1922.605,1921.695,1916.558,1913.278,1913.025,1904.998,1925.145,...,1920.675,1915.394,1912.124,1909.858,1903.288,1923.120,1921.675,1916.524,1913.278,1913.035
2021-01-04 00:00:00,1922.595,1922.925,1918.984,1921.548,1923.119,1921.695,1916.558,1913.278,1913.025,1923.634,...,1920.108,1920.675,1915.394,1912.124,1909.858,1922.605,1923.120,1921.675,1916.524,1913.278
2021-01-04 01:00:00,1921.648,1925.475,1921.345,1922.835,1922.595,1923.119,1921.695,1916.558,1913.278,1922.925,...,1918.984,1920.108,1920.675,1915.394,1912.124,1921.548,1922.605,1923.120,1921.675,1916.524
2021-01-04 02:00:00,1922.828,1925.428,1922.318,1924.668,1921.648,1922.595,1923.119,1921.695,1916.558,1925.475,...,1921.345,1918.984,1920.108,1920.675,1915.394,1922.835,1921.548,1922.605,1923.120,1921.675
2021-01-04 03:00:00,1924.678,1935.085,1924.468,1932.038,1922.828,1921.648,1922.595,1923.119,1921.695,1925.428,...,1922.318,1921.345,1918.984,1920.108,1920.675,1924.668,1922.835,1921.548,1922.605,1923.120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-28 19:00:00,1825.374,1825.565,1822.938,1823.355,1826.104,1826.375,1827.978,1826.518,1818.608,1827.218,...,1825.265,1825.316,1827.848,1826.255,1818.608,1825.395,1826.648,1828.915,1829.168,1823.975
2023-02-28 20:00:00,1823.365,1829.425,1823.105,1829.005,1825.374,1826.104,1826.375,1827.978,1826.518,1825.565,...,1822.938,1825.265,1825.316,1827.848,1826.255,1823.355,1825.395,1826.648,1828.915,1829.168
2023-02-28 21:00:00,1829.045,1829.195,1827.305,1827.635,1823.365,1825.374,1826.104,1826.375,1827.978,1829.425,...,1823.105,1822.938,1825.265,1825.316,1827.848,1829.005,1823.355,1825.395,1826.648,1828.915
2023-02-28 22:00:00,1827.625,1831.148,1827.405,1830.895,1829.045,1823.365,1825.374,1826.104,1826.375,1829.195,...,1827.305,1823.105,1822.938,1825.265,1825.316,1827.635,1829.005,1823.355,1825.395,1826.648


#### Adding features (indicators)

In [26]:
df = df_import.copy()
# print(xau_data.isnull().sum())
# df['RSI'] = ta.rsi(df.close, length=15)
df['EMAF'] = ta.ema(df.close, length = 59)
# df['EMAM'] = ta.ema(df.close, length = 100)
# df['EMAS'] = ta.ema(df.close, length = 150)
k=5
stoch = ta.stoch(high=df['high'], low=df['low'], close=df['close'], k=k, d=3, smooth_k=3)
df['STO_K'] = stoch[f'STOCHk_{k}_3_3']
#df['STO_D'] = stoch[f'STOCHd_{k}_3_3']

df = df.dropna()
print(df.head())

df.to_csv("./csv/indicator_check.csv")

                         open      high       low     close  open_lag1  \
datetime                                                                 
2021-01-06 11:00:00  1909.649  1909.848  1900.755  1907.965   1932.345   
2021-01-06 12:00:00  1907.978  1908.028  1902.448  1907.478   1909.649   
2021-01-06 13:00:00  1907.478  1911.348  1905.968  1911.248   1907.978   
2021-01-06 14:00:00  1911.158  1921.208  1908.488  1918.365   1907.478   
2021-01-06 15:00:00  1918.305  1924.175  1915.208  1918.088   1911.158   

                     open_lag2  open_lag3  open_lag4  open_lag5  high_lag1  \
datetime                                                                     
2021-01-06 11:00:00   1932.825   1933.809   1945.168   1956.595   1933.658   
2021-01-06 12:00:00   1932.345   1932.825   1933.809   1945.168   1909.848   
2021-01-06 13:00:00   1909.649   1932.345   1932.825   1933.809   1908.028   
2021-01-06 14:00:00   1907.978   1909.649   1932.345   1932.825   1911.348   
2021-01-06 15

#### Adding Condition for y

In [27]:
highs = df['high'].rolling(window=4).max().shift(-4)
lows = df['low'].rolling(window=4).max().shift(-4)
# create new columns for conditions 
df['high_close_diff'] = highs - df['close']
df['low_close_diff'] = lows - df['close']

def reco(row):
    if row.low_close_diff <= -3.3:
        return 0
    elif row.high_close_diff >= 4.3: #I use 3 plus .3 for bit offer off-set, and 1 in case of delay ordering cause price to change.
        return 1
    else:
        return 0

df['target'] = df.apply(reco,axis=1)
# df["mhigh"]=highs
# df["mlow"]=lows
df = df.dropna()

df_simu = df
print('1', (df['target'] == 1).sum())
print('0', (df['target'] == 0).sum())
print(f"check null value {df.isnull().sum()}")

1 5019
0 7614
check null value open               0
high               0
low                0
close              0
open_lag1          0
open_lag2          0
open_lag3          0
open_lag4          0
open_lag5          0
high_lag1          0
high_lag2          0
high_lag3          0
high_lag4          0
high_lag5          0
low_lag1           0
low_lag2           0
low_lag3           0
low_lag4           0
low_lag5           0
close_lag1         0
close_lag2         0
close_lag3         0
close_lag4         0
close_lag5         0
EMAF               0
STO_K              0
high_close_diff    0
low_close_diff     0
target             0
dtype: int64


#### Making Training, Cross Validation, and Testing sets

In [28]:
n = len(df)
print(n)

train_n = int(round(n*0.70,0))
cross_n = int(round(n*0.15,0))
test_n = int(round(n*0.15,0))


train_start = str(df.iloc[0].name)
train_end = str(df.iloc[train_n].name)
print('Training set start: {0} \n Training set end: {1}'.format(train_start,train_end))
#print(df.loc[train_start:train_end])

cross_start = str(df.iloc[train_n+1].name)
cross_end = str(df.iloc[train_n+cross_n].name)
print('cross validation set start: {0} \n cross validation set end: {1}'.format(cross_start,cross_end))
#print(df.loc[cross_start:cross_end])

test_start = str(df.iloc[train_n + cross_n +1].name)
test_end = str(df.iloc[n-1].name)
print('test set start: {0} \n test set end: {1}'.format(test_start,test_end))
#print(df.loc[test_start:])

X = df.iloc[:,:len(df.columns)-3]

y = df['target']


X_train = np.asarray(X.loc[train_start:train_end])
y_train = np.asarray(y.loc[train_start:train_end])

X_cross = np.asarray(X.loc[cross_start:cross_end])
y_cross = np.asarray(y.loc[cross_start:cross_end])

X_test = np.asarray(X.loc[test_start:])
y_test =np.asarray(y.loc[test_start:])



#Extra for GridSearch
X_train_cv = np.asarray(X.loc[train_start:cross_end])
y_train_cv = np.asarray(y.loc[train_start:cross_end])


print("Shape of X_train:", np.shape(X_train))
print("Shape of y_train:", np.shape(y_train))
print("Shape of X_cross:", np.shape(X_cross))
print("Shape of y_cross:", np.shape(y_cross))
print("Shape of X_test:", np.shape(X_test))
print("Shape of y_test:", np.shape(y_test))


# Below is for later simulation
df_simu_test = df_simu.loc[test_start:]

12633
Training set start: 2021-01-06 11:00:00 
 Training set end: 2022-07-06 09:00:00
cross validation set start: 2022-07-06 10:00:00 
 cross validation set end: 2022-10-30 20:00:00
test set start: 2022-10-30 21:00:00 
 test set end: 2023-02-28 19:00:00
Shape of X_train: (8844, 26)
Shape of y_train: (8844,)
Shape of X_cross: (1895, 26)
Shape of y_cross: (1895,)
Shape of X_test: (1894, 26)
Shape of y_test: (1894,)


In [29]:
X

Unnamed: 0_level_0,open,high,low,close,open_lag1,open_lag2,open_lag3,open_lag4,open_lag5,high_lag1,...,low_lag3,low_lag4,low_lag5,close_lag1,close_lag2,close_lag3,close_lag4,close_lag5,EMAF,STO_K
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-06 11:00:00,1909.649,1909.848,1900.755,1907.965,1932.345,1932.825,1933.809,1945.168,1956.595,1933.658,...,1930.588,1924.989,1945.168,1909.728,1932.395,1932.858,1933.849,1945.168,1940.718746,15.858660
2021-01-06 12:00:00,1907.978,1908.028,1902.448,1907.478,1909.649,1932.345,1932.825,1933.809,1945.168,1909.848,...,1927.288,1930.588,1924.989,1907.965,1909.728,1932.395,1932.858,1933.849,1939.610721,13.895802
2021-01-06 13:00:00,1907.478,1911.348,1905.968,1911.248,1907.978,1909.649,1932.345,1932.825,1933.809,1908.028,...,1903.895,1927.288,1930.588,1907.478,1907.965,1909.728,1932.395,1932.858,1938.665297,19.664488
2021-01-06 14:00:00,1911.158,1921.208,1908.488,1918.365,1907.478,1907.978,1909.649,1932.345,1932.825,1911.348,...,1900.755,1903.895,1927.288,1911.248,1907.478,1907.965,1909.728,1932.395,1937.988620,32.465316
2021-01-06 15:00:00,1918.305,1924.175,1915.208,1918.088,1911.158,1907.478,1907.978,1909.649,1932.345,1921.208,...,1902.448,1900.755,1903.895,1918.365,1911.248,1907.478,1907.965,1909.728,1937.325266,51.887239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-28 12:00:00,1826.518,1831.045,1826.255,1829.168,1818.608,1809.258,1808.915,1810.575,1810.425,1826.645,...,1807.745,1809.414,1804.594,1823.975,1813.804,1808.975,1810.235,1810.565,1818.791606,87.624677
2023-02-28 14:00:00,1827.978,1830.218,1827.848,1828.915,1826.518,1818.608,1809.258,1808.915,1810.575,1831.045,...,1807.108,1807.745,1809.414,1829.168,1823.975,1813.804,1808.975,1810.235,1819.129052,90.383977
2023-02-28 16:00:00,1826.375,1827.345,1825.316,1826.648,1827.978,1826.518,1818.608,1809.258,1808.915,1830.218,...,1818.608,1807.108,1807.745,1828.915,1829.168,1823.975,1813.804,1808.975,1819.379684,88.297058
2023-02-28 18:00:00,1826.104,1827.218,1825.265,1825.395,1826.375,1827.978,1826.518,1818.608,1809.258,1827.345,...,1826.255,1818.608,1807.108,1826.648,1828.915,1829.168,1823.975,1813.804,1819.580194,75.767876


####เอาไว้เรียนรู้ทำความเข้าใจ ตรงนี้สำคัญ ยากและลืมง่าย

#Create a sample dataframe
df = pd.DataFrame({'value': [2, 4, 5, 7, 6, 8, 9, 10, 11, 12, 9, 8, 6, 4, 3]})

#Apply rolling window with current row included and apply max function
window_period = 4
df['rolling_min'] = df['value'].rolling(window=4).min().shift(-3)
df['rolling_max'] = df['value'].rolling(window=4).max().shift(-3)
df


<details>
<summary>Manually Normaliz </summary>
```python
#Create Normalized Data
def normalize_data(X):
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    X_norm = (X - means) / stds
    return X_norm, means, stds

#This can run multiple times
X_train_n, X_train_means, X_train_stds = normalize_data(X_train)

def normalize_data_new(X,means,stds):
    X_norm = (X - means) / stds
    return X_norm

X_cross_n = normalize_data_new(X_cross,X_train_means,X_train_stds)
X_test_n = normalize_data_new(X_test,X_train_means,X_train_stds)
```
</details>

In [30]:
#Create Normalized Data
def normalize_data(X):
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    X_norm = (X - means) / stds
    return X_norm, means, stds

#This can run multiple times
X_train_n, X_train_means, X_train_stds = normalize_data(X_train)

def normalize_data_new(X,means,stds):
    X_norm = (X - means) / stds
    return X_norm

X_cross_n = normalize_data_new(X_cross,X_train_means,X_train_stds)
X_test_n = normalize_data_new(X_test,X_train_means,X_train_stds)

##### Export Scaler

In [31]:
from joblib import dump
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(sum(X_test_n == X_test_scaled)) # sum if true

dump(scaler, 'scaler_long.pkl')

[1894 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894
 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894 1894]


['scaler_long.pkl']

##### Test import scaler

In [32]:
from joblib import load
scaler2 = load('scaler_long.pkl')
X_test_scaled2 = scaler2.transform(X_test)
sum((X_test_n == X_test_scaled2) & (X_test_scaled == X_test_scaled2))

array([1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894,
       1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894, 1894,
       1894, 1894, 1894, 1894])

##### Final verification

In [33]:
mean = scaler.mean_
std = scaler.scale_
mean2 = scaler2.mean_
std2 = scaler2.scale_
print('mean scaler one',mean)
print('mean scaler two',mean2)
print('mean manual nor',X_train_means)

print('std scaler one',std)
print('std scaler two',std2)
print('std manual nor',X_train_stds)

print(X_train_scaled[0])

mean scaler one [1823.14996009 1825.42356581 1820.74145828 1823.12533786 1823.16886906
 1823.18806479 1823.20759159 1823.22809181 1823.24962743 1825.44232203
 1825.4615709  1825.48193725 1825.50277408 1825.52428471 1820.7582595
 1820.77723847 1820.7964922  1820.81515513 1820.83565604 1823.14222886
 1823.16112653 1823.18031739 1823.19984193 1823.22034102 1823.6406115
   51.27082431]
mean scaler two [1823.14996009 1825.42356581 1820.74145828 1823.12533786 1823.16886906
 1823.18806479 1823.20759159 1823.22809181 1823.24962743 1825.44232203
 1825.4615709  1825.48193725 1825.50277408 1825.52428471 1820.7582595
 1820.77723847 1820.7964922  1820.81515513 1820.83565604 1823.14222886
 1823.16112653 1823.18031739 1823.19984193 1823.22034102 1823.6406115
   51.27082431]
mean manual nor [1823.14996009 1825.42356581 1820.74145828 1823.12533786 1823.16886906
 1823.18806479 1823.20759159 1823.22809181 1823.24962743 1825.44232203
 1825.4615709  1825.48193725 1825.50277408 1825.52428471 1820.7582595
 1

<h2 id="modeling">Modeling (SVM with Scikit-learn)</h2>


In the context of Support Vector Machines (SVM), "C" is a hyperparameter that controls the trade-off between maximizing the margin and minimizing the classification error on the training data.

The hyperparameter "C" in SVM controls the misclassification penalty. A smaller value of "C" allows more misclassifications on the training data, while a larger value of "C" penalizes misclassifications more heavily. In other words, a smaller value of "C" creates a wider margin, allowing more data points to fall within the margin, but may result in lower accuracy on the training data. Conversely, a larger value of "C" creates a narrower margin, reducing the number of misclassifications but may lead to overfitting.

<span style="color:red">
Large C >>> Larger penalizes >>> complex model >>> lead to overfittting <br>
Small C >>> Smaller penalizes >>> simple model >>> underfitting
</span>

## MODEL 1: LSVM

In [34]:
print((y_cross == 0).sum() /((y_cross == 1).sum() + (y_cross == 0).sum() ))
print((y_cross == 1).sum())
print((y_cross == 0).sum())

0.6052770448548813
748
1147


In [43]:
# Define a list of hyperparameters to loop through
#################larger value of C imposes less regularization on the model
Cs = [0.001,0.01, 0.1,0.5, 1, 1.5, 2, 10,100]
# Cs = [0.001,0.01,0.5,1,10]

# Initialize variables to keep track of the best hyperparameters and their corresponding score
best_score = 0
best_C = None
max_iter = 20000
features_X_train = X_train_n
target_y_train = y_train
features_X_cross = X_cross_n
target_y_cross = y_cross
# Loop through the hyperparameters
for C in Cs:
    lsvm = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, C=C, max_iter=max_iter)
    lsvm.fit(features_X_train, y_train)
    y_train_pred = lsvm.predict(features_X_train)
    tn, fp, fn, tp = confusion_matrix(target_y_train, y_train_pred).ravel()
    train_acc = tp / (fp + tp)
    train_score = lsvm.score(features_X_train, target_y_train)
    y_cross_pred = lsvm.predict(features_X_cross)
    tn, fp, fn, tp = confusion_matrix(target_y_cross, y_cross_pred).ravel()
    cross_acc = tp / (fp + tp)
    cross_score = lsvm.score(features_X_cross, target_y_cross)
    print(f"C={C}, Train acc={train_acc}, cross acc={cross_acc}")
    print(f"C={C}, Train score={train_score}, cross score={cross_score}")
    
    # Update the best hyperparameters and their corresponding score if applicable
    if (train_acc + cross_acc) > best_score:
        best_score = train_acc + cross_acc
        best_C = C

print("The best 'C' is:", best_C)
# Create the LinearSVC model with normalization and the best value of C
lsvm = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, C=best_C, max_iter=max_iter)

# Fit the model to the training data
lsvm.fit(features_X_train, target_y_train)


#EVALUATION
yhat_m1 = lsvm.predict(features_X_cross)
accuracy_m1 = round(accuracy_score(target_y_cross,yhat_m1),2)
print('Accuracy_score: ', accuracy_m1)
f1 = round(f1_score(target_y_cross, yhat_m1, average='weighted'), 2)
print('weight avg', f1)

print(metrics.classification_report(target_y_cross, yhat_m1))

tn, fp, fn, tp = confusion_matrix(target_y_cross, yhat_m1).ravel()
tpr = (tp / (tp + fp)) * 100
print("Accuracy for predicting class 1: {:.2f}%".format(tpr))

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay.from_estimator(lsvm, features_X_cross, target_y_cross)


TypeError: __init__() got an unexpected keyword argument 'n_jobs'

<details>
<summary> XGBoost </summary>

```python
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint # this will indirectly use in TimeSeriesSplit

# Define the hyperparameter space
param_dist = {'n_estimators': [100, 200, 300],
              'max_depth': [3, 6, 9, 18, 36],
              'gamma': [0.01, 0.1],
              'learning_rate': [0.001, 0.01, 0.1, 1],
              'reg_lambda': [0.1, 1, 10],
              'reg_alpha': [0.1, 1, 10]}

def count_hyperparam_combinations(param_dict):
    total_combinations = 1
    for key in param_dict:
        total_combinations *= len(param_dict[key])
    return total_combinations

possible_combination = count_hyperparam_combinations(param_dist)
print("possible combination:", possible_combination)


# Create a time series split object
tscv = TimeSeriesSplit(n_splits=5)

# Scale the training data
scaler = StandardScaler()
features_X_train = scaler.fit_transform(X_train)
target_y_train = y_train



# Create a random search object
model_xgb = xgb.XGBRegressor(seed=42) # set seed to 42 to fix how model randomize sets of cv, this is for reproducibility.
random_search = RandomizedSearchCV(model_xgb, param_distributions=param_dist, cv=tscv, n_iter=possible_combination-(possible_combination*0.8), verbose=1, n_jobs=-1)

# Fit the random search object to the data
random_search.fit(features_X_train, target_y_train)

# Print the best hyperparameters found by the random search
print(random_search.best_params_)

# Create the final model using the best hyperparameters
model_xgb = xgb.XGBRegressor(seed=42, **random_search.best_params_)


# Fit the final model to the training data
model_xgb.fit(features_X_train, y_train)


##########Evaluation
# Normalize the test data using the same scaler used for training data
features_X_cross = scaler.transform(X_cross)
target_y_cross = y_cross

# Make predictions using the XGBoost model
yhat_m2_pred = model_xgb.predict(features_X_cross)

# Convert the predicted values to binary labels (0 or 1) using a threshold of 0.5
yhat_m2 = (yhat_m2_pred > 0.5).astype(int)

# Evaluate the model performance on the test set
accuracy = accuracy_score(target_y_cross, yhat_m2)
f1 = f1_score(target_y_cross, yhat_m2, average='weighted')
jaccard = jaccard_score(target_y_cross, yhat_m2)

# Print the evaluation metrics
print("Accuracy score:", accuracy)
print("F1 score:", f1)
print("Jaccard score:", jaccard)



# Make predictions using the XGBoost model
yhat_m2_pred = model_xgb.predict(features_X_cross)

# Convert the predicted values to binary labels (0 or 1) using a threshold of 0.5
yhat_m2 = (yhat_m2_pred > 0.5).astype(int)

#report
print(metrics.classification_report(target_y_cross, yhat_m2))

# Plot the confusion matrix
cm = confusion_matrix(target_y_cross, yhat_m2)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(target_y_cross))
disp.plot()

tn, fp, fn, tp = confusion_matrix(target_y_cross, yhat_m2).ravel()
tpr = (tp / (tp + fp)) * 100
print("Accuracy for predicting class 1: {:.2f}%".format(tpr))

```

</details>


# Logistic

In [None]:
# Mad man method loop through everything
#   larger value of C imposes less regularization on the model
params = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}
lr = LogisticRegression(solver='liblinear')
best_score_no_norm = 0
best_params = {}

features_X_train = X_train
target_y_train = y_train
features_X_cross = X_cross
target_y_cross = y_cross

for c in params['C']:
    for p in params['penalty']:
        lr.set_params(C=c, penalty=p)
        lr.fit(features_X_train, target_y_train)
        score_train = lr.score(features_X_train,target_y_train)
        score_cross = lr.score(features_X_cross, target_y_cross)
        print("score_train: {0}, score_cross: {1}, C: {2}, penalty: {3}".format(score_train,score_cross,c,p))
        if score_train > best_score_no_norm:
            best_score_no_norm = score_train
            best_params_no_norm = {'C': c, 'penalty': p}
print("Without Normalization")
print("Best hyperparameters:", best_params_no_norm)
print("Cross set score:", best_score_no_norm)

#------------

lr = LogisticRegression(solver='liblinear')
best_score_norm = 0
best_params_norm = {}

features_X_train = X_train_n
target_y_train = y_train
features_X_cross = X_cross_n
target_y_cross = y_cross

for c in params['C']:
    for p in params['penalty']:
        
        lr.set_params(C=c, penalty=p)
        lr.fit(features_X_train, target_y_train)
        
        score_train = lr.score(features_X_train,target_y_train)
        score_cross = lr.score(features_X_cross, target_y_cross)
        print("score_train: {0}, score_cross: {1}, C: {2}, penalty: {3}".format(score_train,score_cross,c,p))
        if score_train > best_score_norm:
            best_score_norm = score_train
            best_params_norm = {'C': c, 'penalty': p}
print("With Normalization")
print("Best hyperparameters:", best_params_norm)
print("Cross set score:", best_score_norm)

if best_score_no_norm >= best_score_norm:
    best_params = best_params_no_norm
    features_X_train = X_train
    target_y_train = y_train
    features_X_cross = X_cross
    target_y_cross = y_cross
    isnorm = False
elif best_score_norm >= best_score_no_norm:
    best_params = best_params_norm
    isnorm = True
else:
    raise ValueError('no condition met')

print("BEST COMBINATION hyperparameters: {0}, With normalization: {1}".format(best_params,isnorm))

lr = LogisticRegression(solver='liblinear', C=best_params['C'],penalty=best_params['penalty'])##(** is for unpacking dictionary)
lr.fit(features_X_train, target_y_train)

#Evaluation
cross_score = lr.score(features_X_cross, target_y_cross)
print("cross set score:", cross_score)

yhat_m3 = lr.predict(features_X_cross)

accuracy_m3 = accuracy_score(target_y_cross,yhat_m3)
print('Accuracy: ', accuracy_m3)

jaccard_m3 = jaccard_score(target_y_cross, yhat_m3,pos_label=1)
print('Jaccard' , jaccard_m3)

print(metrics.classification_report(target_y_cross,yhat_m3))

# Plot the confusion matrix
cm = confusion_matrix(target_y_cross, yhat_m3)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(target_y_cross))
disp.plot()

tn, fp, fn, tp = confusion_matrix(target_y_cross, yhat_m3).ravel()
tpr = (tp / (tp + fp)) * 100
print("Accuracy for predicting class 1: {:.2f}%".format(tpr))

# SIMULATION PART

In [37]:
def predict_simu(model, data_simu_x_n, data_simu_y, initial_capital):
    capital = initial_capital
    win_count = 0
    lost_count = 0
    no_order_count = 0
    record_result = []
    record_capital = []
    record_date = []
    consecutive_lost = 0
    consecutive_lost_max = 0
    lw = []
    for i in range(0, len(data_simu_x_n)):
        # if model == model_xgb:
        #     prediction = model.predict(data_simu_x_n.iloc[i].values.reshape(1,-1))
        #     prediction = (prediction > 0.5).astype(int)
        prediction = model.predict(data_simu_x_n.iloc[i].values.reshape(1,-1))
        
        if prediction == 1 and data_simu_y[i] == 1:
            outcome = 3.1
            capital += outcome
            win_count += 1
            order_record = 'win-------- prediction = {0}, actual = {1}'.format(prediction,data_simu_y[i])
            result = 'win'
        elif prediction == 1 and data_simu_y[i] == 0:
            outcome = -3.1
            capital += outcome
            lost_count += 1
            order_record = 'lost------- prediction = {0}, actual = {1}'.format(prediction,data_simu_y[i])
            result = 'lost'
        elif prediction == 0:
            no_order_count +=1
            order_record = 'no order--- prediction = {0}, actual = {1}'.format(prediction, data_simu_y[i])
            capital = capital
            result = 'no order'
        else:
            raise ValueError('no condition met')
        record_date.append(data_simu_x_n.iloc[i].name)
        record_result.append(order_record)
        record_capital.append(capital)

        #Calculate Consecutive Lost
        if result == 'lost':
            consecutive_lost += -1
            lw.append('Lost')
            if consecutive_lost <= consecutive_lost_max:
                consecutive_lost_max = consecutive_lost
        if result == 'win':
            lw.append('Win')
            consecutive_lost = 0

    total_return = ((capital - initial_capital) / initial_capital)*100
    sim_df = pd.DataFrame({'record_date':record_date,
                           'record_result':record_result,
                           'record_capital': record_capital})
    sim_df.set_index('record_date', inplace=True)

    return win_count, lost_count, no_order_count, capital, total_return, sim_df, consecutive_lost_max, lw


In [38]:
# X_train = np.asarray(X.loc[train_start:train_end])
# y_train = np.asarray(y.loc[train_start:train_end])

# X_cross = np.asarray(X.loc[cross_start:cross_end])
# y_cross = np.asarray(y.loc[cross_start:cross_end])

# X_test = np.asarray(X.loc[test_start:])
# y_test =np.asarray(y.loc[test_start:])


X_test_simu = X.loc[test_start:]
X_test_simu_n = normalize_data_new(X_test_simu,X_train_means,X_train_stds)

X_cross_simu = X.loc[cross_start:cross_end]
X_cross_simu_n = normalize_data_new(X_cross_simu,X_train_means,X_train_stds)

def predict_count(data_simu):
    adding = []
    for i in range(0,len(data_simu)):
        prediction = str(lsvm.predict(data_simu.iloc[i].values.reshape(1,-1))).strip('[]')
        adding.append(prediction)
    print(adding.count('0'))
    print(adding.count('1'))


print('-----cross-----')
print('The value should be:>> yhat = 0: {0}<<, >> yhat = 1: {1}<<'.format(np.count_nonzero(yhat_m1 == 0),np.count_nonzero(yhat_m1 == 1)))
predict_count(X_cross_simu_n)

print('-----test-----')
predict_count(X_test_simu_n)

-----cross-----
The value should be:>> yhat = 0: 1814<<, >> yhat = 1: 81<<
1814
81
-----test-----
1793
101


In [39]:
win_count, lost_count, no_order_count, capital, total_return, sim_df, consecutive_lost_max, lw = predict_simu(lsvm, 
                                                                                                          X_test_simu_n, 
                                                                                                          y_test, 
                                                                                                          500)
print('Total Return: {0}%'.format(round(total_return,0)))
print('Final Capital: ${0}'.format(round(capital,0)))
print('Win Count: ',win_count)
print('Lost Count: ',lost_count)
print('No Order Count: ', no_order_count)
print('Max Consecutive Lost: {0} trades'.format(consecutive_lost_max/-1))
day_one = X_test_simu_n.iloc[0].name.to_pydatetime().date()
day_final = X_test_simu_n.iloc[len(X_test_simu_n)-1].name.to_pydatetime().date()
print('Total day on trading: {0} days'.format((day_final - day_one).days))
Accuracy = win_count / (win_count+lost_count)
print(f"Accuarcy {Accuracy*100:.2f}%")

Total Return: 1.0%
Final Capital: $503.0
Win Count:  51
Lost Count:  50
No Order Count:  1793
Max Consecutive Lost: 11.0 trades
Total day on trading: 121 days
Accuarcy 50.50%


In [None]:
X_test_simu_n.iloc[0]

In [None]:
sim_df.plot(y='record_capital')

In [None]:
sim_df['record_capital'].to_csv('record_capital.csv')

# Export the simulation data set:

In [None]:
np.savetxt('y_long_simu.csv', y_test, delimiter=',')
X_test_simu_n.to_csv('X_long_simu.csv')

##### To import the file use:

```python
y_test = np.loadtxt('y_long_simu.csv', delimiter=',')
X_test_simu_n = pd.read_csv('X_long_simu.csv',index_col='datetime', parse_dates=True)
```

# Saving model using Pickle

In [None]:
import pickle
print('version should be 4.0:',pickle.format_version)

In [None]:
pickle.dump(lsvm,open('lsvm_xauusd_long_rrr.pkl','wb'))

In [None]:
pickled_model=pickle.load(open('lsvm_xauusd_long_rrr.pkl','rb'))

In [None]:
model_input = [-2.839808,-2.806494,-2.828501,-2.791640]
model_input = np.array(model_input).reshape(1,-1)
model_input
pickled_model.predict(model_input)

# Question on SVM

##### what is the different between svm.SVC(kernel='linear') and LinearSVC in sklearn?

svm.SVC(kernel='linear') and LinearSVC are both linear SVM classifiers available in scikit-learn, but there are some differences between them.

svm.SVC(kernel='linear') is based on the LIBSVM library, which is written in C++, while LinearSVC is based on the LIBLINEAR library, which is written in C.

One of the main differences between these two is the handling of multi-class classification. svm.SVC(kernel='linear') uses the one-vs-one approach, which trains a binary classifier for every pair of classes, while LinearSVC uses the one-vs-the-rest approach, which trains a binary classifier for each class against all the other classes.

Another difference is the way they handle the regularization parameter C. In svm.SVC(kernel='linear'), C controls the tradeoff between maximizing the margin and minimizing the classification error. In LinearSVC, C controls the strength of regularization, where smaller values of C lead to stronger regularization.

Overall, LinearSVC is faster and more memory-efficient than svm.SVC(kernel='linear') and is recommended for large datasets with many features, while svm.SVC(kernel='linear') may be more suitable for smaller datasets with few features or for multi-class classification.

##### which is created first

SVC(kernel='linear') was created first and it is a more flexible implementation of linear support vector machines. It can handle non-linearly separable data by using the kernel trick to transform the input data into a higher-dimensional feature space where the data becomes linearly separable.

LinearSVC, on the other hand, is a more efficient implementation of linear support vector machines that only supports linear kernels. It uses a different algorithm for training the model that is more scalable and faster than the algorithm used by SVC(kernel='linear').

Both models are useful for solving binary classification problems, but LinearSVC is generally preferred when the dataset is large and the number of features is high, while SVC(kernel='linear') is preferred when the dataset is smaller or when non-linearly separable data is involved.

##### Definition of large dataset and high number of features

The definitions of "large dataset" and "high number of features" are somewhat relative and can vary depending on the context of the problem and the available computational resources. However, as a general rule of thumb, a "large dataset" can refer to a dataset with tens of thousands or more samples, while a "small dataset" can refer to a dataset with a few hundred or a few thousand samples.

Similarly, a "high number of features" can also be relative, but it generally refers to datasets where the number of features is comparable to or larger than the number of samples. For example, in genomics and bioinformatics, it is common to work with datasets with tens of thousands or even hundreds of thousands of features.

In practice, when working with a large dataset with a high number of features, LinearSVC can be more computationally efficient and can handle such datasets more effectively. On the other hand, when working with a smaller dataset with a lower number of features, SVC(kernel='linear') can perform just as well and may offer better accuracy in some cases.b

# Question XGBoost

Random Forest and XGBoost are both popular machine learning algorithms used for regression and classification tasks. However, there are several key differences between these algorithms, which are outlined below:

**During the training process**, XGBoost uses gradient boosting to optimize a loss function. The loss function measures the difference between the predicted and actual values and is chosen based on the specific problem being solved. For example, the mean squared error (MSE) loss function can be used for regression problems, and the cross-entropy loss function can be used for classification problems.

**Model architecture:** Random Forest is an ensemble learning method that builds multiple decision trees and combines their predictions to make a final prediction. Each decision tree in the Random Forest is built independently and does not depend on the other trees. On the other hand, XGBoost is a boosted tree algorithm that builds decision trees sequentially, where each new tree corrects the errors of the previous trees.

**Tree construction:** In Random Forest, each decision tree is built by randomly selecting a subset of the features and the samples in the training set. This helps to reduce overfitting and improves the performance of the model. In XGBoost, each tree is built by greedily selecting the best split that maximizes the information gain.

**Handling of missing data:** Random Forest can handle missing data by imputing the missing values with the mean or median of the feature. XGBoost can handle missing data by splitting the samples into two groups: one group with the missing value and one group without the missing value.

**Regularization:** Random Forest does not have any regularization parameters. XGBoost has several regularization parameters, including the learning rate, which controls the step size during the gradient descent, and the regularization term, which penalizes complex models and helps to prevent overfitting.

**Performance:** Random Forest is generally faster to train than XGBoost, especially for large datasets. However, XGBoost often outperforms Random Forest in terms of predictive accuracy, especially for complex tasks with high-dimensional features.

**In summary,** Random Forest and XGBoost are both powerful machine learning algorithms with different strengths and weaknesses. Random Forest is a simple and fast algorithm that can handle missing data, while XGBoost is a more complex algorithm that can handle complex tasks and has better predictive accuracy, but may require more tuning and training time.

