## Headers

In [1]:
import pandas as pd
import time
import numpy as np


In [2]:
from data_treatment import train_df, test_df, \
                            treated_train_df, treated_test_df, \
                            new_features_train_df, new_features_test_df, \
                            svd_train_df, svd_test_df, \
                            targets_for_test_df

from data_treatment import train_val_split

## Basic Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

start_time = time.time()

# Features and target
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


end_time = time.time()
print(f'Time elapsed: {end_time - start_time} seconds')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Time elapsed: 20.6884343624115 seconds
--------------------------------------
Validation Accuracy: 0.5314072482614349
Validation F1 Macro Score: 0.3553044848365192


In [4]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



Predictions saved to submission.csv


In [5]:
# Evaluate the X_test predictions using the targets_for_test_df


test_predictions = test_predictions[:len(targets_for_test_df)]
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58023
Test F1 Macro Score: 0.37640


## Apply stationary treatment

In [6]:
    
def stationarity_treatment(train_df):   
    
    # Apply log filter and differentiate all features
    treated_train_df = train_df.copy() 
    droped_columns = []
    
    if 'row_id' in train_df.columns:
        treated_train_df = treated_train_df.drop(columns=['row_id'])
        droped_columns.append('row_id')        
        
    if 'target' in train_df.columns:
        treated_train_df = treated_train_df.drop(columns=['target'])
        droped_columns.append('target')

    # Apply log filter to numeric columns
    for column in treated_train_df.columns:
        if np.issubdtype(treated_train_df[column].dtype, np.number):
            treated_train_df[column] = np.log(treated_train_df[column] + 1)

    # Differentiate the numeric columns
    treated_train_df = treated_train_df.diff().dropna()
    for column in droped_columns:
        treated_train_df[column] = train_df[column].iloc[1:].values
    
    return treated_train_df

    

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

start_time = time.time()

stationary_train_df = stationarity_treatment(train_df)
# Features and target
X = stationary_train_df.drop(columns=['target'])
y = stationary_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


end_time = time.time()
print(f'Time elapsed: {end_time - start_time} seconds')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Time elapsed: 4.323765516281128 seconds
--------------------------------------
Validation Accuracy: 0.5310962854073613
Validation F1 Macro Score: 0.3470552405389616


In [8]:
stationary_test_df = stationarity_treatment(test_df)

# Ensure the test data is preprocessed in the same way as the training data
X_test = stationary_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': [0, *test_predictions]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



Predictions saved to submission.csv


In [9]:
# Evaluate the X_test predictions using the targets_for_test_df


test_predictions = test_predictions[:len(targets_for_test_df)]
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57623
Test F1 Macro Score: 0.36575


## Logistic Regression on SVDd data

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

start_time = time.time()

svd_train_df = svd_train_df.dropna()
# Features and target
X = svd_train_df.drop(columns=['target'])
y = svd_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


end_time = time.time()
print(f'Time elapsed: {end_time - start_time} seconds')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Time elapsed: 5.784758567810059 seconds
--------------------------------------
Validation Accuracy: 0.5288366599740167
Validation F1 Macro Score: 0.4463134219244439


In [16]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = svd_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56000
Test F1 Macro Score: 0.44273


## Add Lagged Features

### All features lagged from 0 to 6

In [None]:
# Treat Dataframe

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# Treat Train
treated_train_df = train_df.copy()
# Add lagged features for every column
for column in treated_train_df.columns:
    if column != 'target' and column != 'timestamp':
        for lag in range(1, 6):
            treated_train_df[f'{column}_lag_{lag}'] = treated_train_df[column].shift(lag)

treated_train_df = treated_train_df

treated_train_df_close_lag = treated_train_df.filter(like='close_lag')
treated_train_df_close_lag.loc[:, train_df.columns] = train_df.values

treated_train_df = treated_train_df.dropna()
treated_train_df_close_lag = treated_train_df_close_lag.dropna()

# Treat Test
treated_test_df = test_df.copy()
# Add lagged features for every column
for column in treated_test_df.columns:
    if column != 'row_id' and column != 'timestamp':
        for lag in range(1, 6):
            treated_test_df[f'{column}_lag_{lag}'] = treated_test_df[column].shift(lag)

treated_test_df = treated_test_df

treated_test_df_close_lag = treated_test_df.filter(like='close_lag')
treated_test_df_close_lag.loc[:, test_df.columns] = test_df.values

treated_test_df = treated_test_df.dropna()
treated_test_df_close_lag = treated_test_df_close_lag.dropna()

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'All Variables Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'All variables Validation F1 Macro Score: {f1_macro}')

# Do the same for the test data

# Features and target
X = treated_train_df_close_lag.drop(columns=['target'])
y = treated_train_df_close_lag['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model_close_lag = LogisticRegression(max_iter=1000)
model_close_lag.fit(X_train, y_train)

# Predict on the validation set
y_pred = model_close_lag.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Close lag Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Close lag Validation F1 Macro Score: {f1_macro}')





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


All Variables Validation Accuracy: 0.5313095571831411
All variables Validation F1 Macro Score: 0.35682216512744497
Close lag Validation Accuracy: 0.5314061443577778
Close lag Validation F1 Macro Score: 0.3553040086962812


In [None]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'][:len(X_test)],
    'target': test_predictions[:len(X_test)]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

# Now Evaluate treated_test_df_close_lag 
X_test = treated_test_df_close_lag.drop(columns=['row_id'])
X_test = X_test[:len(treated_train_df_close_lag)]

# Make predictions on the test data
test_close_lag_predictions = model_close_lag.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'][:len(X_test)],
    'target': test_close_lag_predictions[:len(X_test)]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")





Predictions saved to submission.csv
Predictions saved to submission.csv


In [None]:
targets_for_test_df[:]

0         1.0
1         0.0
2         1.0
3         1.0
4         1.0
         ... 
909611    1.0
909612    0.0
909613    0.0
909614    1.0
909615    1.0
Name: close, Length: 909616, dtype: float64

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df


# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_close_lag_predictions)], test_close_lag_predictions)
print(f'Test close lag Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_close_lag_predictions)], test_close_lag_predictions, average='macro')
print(f'Test close lag F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57920
Test F1 Macro Score: 0.37494
Test close lag Accuracy: 0.57900
Test close lag F1 Macro Score: 0.37459


## Standardize Data

In [None]:

treated_train_df = train_df.copy()


X = treated_train_df.drop(columns=['target', 'timestamp'])
y = treated_train_df['target']
# Standardize the data
scaler = StandardScaler()
treated_train_df.loc[:, X.columns] = scaler.fit_transform(X)



# Standardize the test data
treated_test_df = test_df.copy()
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])
treated_test_df.loc[:, X_test.columns] = scaler.transform(X_test)

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val= y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.531152824108102
Validation F1 Macro Score: 0.3468973284345402


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58005
Test F1 Macro Score: 0.36711


## Feature Engineering


### Add RSI, Moving Average, Bolling band, and MACD

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5314209227738358
Validation F1 Macro Score: 0.3553219827173543


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57921
Test F1 Macro Score: 0.37490


### Use ONLY new features

In [None]:

def compute_rsi(data, window):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
def compute_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data.ewm(span=short_window, adjust=False).mean()
    long_ema = data.ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal
def compute_bollinger_band(data, window=15):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * 2)
    lower_band = rolling_mean - (rolling_std * 2)
    print(len(upper_band), len(lower_band))
    return upper_band, lower_band


treated_train_df = train_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_train_df['RSI'] = compute_rsi(treated_train_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_train_df['moving_average'] = treated_train_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_train_df['MACD'], treated_train_df['MACD_signal'] = compute_macd(treated_train_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_train_df['bollinger_upper'], treated_train_df['bollinger_lower'] = compute_bollinger_band(treated_train_df['close'])

treated_train_df = treated_train_df.dropna()
treated_train_df = treated_train_df.drop(columns=train_df.drop(columns=['target']).columns.tolist())



# Do the same for test_df
treated_test_df = test_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_test_df['RSI'] = compute_rsi(treated_test_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_test_df['moving_average'] = treated_test_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_test_df['MACD'], treated_test_df['MACD_signal'] = compute_macd(treated_test_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_test_df['bollinger_upper'], treated_test_df['bollinger_lower'] = compute_bollinger_band(treated_test_df['close'])

treated_test_df = treated_test_df.dropna()
treated_test_df = treated_test_df.drop(columns=test_df.drop(columns=['row_id']).columns.tolist())

2122438 2122438
909617 909617


In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5309496148235776
Validation F1 Macro Score: 0.3476556047300922


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57866
Test F1 Macro Score: 0.36772


### Add Month, Day of Week and Hour of Day

In [None]:
# Load your data set
treated_train_df = train_df.copy()
treated_train_df['timestamp'] = pd.to_datetime(treated_train_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_train_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_train_df['month'] = treated_train_df.index.month
treated_train_df['day_of_week'] = treated_train_df.index.dayofweek
treated_train_df['time_of_day'] = treated_train_df.index.hour + treated_train_df.index.minute / 60.0

# Do the same for test_df
treated_test_df = test_df.copy()
treated_test_df['timestamp'] = pd.to_datetime(treated_test_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_test_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_test_df['month'] = treated_test_df.index.month
treated_test_df['day_of_week'] = treated_test_df.index.dayofweek
treated_test_df['time_of_day'] = treated_test_df.index.hour + treated_test_df.index.minute / 60.0



In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5299820018469309
Validation F1 Macro Score: 0.38594774598718123


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57973
Test F1 Macro Score: 0.38558


## Hyperparameter Tuning 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression



# Padronizar os dados
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(train_df.drop(columns=['target'])) 
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Define the model and parameter grid
model = LogisticRegression(max_iter=1000)  # Aumentar o número de iterações
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Set up GridSearchCV with k-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [None]:
# Predict on the validation set
y_pred = best_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5314072482614349
Validation F1 Macro Score: 0.3553044848365192


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58023
Test F1 Macro Score: 0.37640


## Recreate Paper

### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.54 - F1 Score: 0.45
Model Accuracy for Month 2: 0.51 - F1 Score: 0.49
Model Accuracy for Month 3: 0.53 - F1 Score: 0.36
Model Accuracy for Month 4: 0.45 - F1 Score: 0.34
Model Accuracy for Month 5: 0.53 - F1 Score: 0.43
Model Accuracy for Month 6: 0.52 - F1 Score: 0.42
Model Accuracy for Month 7: 0.53 - F1 Score: 0.42
Model Accuracy for Month 8: 0.52 - F1 Score: 0.39
Model Accuracy for Month 9: 0.53 - F1 Score: 0.40
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.53 - F1 Score: 0.35
Model Accuracy for Month 12: 0.52 - F1 Score: 0.37
Average Coefficients: [-2.20798825e-02 -2.20937305e-02 -2.20749204e-02 -2.21149212e-02
  1.87567104e-07 -9.39045112e-08  1.62591629e-04 -9.35953503e-08
 -2.88208617e-07]




Validation Accuracy: 0.527284634665762
Validation F1 Macro Score: 0.42406341485567456


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.55948
Test F1 Macro Score: 0.48980


In [None]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")




Predictions saved to submission.csv


### Weekly

In [None]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the day of the week
data['day_of_week'] = data.index.dayofweek

# List to store models and coefficients
models_day = []
coefficients_day = []

# Iterate through each day of the week to train individual models
for day in range(7):
    daily_data = data[data['day_of_week'] == day]
    if len(daily_data) < 10:  # Skip days with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = daily_data[train_df.columns.drop(['timestamp', 'target'])]
    y = daily_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_day.append(model)
    coefficients_day.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Day {day}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_day, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'day_of_week'])
y = data['target']

# Split data into train and test 
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Day 0: 0.53 - F1 Score: 0.38
Model Accuracy for Day 1: 0.53 - F1 Score: 0.37
Model Accuracy for Day 2: 0.53 - F1 Score: 0.38
Model Accuracy for Day 3: 0.53 - F1 Score: 0.39
Model Accuracy for Day 4: 0.53 - F1 Score: 0.37
Model Accuracy for Day 5: 0.53 - F1 Score: 0.39
Model Accuracy for Day 6: 0.54 - F1 Score: 0.37
Average Coefficients: [-2.67803531e-02 -2.67929706e-02 -2.67741270e-02 -2.68146135e-02
  8.92224306e-08 -1.36138100e-07  1.86447191e-04 -1.65047824e-07
  7.87169978e-08]
Validation Accuracy: 0.528999641921562
Validation F1 Macro Score: 0.40930593103597596




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57720
Test F1 Macro Score: 0.42154


### Hourly

In [None]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the hour of the day
data['hour_of_day'] = data.index.hour

# List to store models and coefficients
models_hour = []
coefficients_hour = []

# Iterate through each hour of the day to train individual models
for hour in range(24):
    hourly_data = data[data['hour_of_day'] == hour]
    if len(hourly_data) < 10:  # Skip hours with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = hourly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = hourly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_hour.append(model)
    coefficients_hour.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Hour {hour}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_hour, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'hour_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Hour 0: 0.52 - F1 Score: 0.37
Model Accuracy for Hour 1: 0.53 - F1 Score: 0.36
Model Accuracy for Hour 2: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 3: 0.53 - F1 Score: 0.40
Model Accuracy for Hour 4: 0.53 - F1 Score: 0.39
Model Accuracy for Hour 5: 0.53 - F1 Score: 0.39
Model Accuracy for Hour 6: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 7: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 8: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 9: 0.54 - F1 Score: 0.39
Model Accuracy for Hour 10: 0.53 - F1 Score: 0.45
Model Accuracy for Hour 11: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 12: 0.52 - F1 Score: 0.37
Model Accuracy for Hour 13: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 14: 0.53 - F1 Score: 0.37
Model Accuracy for Hour 15: 0.53 - F1 Score: 0.39
Model Accuracy for Hour 16: 0.51 - F1 Score: 0.48
Model Accuracy for Hour 17: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 18: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 19: 0.53 - F1 Score: 0.39
Model Accu



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57682
Test F1 Macro Score: 0.42455


### Minute by minute


In [None]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 0: 0.54 - F1 Score: 0.47
Model Accuracy for minute 1: 0.54 - F1 Score: 0.38
Model Accuracy for minute 2: 0.44 - F1 Score: 0.44
Model Accuracy for minute 3: 0.52 - F1 Score: 0.40
Model Accuracy for minute 4: 0.53 - F1 Score: 0.39
Model Accuracy for minute 5: 0.54 - F1 Score: 0.41
Model Accuracy for minute 6: 0.54 - F1 Score: 0.53
Model Accuracy for minute 7: 0.45 - F1 Score: 0.39
Model Accuracy for minute 8: 0.46 - F1 Score: 0.38
Model Accuracy for minute 9: 0.49 - F1 Score: 0.47
Model Accuracy for minute 10: 0.55 - F1 Score: 0.47
Model Accuracy for minute 11: 0.53 - F1 Score: 0.49
Model Accuracy for minute 12: 0.52 - F1 Score: 0.41
Model Accuracy for minute 13: 0.49 - F1 Score: 0.34
Model Accuracy for minute 14: 0.48 - F1 Score: 0.48
Model Accuracy for minute 15: 0.54 - F1 Score: 0.49
Model Accuracy for minute 16: 0.56 - F1 Score: 0.37
Model Accuracy for minute 17: 0.45 - F1 Score: 0.35
Model Accuracy for minute 18: 0.52 - F1 Score: 0.36
Model Accuracy for min



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57288
Test F1 Macro Score: 0.45064


## Ensamble paper + added features

### Add new features

In [None]:

def compute_rsi(data, window):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
def compute_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data.ewm(span=short_window, adjust=False).mean()
    long_ema = data.ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal
def compute_bollinger_band(data, window=15):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * 2)
    lower_band = rolling_mean - (rolling_std * 2)
    print(len(upper_band), len(lower_band))
    return upper_band, lower_band


treated_train_df = train_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_train_df['RSI'] = compute_rsi(treated_train_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_train_df['moving_average'] = treated_train_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_train_df['MACD'], treated_train_df['MACD_signal'] = compute_macd(treated_train_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_train_df['bollinger_upper'], treated_train_df['bollinger_lower'] = compute_bollinger_band(treated_train_df['close'])

treated_train_df = treated_train_df.dropna()



# Do the same for test_df
treated_test_df = test_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_test_df['RSI'] = compute_rsi(treated_test_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_test_df['moving_average'] = treated_test_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_test_df['MACD'], treated_test_df['MACD_signal'] = compute_macd(treated_test_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_test_df['bollinger_upper'], treated_test_df['bollinger_lower'] = compute_bollinger_band(treated_test_df['close'])

treated_test_df = treated_test_df.dropna()

2122438 2122438
909617 909617


#### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.55 - F1 Score: 0.38
Model Accuracy for Month 2: 0.52 - F1 Score: 0.41
Model Accuracy for Month 3: 0.53 - F1 Score: 0.38
Model Accuracy for Month 4: 0.55 - F1 Score: 0.39
Model Accuracy for Month 5: 0.53 - F1 Score: 0.46
Model Accuracy for Month 6: 0.52 - F1 Score: 0.45
Model Accuracy for Month 7: 0.54 - F1 Score: 0.43
Model Accuracy for Month 8: 0.52 - F1 Score: 0.38
Model Accuracy for Month 9: 0.53 - F1 Score: 0.42
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.53 - F1 Score: 0.36
Model Accuracy for Month 12: 0.52 - F1 Score: 0.40
Average Coefficients: [-6.58955734e-04 -6.60286756e-04 -6.64149668e-04 -7.00458246e-04
  1.85501098e-07 -2.13060098e-07  2.12218680e-04  1.37734227e-08
 -3.51540126e-07 -2.17467113e-03 -6.67635428e-04 -7.89703637e-06
 -8.33128648e-06 -6.54371028e-04 -6.80563737e-04]
Validation Accuracy: 0.5308506401540234
Validation F1 Macro Score: 0.40175482756365327




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)


# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.56637
Test F1 Macro Score: 0.40815


In [None]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': treated_test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")




Predictions saved to submission.csv


#### Minute by Minute

In [None]:
# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 0: 0.55 - F1 Score: 0.37
Model Accuracy for minute 1: 0.54 - F1 Score: 0.37
Model Accuracy for minute 2: 0.52 - F1 Score: 0.44
Model Accuracy for minute 3: 0.54 - F1 Score: 0.46
Model Accuracy for minute 4: 0.54 - F1 Score: 0.40
Model Accuracy for minute 5: 0.55 - F1 Score: 0.44
Model Accuracy for minute 6: 0.53 - F1 Score: 0.52
Model Accuracy for minute 7: 0.46 - F1 Score: 0.32
Model Accuracy for minute 8: 0.47 - F1 Score: 0.38
Model Accuracy for minute 9: 0.50 - F1 Score: 0.46
Model Accuracy for minute 10: 0.56 - F1 Score: 0.55
Model Accuracy for minute 11: 0.56 - F1 Score: 0.53
Model Accuracy for minute 12: 0.53 - F1 Score: 0.46
Model Accuracy for minute 13: 0.50 - F1 Score: 0.35
Model Accuracy for minute 14: 0.46 - F1 Score: 0.40
Model Accuracy for minute 15: 0.55 - F1 Score: 0.47
Model Accuracy for minute 16: 0.56 - F1 Score: 0.39
Model Accuracy for minute 17: 0.53 - F1 Score: 0.53
Model Accuracy for minute 18: 0.53 - F1 Score: 0.37
Model Accuracy for min



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)


# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57505
Test F1 Macro Score: 0.38798


### Only New Features

In [None]:

def compute_rsi(data, window):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
def compute_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data.ewm(span=short_window, adjust=False).mean()
    long_ema = data.ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal
def compute_bollinger_band(data, window=15):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * 2)
    lower_band = rolling_mean - (rolling_std * 2)
    print(len(upper_band), len(lower_band))
    return upper_band, lower_band


treated_train_df = train_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_train_df['RSI'] = compute_rsi(treated_train_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_train_df['moving_average'] = treated_train_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_train_df['MACD'], treated_train_df['MACD_signal'] = compute_macd(treated_train_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_train_df['bollinger_upper'], treated_train_df['bollinger_lower'] = compute_bollinger_band(treated_train_df['close'])

treated_train_df = treated_train_df.dropna()
treated_train_df = treated_train_df.drop(columns=train_df.drop(columns=['target', 'timestamp']).columns.tolist())



# Do the same for test_df
treated_test_df = test_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_test_df['RSI'] = compute_rsi(treated_test_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_test_df['moving_average'] = treated_test_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_test_df['MACD'], treated_test_df['MACD_signal'] = compute_macd(treated_test_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_test_df['bollinger_upper'], treated_test_df['bollinger_lower'] = compute_bollinger_band(treated_test_df['close'])

treated_test_df = treated_test_df.dropna()
treated_test_df = treated_test_df.drop(columns=test_df.drop(columns=['row_id', 'timestamp']).columns.tolist())

2122438 2122438
909617 909617


#### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.55 - F1 Score: 0.36
Model Accuracy for Month 2: 0.52 - F1 Score: 0.34
Model Accuracy for Month 3: 0.53 - F1 Score: 0.36
Model Accuracy for Month 4: 0.55 - F1 Score: 0.37
Model Accuracy for Month 5: 0.53 - F1 Score: 0.35
Model Accuracy for Month 6: 0.53 - F1 Score: 0.35
Model Accuracy for Month 7: 0.54 - F1 Score: 0.35
Model Accuracy for Month 8: 0.52 - F1 Score: 0.34
Model Accuracy for Month 9: 0.52 - F1 Score: 0.42
Model Accuracy for Month 10: 0.53 - F1 Score: 0.34
Model Accuracy for Month 11: 0.53 - F1 Score: 0.36
Model Accuracy for Month 12: 0.51 - F1 Score: 0.51
Average Coefficients: [-0.0020054  -0.02746194 -0.18973423 -0.1555876   0.12936461 -0.15672422]
Validation Accuracy: 0.5311522772421886
Validation F1 Macro Score: 0.3468970951725751




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56959
Test F1 Macro Score: 0.40018


#### Minute by minute

In [None]:
# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 0: 0.55 - F1 Score: 0.35
Model Accuracy for minute 1: 0.54 - F1 Score: 0.35
Model Accuracy for minute 2: 0.58 - F1 Score: 0.58
Model Accuracy for minute 3: 0.54 - F1 Score: 0.36
Model Accuracy for minute 4: 0.52 - F1 Score: 0.34
Model Accuracy for minute 5: 0.52 - F1 Score: 0.43
Model Accuracy for minute 6: 0.53 - F1 Score: 0.50
Model Accuracy for minute 7: 0.46 - F1 Score: 0.32
Model Accuracy for minute 8: 0.51 - F1 Score: 0.43
Model Accuracy for minute 9: 0.47 - F1 Score: 0.36
Model Accuracy for minute 10: 0.57 - F1 Score: 0.57
Model Accuracy for minute 11: 0.48 - F1 Score: 0.35
Model Accuracy for minute 12: 0.52 - F1 Score: 0.34
Model Accuracy for minute 13: 0.51 - F1 Score: 0.37
Model Accuracy for minute 14: 0.43 - F1 Score: 0.30
Model Accuracy for minute 15: 0.45 - F1 Score: 0.37
Model Accuracy for minute 16: 0.57 - F1 Score: 0.37
Model Accuracy for minute 17: 0.43 - F1 Score: 0.30
Model Accuracy for minute 18: 0.53 - F1 Score: 0.35
Model Accuracy for min



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.43058
Test F1 Macro Score: 0.41977


### Add Month, Day of Week and Hour of Day

In [None]:

def compute_rsi(data, window):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
def compute_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data.ewm(span=short_window, adjust=False).mean()
    long_ema = data.ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal
def compute_bollinger_band(data, window=15):
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * 2)
    lower_band = rolling_mean - (rolling_std * 2)
    print(len(upper_band), len(lower_band))
    return upper_band, lower_band


treated_train_df = train_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_train_df['RSI'] = compute_rsi(treated_train_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_train_df['moving_average'] = treated_train_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_train_df['MACD'], treated_train_df['MACD_signal'] = compute_macd(treated_train_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_train_df['bollinger_upper'], treated_train_df['bollinger_lower'] = compute_bollinger_band(treated_train_df['close'])

treated_train_df = treated_train_df.dropna()



# Do the same for test_df
treated_test_df = test_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_test_df['RSI'] = compute_rsi(treated_test_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_test_df['moving_average'] = treated_test_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_test_df['MACD'], treated_test_df['MACD_signal'] = compute_macd(treated_test_df['close'])

# Compute Bollinger bands for the 'close' column with a window of 15
treated_test_df['bollinger_upper'], treated_test_df['bollinger_lower'] = compute_bollinger_band(treated_test_df['close'])

treated_test_df = treated_test_df.dropna()


2122438 2122438
909617 909617


In [None]:
# Load your data set
treated_train_df['timestamp'] = pd.to_datetime(treated_train_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_train_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_train_df['month'] = treated_train_df.index.month
treated_train_df['day_of_week'] = treated_train_df.index.dayofweek
treated_train_df['time_of_day'] = treated_train_df.index.hour + treated_train_df.index.minute / 60.0

# Do the same for test_df
treated_test_df['timestamp'] = pd.to_datetime(treated_test_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_test_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_test_df['month'] = treated_test_df.index.month
treated_test_df['day_of_week'] = treated_test_df.index.dayofweek
treated_test_df['time_of_day'] = treated_test_df.index.hour + treated_test_df.index.minute / 60.0



#### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


# List to store models and coefficients
models = []
coefficients = []

data = treated_train_df.copy()

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[treated_train_df.columns.drop(['target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=treated_train_df.columns.drop(['target']))
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.55 - F1 Score: 0.40
Model Accuracy for Month 2: 0.52 - F1 Score: 0.41
Model Accuracy for Month 3: 0.53 - F1 Score: 0.38
Model Accuracy for Month 4: 0.55 - F1 Score: 0.38


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Accuracy for Month 5: 0.53 - F1 Score: 0.46
Model Accuracy for Month 6: 0.52 - F1 Score: 0.43
Model Accuracy for Month 7: 0.54 - F1 Score: 0.44
Model Accuracy for Month 8: 0.52 - F1 Score: 0.39
Model Accuracy for Month 9: 0.53 - F1 Score: 0.44
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.53 - F1 Score: 0.36
Model Accuracy for Month 12: 0.52 - F1 Score: 0.42
Average Coefficients: [-1.34314757e-03 -1.34503160e-03 -1.34363202e-03 -1.36303707e-03
  1.99603957e-07 -2.23884210e-07  2.17075781e-04 -5.87604814e-09
 -3.38080273e-07 -1.92189924e-03 -1.34559153e-03 -3.01046541e-06
 -2.76427074e-06 -1.34502601e-03 -1.34632161e-03 -4.59550976e-03
 -2.92316624e-05  1.66699147e-03]




ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 18 is different from 1)

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5299820018469309
Validation F1 Macro Score: 0.38594774598718123


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57973
Test F1 Macro Score: 0.38558


#### Minute by minute

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

## Ensamble paper + month + day + hour 

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month
data['day_of_week'] = data.index.dayofweek
data['hour_of_day'] = data.index.hour

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[data.columns.drop('target')]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.55 - F1 Score: 0.38
Model Accuracy for Month 2: 0.52 - F1 Score: 0.43
Model Accuracy for Month 3: 0.53 - F1 Score: 0.37
Model Accuracy for Month 4: 0.55 - F1 Score: 0.40
Model Accuracy for Month 5: 0.52 - F1 Score: 0.46
Model Accuracy for Month 6: 0.52 - F1 Score: 0.43
Model Accuracy for Month 7: 0.53 - F1 Score: 0.42
Model Accuracy for Month 8: 0.52 - F1 Score: 0.39
Model Accuracy for Month 9: 0.53 - F1 Score: 0.40
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.53 - F1 Score: 0.35
Model Accuracy for Month 12: 0.52 - F1 Score: 0.39
Average Coefficients: [-1.07156651e-03 -1.07260306e-03 -1.07244135e-03 -1.08228842e-03
  2.47907463e-07 -2.40155123e-07  1.99280807e-04 -9.58989820e-08
 -2.83978137e-07 -1.35848302e-02 -3.63994434e-03  4.59313754e-04]
Validation Accuracy: 0.5246485177437289
Validation F1 Macro Score: 0.4339989360996589




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Create a column for the month
X_test['month'] = X_test.index.month
X_test['day_of_week'] = X_test.index.dayofweek
X_test['hour_of_day'] = X_test.index.hour

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.56702
Test F1 Macro Score: 0.45276


## Paper but only OHLCV
 

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[features]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data[features]
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Model Accuracy for Month 1: 0.55 - F1 Score: 0.36
Model Accuracy for Month 2: 0.48 - F1 Score: 0.32
Model Accuracy for Month 3: 0.53 - F1 Score: 0.35
Model Accuracy for Month 4: 0.55 - F1 Score: 0.36
Model Accuracy for Month 5: 0.53 - F1 Score: 0.43
Model Accuracy for Month 6: 0.52 - F1 Score: 0.43
Model Accuracy for Month 7: 0.53 - F1 Score: 0.41
Model Accuracy for Month 8: 0.52 - F1 Score: 0.43
Model Accuracy for Month 9: 0.53 - F1 Score: 0.40
Model Accuracy for Month 10: 0.53 - F1 Score: 0.37
Model Accuracy for Month 11: 0.48 - F1 Score: 0.32
Model Accuracy for Month 12: 0.52 - F1 Score: 0.34
Average Coefficients: [-2.05803470e-02 -2.05937735e-02 -2.05780030e-02 -2.06122967e-02
  1.20081905e-07]
Validation Accuracy: 0.5272657884321819
Validation F1 Macro Score: 0.4220523234932381




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

features = ['open', 'high', 'low', 'close', 'volume']
X_test = X_test[features]

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.55724
Test F1 Macro Score: 0.48794
