In [1]:
import pandas as pd

In [2]:
# Paths
data_folder = input()
TRAIN_PATH = data_folder + '/train.csv'
TEST_PATH = data_folder + '/test.csv'
SAMPLE_SUBMISSION_PATH = data_folder + '/sample_submission.csv'



In [133]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)


In [168]:
targets_for_test_df = test_df['close'] / test_df['close'].shift(1)
targets_for_test_df = targets_for_test_df > 1
targets_for_test_df = targets_for_test_df.astype(int).shift(-1)
targets_for_test_df.dropna(inplace=True)


## Basic Logistic Regression

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Features and target
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5253929439701476
Validation F1 Macro Score: 0.35738196975903924


In [82]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



Predictions saved to submission.csv


In [90]:
# Evaluate the X_test predictions using the targets_for_test_df


test_predictions = test_predictions[:len(targets_for_test_df)]
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58022
Test F1 Macro Score: 0.37593


## Add Lagged Features

### All features lagged from 0 to 6

In [97]:
# Treat Dataframe

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# Treat Train
treated_train_df = train_df.copy()
# Add lagged features for every column
for column in treated_train_df.columns:
    if column != 'target' and column != 'timestamp':
        for lag in range(1, 6):
            treated_train_df[f'{column}_lag_{lag}'] = treated_train_df[column].shift(lag)

treated_train_df = treated_train_df

treated_train_df_close_lag = treated_train_df.filter(like='close_lag')
treated_train_df_close_lag.loc[:, train_df.columns] = train_df.values

treated_train_df = treated_train_df.dropna()
treated_train_df_close_lag = treated_train_df_close_lag.dropna()

# Treat Test
treated_test_df = test_df.copy()
# Add lagged features for every column
for column in treated_test_df.columns:
    if column != 'row_id' and column != 'timestamp':
        for lag in range(1, 6):
            treated_test_df[f'{column}_lag_{lag}'] = treated_test_df[column].shift(lag)

treated_test_df = treated_test_df

treated_test_df_close_lag = treated_test_df.filter(like='close_lag')
treated_test_df_close_lag.loc[:, test_df.columns] = test_df.values

treated_test_df = treated_test_df.dropna()
treated_test_df_close_lag = treated_test_df_close_lag.dropna()

In [99]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'All Variables Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'All variables Validation F1 Macro Score: {f1_macro}')

# Do the same for the test data

# Features and target
X = treated_train_df_close_lag.drop(columns=['target'])
y = treated_train_df_close_lag['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model_close_lag = LogisticRegression(max_iter=1000)
model_close_lag.fit(X_train, y_train)

# Predict on the validation set
y_pred = model_close_lag.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Close lag Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Close lag Validation F1 Macro Score: {f1_macro}')





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


All Variables Validation Accuracy: 0.5247133598908801
All variables Validation F1 Macro Score: 0.3580181799387846
Close lag Validation Accuracy: 0.5246992251823966
Close lag Validation F1 Macro Score: 0.3566996334376826


In [111]:
len( test_df['row_id'][:len(X_test)])

909612

In [112]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'][:len(X_test)],
    'target': test_predictions[:len(X_test)]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

# Now Evaluate treated_test_df_close_lag 
X_test = treated_test_df_close_lag.drop(columns=['row_id'])
X_test = X_test[:len(treated_train_df_close_lag)]

# Make predictions on the test data
test_close_lag_predictions = model_close_lag.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'][:len(X_test)],
    'target': test_close_lag_predictions[:len(X_test)]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")





Predictions saved to submission.csv
Predictions saved to submission.csv


In [None]:
targets_for_test_df[:]

In [116]:
# Evaluate the X_test predictions using the targets_for_test_df


# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_close_lag_predictions)], test_close_lag_predictions)
print(f'Test close lag Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_close_lag_predictions)], test_close_lag_predictions, average='macro')
print(f'Test close lag F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57919
Test F1 Macro Score: 0.37376
Test close lag Accuracy: 0.57917
Test close lag F1 Macro Score: 0.37383


## Standardize Data

In [174]:

treated_train_df = train_df.copy()


X = treated_train_df.drop(columns=['target', 'timestamp'])
y = treated_train_df['target']
# Standardize the data
scaler = StandardScaler()
treated_train_df.loc[:, X.columns] = scaler.fit_transform(X)



# Standardize the test data
treated_test_df = test_df.copy()
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])
treated_test_df.loc[:, X_test.columns] = scaler.transform(X_test)

In [175]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5249830383897778
Validation F1 Macro Score: 0.3442550016452018


In [176]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58005
Test F1 Macro Score: 0.36711


## Feature Engineering


### Add RSI, Moving Average and MACD

In [158]:

def compute_rsi(data, window):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
def compute_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data.ewm(span=short_window, adjust=False).mean()
    long_ema = data.ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal



treated_train_df = train_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_train_df['RSI'] = compute_rsi(treated_train_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_train_df['moving_average'] = treated_train_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_train_df['MACD'], treated_train_df['MACD_signal'] = compute_macd(treated_train_df['close'])

treated_train_df = treated_train_df.dropna()



# Do the same for test_df
treated_test_df = test_df.copy()

# Compute RSI for the 'close' column with a window of 14
treated_test_df['RSI'] = compute_rsi(treated_test_df['close'], 14)

# Compute moving average for the 'close' column with a window of 14
treated_test_df['moving_average'] = treated_test_df['close'].rolling(window=14).mean()

# Compute MACD and signal line for the 'close' column
treated_test_df['MACD'], treated_test_df['MACD_signal'] = compute_macd(treated_test_df['close'])

treated_test_df = treated_test_df.dropna()

In [159]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5238905436995702
Validation F1 Macro Score: 0.35677156806833143


In [160]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57932
Test F1 Macro Score: 0.37449


### Add Month, Day of Week and Hour of Day

In [177]:
# Load your data set
treated_train_df = train_df.copy()
treated_train_df['timestamp'] = pd.to_datetime(treated_train_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_train_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_train_df['month'] = treated_train_df.index.month
treated_train_df['day_of_week'] = treated_train_df.index.dayofweek
treated_train_df['time_of_day'] = treated_train_df.index.hour + treated_train_df.index.minute / 60.0

# Do the same for test_df
treated_test_df = test_df.copy()
treated_test_df['timestamp'] = pd.to_datetime(treated_test_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_test_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_test_df['month'] = treated_test_df.index.month
treated_test_df['day_of_week'] = treated_test_df.index.dayofweek
treated_test_df['time_of_day'] = treated_test_df.index.hour + treated_test_df.index.minute / 60.0



In [178]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5251997700759503
Validation F1 Macro Score: 0.3648028300053914


In [179]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57967
Test F1 Macro Score: 0.38048


## Hyperparameter Tuning 

In [180]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression



# Padronizar os dados
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(train_df.drop(columns=['target'])) 
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model and parameter grid
model = LogisticRegression(max_iter=1000)  # Aumentar o número de iterações
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Set up GridSearchCV with k-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [185]:
# Predict on the validation set
y_pred = best_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5253929439701476
Validation F1 Macro Score: 0.35738196975903924


In [188]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58022
Test F1 Macro Score: 0.37593


## Recreate Paper

### Monthly

In [192]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.52 - F1 Score: 0.37
Model Accuracy for Month 2: 0.52 - F1 Score: 0.40
Model Accuracy for Month 3: 0.52 - F1 Score: 0.38
Model Accuracy for Month 4: 0.49 - F1 Score: 0.47
Model Accuracy for Month 5: 0.53 - F1 Score: 0.36
Model Accuracy for Month 6: 0.53 - F1 Score: 0.36
Model Accuracy for Month 7: 0.54 - F1 Score: 0.38
Model Accuracy for Month 8: 0.53 - F1 Score: 0.39
Model Accuracy for Month 9: 0.53 - F1 Score: 0.37
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.52 - F1 Score: 0.38
Model Accuracy for Month 12: 0.52 - F1 Score: 0.36
Average Coefficients: [-2.67835664e-02 -2.67956252e-02 -2.67764923e-02 -2.68086056e-02
  1.69702620e-07 -1.12223552e-07  9.34319311e-05 -1.02999829e-07
 -1.13676525e-07]
Validation Accuracy: 0.5251078946872467
Validation F1 Macro Score: 0.4289752737725788




In [195]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57201
Test F1 Macro Score: 0.44616


In [210]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")




Predictions saved to submission.csv


### Weekly

In [196]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the day of the week
data['day_of_week'] = data.index.dayofweek

# List to store models and coefficients
models_day = []
coefficients_day = []

# Iterate through each day of the week to train individual models
for day in range(7):
    daily_data = data[data['day_of_week'] == day]
    if len(daily_data) < 10:  # Skip days with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = daily_data[train_df.columns.drop(['timestamp', 'target'])]
    y = daily_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_day.append(model)
    coefficients_day.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Day {day}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_day, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'day_of_week'])
y = data['target']

# Split data into train and test 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Day 0: 0.52 - F1 Score: 0.36
Model Accuracy for Day 1: 0.53 - F1 Score: 0.36
Model Accuracy for Day 2: 0.53 - F1 Score: 0.36
Model Accuracy for Day 3: 0.53 - F1 Score: 0.36
Model Accuracy for Day 4: 0.52 - F1 Score: 0.36
Model Accuracy for Day 5: 0.53 - F1 Score: 0.36
Model Accuracy for Day 6: 0.53 - F1 Score: 0.37
Average Coefficients: [-2.78774996e-02 -2.78895336e-02 -2.78703675e-02 -2.79016629e-02
  9.38670756e-08 -1.82204223e-08  5.49662333e-05 -1.44465621e-07
  4.33303745e-08]
Validation Accuracy: 0.5255390022803943
Validation F1 Macro Score: 0.3922182955337142




In [197]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57820
Test F1 Macro Score: 0.40350


### Hourly

In [198]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the hour of the day
data['hour_of_day'] = data.index.hour

# List to store models and coefficients
models_hour = []
coefficients_hour = []

# Iterate through each hour of the day to train individual models
for hour in range(24):
    hourly_data = data[data['hour_of_day'] == hour]
    if len(hourly_data) < 10:  # Skip hours with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = hourly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = hourly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_hour.append(model)
    coefficients_hour.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Hour {hour}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_hour, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'hour_of_day'])
y = data['target']

# Split data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Hour 0: 0.52 - F1 Score: 0.36
Model Accuracy for Hour 1: 0.53 - F1 Score: 0.37
Model Accuracy for Hour 2: 0.53 - F1 Score: 0.36
Model Accuracy for Hour 3: 0.53 - F1 Score: 0.37
Model Accuracy for Hour 4: 0.52 - F1 Score: 0.36
Model Accuracy for Hour 5: 0.53 - F1 Score: 0.36
Model Accuracy for Hour 6: 0.52 - F1 Score: 0.36
Model Accuracy for Hour 7: 0.53 - F1 Score: 0.36
Model Accuracy for Hour 8: 0.54 - F1 Score: 0.36
Model Accuracy for Hour 9: 0.53 - F1 Score: 0.37
Model Accuracy for Hour 10: 0.53 - F1 Score: 0.36
Model Accuracy for Hour 11: 0.52 - F1 Score: 0.36
Model Accuracy for Hour 12: 0.51 - F1 Score: 0.36
Model Accuracy for Hour 13: 0.53 - F1 Score: 0.35
Model Accuracy for Hour 14: 0.52 - F1 Score: 0.36
Model Accuracy for Hour 15: 0.53 - F1 Score: 0.37
Model Accuracy for Hour 16: 0.51 - F1 Score: 0.36
Model Accuracy for Hour 17: 0.51 - F1 Score: 0.46
Model Accuracy for Hour 18: 0.52 - F1 Score: 0.37
Model Accuracy for Hour 19: 0.52 - F1 Score: 0.36
Model Accu



In [199]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57826
Test F1 Macro Score: 0.40391


### Minute by minute


In [207]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 0: 0.56 - F1 Score: 0.44
Model Accuracy for minute 1: 0.58 - F1 Score: 0.39
Model Accuracy for minute 2: 0.55 - F1 Score: 0.42
Model Accuracy for minute 3: 0.54 - F1 Score: 0.38
Model Accuracy for minute 4: 0.56 - F1 Score: 0.44
Model Accuracy for minute 5: 0.53 - F1 Score: 0.37
Model Accuracy for minute 6: 0.53 - F1 Score: 0.49
Model Accuracy for minute 7: 0.47 - F1 Score: 0.39
Model Accuracy for minute 8: 0.48 - F1 Score: 0.41
Model Accuracy for minute 9: 0.45 - F1 Score: 0.45
Model Accuracy for minute 10: 0.54 - F1 Score: 0.46
Model Accuracy for minute 11: 0.47 - F1 Score: 0.41
Model Accuracy for minute 12: 0.56 - F1 Score: 0.43
Model Accuracy for minute 13: 0.51 - F1 Score: 0.35
Model Accuracy for minute 14: 0.54 - F1 Score: 0.54
Model Accuracy for minute 15: 0.52 - F1 Score: 0.40
Model Accuracy for minute 16: 0.53 - F1 Score: 0.35
Model Accuracy for minute 17: 0.53 - F1 Score: 0.46
Model Accuracy for minute 18: 0.49 - F1 Score: 0.40
Model Accuracy for min



In [208]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57377
Test F1 Macro Score: 0.44131


## Ensamble paper + month + day + hour 

In [200]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month
data['day_of_week'] = data.index.dayofweek
data['hour_of_day'] = data.index.hour

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[data.columns.drop('target')]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target'])
y = data['target']

# Split data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.52 - F1 Score: 0.38
Model Accuracy for Month 2: 0.52 - F1 Score: 0.42
Model Accuracy for Month 3: 0.52 - F1 Score: 0.38
Model Accuracy for Month 4: 0.52 - F1 Score: 0.38
Model Accuracy for Month 5: 0.53 - F1 Score: 0.36
Model Accuracy for Month 6: 0.53 - F1 Score: 0.36
Model Accuracy for Month 7: 0.53 - F1 Score: 0.38
Model Accuracy for Month 8: 0.53 - F1 Score: 0.40
Model Accuracy for Month 9: 0.53 - F1 Score: 0.37
Model Accuracy for Month 10: 0.52 - F1 Score: 0.36
Model Accuracy for Month 11: 0.52 - F1 Score: 0.39
Model Accuracy for Month 12: 0.52 - F1 Score: 0.36
Average Coefficients: [-2.14104137e-03 -2.14177726e-03 -2.14271522e-03 -2.14852638e-03
  2.39060345e-07 -2.12104152e-07  8.78033162e-05 -9.93560450e-08
 -1.26904822e-07 -1.49751268e-02 -7.37690702e-03  1.09628840e-03]




Validation Accuracy: 0.5241538041122482
Validation F1 Macro Score: 0.40769913219731135


In [202]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Create a column for the month
X_test['month'] = X_test.index.month
X_test['day_of_week'] = X_test.index.dayofweek
X_test['hour_of_day'] = X_test.index.hour

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57129
Test F1 Macro Score: 0.43313


## Paper but only OHLCV
 

In [203]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[features]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data[features]
y = data['target']

# Split data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Model Accuracy for Month 1: 0.52 - F1 Score: 0.35
Model Accuracy for Month 2: 0.49 - F1 Score: 0.33
Model Accuracy for Month 3: 0.52 - F1 Score: 0.37
Model Accuracy for Month 4: 0.52 - F1 Score: 0.37
Model Accuracy for Month 5: 0.53 - F1 Score: 0.37
Model Accuracy for Month 6: 0.53 - F1 Score: 0.36
Model Accuracy for Month 7: 0.53 - F1 Score: 0.36
Model Accuracy for Month 8: 0.53 - F1 Score: 0.38
Model Accuracy for Month 9: 0.53 - F1 Score: 0.36
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.52 - F1 Score: 0.38
Model Accuracy for Month 12: 0.52 - F1 Score: 0.35
Average Coefficients: [-2.63215995e-02 -2.63359707e-02 -2.63161496e-02 -2.63473651e-02
  8.44300381e-08]
Validation Accuracy: 0.524827556962741
Validation F1 Macro Score: 0.414361932701391




In [204]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

features = ['open', 'high', 'low', 'close', 'volume']
X_test = X_test[features]

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57292
Test F1 Macro Score: 0.43191
