## Headers

In [None]:
import pandas as pd
import time
import numpy as np


In [None]:
from utils import train_val_split
from utils import train_datapath, test_datapath


In [None]:
targets_for_test_df = pd.read_csv('data/targets_for_test.csv', index_col=0)


## Basic Logistic Regression

In [16]:

train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score



start_time = time.time()

# Features and target
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


end_time = time.time()
print(f'Time elapsed: {end_time - start_time} seconds')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Time elapsed: 16.181551933288574 seconds
--------------------------------------
Validation Accuracy: 0.5314072482614349
Validation F1 Macro Score: 0.3553044848365192


In [18]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



Predictions saved to submission.csv


In [19]:
# Evaluate the X_test predictions using the targets_for_test_df


test_predictions = test_predictions[:len(targets_for_test_df)]
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58023
Test F1 Macro Score: 0.37640


In [20]:
del train_df
del test_df

## Apply stationary treatment

In [38]:

train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [39]:
    
def stationarity_treatment(train_df):   
    
    # Apply log filter and differentiate all features
    treated_train_df = train_df.copy() 
    droped_columns = []
    
    if 'row_id' in train_df.columns:
        treated_train_df = treated_train_df.drop(columns=['row_id'])
        droped_columns.append('row_id')        
        
    if 'target' in train_df.columns:
        treated_train_df = treated_train_df.drop(columns=['target'])
        droped_columns.append('target')

    # Apply log filter to numeric columns
    for column in treated_train_df.columns:
        if np.issubdtype(treated_train_df[column].dtype, np.number):
            treated_train_df[column] = np.log(treated_train_df[column] + 1)

    # Differentiate the numeric columns
    treated_train_df = treated_train_df.diff().dropna()
    for column in droped_columns:
        treated_train_df[column] = train_df[column].iloc[1:].values
    
    return treated_train_df

    

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

start_time = time.time()

stationary_train_df = stationarity_treatment(train_df)
# Features and target
X = stationary_train_df.drop(columns=['target'])
y = stationary_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


end_time = time.time()
print(f'Time elapsed: {end_time - start_time} seconds')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Time elapsed: 4.406996726989746 seconds
--------------------------------------
Validation Accuracy: 0.5310962854073613
Validation F1 Macro Score: 0.3470552405389616


In [41]:
stationary_test_df = stationarity_treatment(test_df)

# Ensure the test data is preprocessed in the same way as the training data
X_test = stationary_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': [0, *test_predictions]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



Predictions saved to submission.csv


In [42]:
# Evaluate the X_test predictions using the targets_for_test_df


test_predictions = test_predictions[:len(targets_for_test_df)]
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57623
Test F1 Macro Score: 0.36575


In [43]:
del train_df
del test_df


## Logistic Regression on SVDd data

In [44]:
svd_train_df = pd.read_csv('data/svd_train.csv')
svd_test_df = pd.read_csv('data/svd_test.csv')


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

start_time = time.time()

svd_train_df = svd_train_df.dropna()
# Features and target
X = svd_train_df.drop(columns=['target'])
y = svd_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


end_time = time.time()
print(f'Time elapsed: {end_time - start_time} seconds')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Time elapsed: 5.158998966217041 seconds
--------------------------------------
Validation Accuracy: 0.5288366599740167
Validation F1 Macro Score: 0.4463134219244439


In [46]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = svd_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56000
Test F1 Macro Score: 0.44273


In [47]:
del svd_train_df   
del svd_test_df

## Add Lagged Features

In [1]:

train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

NameError: name 'pd' is not defined

### All features lagged from 0 to 6

In [None]:
# Treat Dataframe

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# Treat Train
treated_train_df = train_df.copy()
# Add lagged features for every column
for column in treated_train_df.columns:
    if column != 'target' and column != 'timestamp':
        for lag in range(1, 6):
            treated_train_df[f'{column}_lag_{lag}'] = treated_train_df[column].shift(lag)

treated_train_df = treated_train_df

treated_train_df_close_lag = treated_train_df.filter(like='close_lag')
treated_train_df_close_lag.loc[:, train_df.columns] = train_df.values

treated_train_df = treated_train_df.dropna()
treated_train_df_close_lag = treated_train_df_close_lag.dropna()

# Treat Test
treated_test_df = test_df.copy()
# Add lagged features for every column
for column in treated_test_df.columns:
    if column != 'row_id' and column != 'timestamp':
        for lag in range(1, 6):
            treated_test_df[f'{column}_lag_{lag}'] = treated_test_df[column].shift(lag)

treated_test_df = treated_test_df

treated_test_df_close_lag = treated_test_df.filter(like='close_lag')
treated_test_df_close_lag.loc[:, test_df.columns] = test_df.values

treated_test_df = treated_test_df.dropna()
treated_test_df_close_lag = treated_test_df_close_lag.dropna()

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'All Variables Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'All variables Validation F1 Macro Score: {f1_macro}')

# Do the same for the test data

# Features and target
X = treated_train_df_close_lag.drop(columns=['target'])
y = treated_train_df_close_lag['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model_close_lag = LogisticRegression(max_iter=1000)
model_close_lag.fit(X_train, y_train)

# Predict on the validation set
y_pred = model_close_lag.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Close lag Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Close lag Validation F1 Macro Score: {f1_macro}')





NameError: name 'treated_train_df' is not defined

In [None]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'][:len(X_test)],
    'target': test_predictions[:len(X_test)]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

# Now Evaluate treated_test_df_close_lag 
X_test = treated_test_df_close_lag.drop(columns=['row_id'])
X_test = X_test[:len(treated_train_df_close_lag)]

# Make predictions on the test data
test_close_lag_predictions = model_close_lag.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'][:len(X_test)],
    'target': test_close_lag_predictions[:len(X_test)]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")





NameError: name 'treated_test_df' is not defined

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df


# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_close_lag_predictions)], test_close_lag_predictions)
print(f'Test close lag Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_close_lag_predictions)], test_close_lag_predictions, average='macro')
print(f'Test close lag F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58023
Test F1 Macro Score: 0.37640


NameError: name 'test_close_lag_predictions' is not defined

In [None]:
del train_df
del test_df
del treated_train_df
del treated_test_df
del treated_train_df_close_lag
del treated_test_df_close_lag


## Standardize Data

In [None]:

train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [None]:

treated_train_df = train_df.copy()


X = treated_train_df.drop(columns=['target', 'timestamp'])
y = treated_train_df['target']
# Standardize the data
scaler = StandardScaler()
treated_train_df.loc[:, X.columns] = scaler.fit_transform(X)



# Standardize the test data
treated_test_df = test_df.copy()
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])
treated_test_df.loc[:, X_test.columns] = scaler.transform(X_test)

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val= y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.531152824108102
Validation F1 Macro Score: 0.3468973284345402


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58005
Test F1 Macro Score: 0.36711


In [None]:
del train_df   
del test_df
del treated_train_df
del treated_test_df


## Feature Engineering


### Add RSI, Moving Average, Bolling band, and MACD

In [None]:
treated_train_df = pd.read_csv('data/treated_train.csv')
treated_test_df = pd.read_csv('data/treated_test.csv')

In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5294177394590764
Validation F1 Macro Score: 0.4496506149121075


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56209
Test F1 Macro Score: 0.44414


In [None]:
del treated_train_df
del treated_test_df


### Use ONLY new features

In [None]:
new_features_train_df = pd.read_csv('data/new_features_train.csv')
new_features_test_df = pd.read_csv('data/new_features_test.csv')

In [None]:
# Features and target
X = new_features_train_df.drop(columns=['target'])
y = new_features_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5311019251210582
Validation F1 Macro Score: 0.3966281159968356


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = new_features_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56253
Test F1 Macro Score: 0.44404


In [None]:
del new_features_train_df
del new_features_test_df

### Add Month, Day of Week and Hour of Day

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)


In [None]:
# Load your data set
treated_train_df = train_df.copy()
treated_train_df['timestamp'] = pd.to_datetime(treated_train_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_train_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_train_df['month'] = treated_train_df.index.month
treated_train_df['day_of_week'] = treated_train_df.index.dayofweek
treated_train_df['time_of_day'] = treated_train_df.index.hour + treated_train_df.index.minute / 60.0

# Do the same for test_df
treated_test_df = test_df.copy()
treated_test_df['timestamp'] = pd.to_datetime(treated_test_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_test_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_test_df['month'] = treated_test_df.index.month
treated_test_df['day_of_week'] = treated_test_df.index.dayofweek
treated_test_df['time_of_day'] = treated_test_df.index.hour + treated_test_df.index.minute / 60.0



In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5299820018469309
Validation F1 Macro Score: 0.38594774598718123


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57973
Test F1 Macro Score: 0.38558


In [None]:
del train_df
del test_df
del treated_train_df
del treated_test_df

## Hyperparameter Tuning 

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression



# Padronizar os dados
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(train_df.drop(columns=['target'])) 
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Define the model and parameter grid
model = LogisticRegression(max_iter=1000)  # Aumentar o número de iterações
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Set up GridSearchCV with k-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [None]:
# Predict on the validation set
y_pred = best_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5314072482614349
Validation F1 Macro Score: 0.3553044848365192


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58023
Test F1 Macro Score: 0.37640


In [None]:
del train_df
del test_df

## Recreate Paper

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.54 - F1 Score: 0.45
Model Accuracy for Month 2: 0.51 - F1 Score: 0.49
Model Accuracy for Month 3: 0.53 - F1 Score: 0.36
Model Accuracy for Month 4: 0.45 - F1 Score: 0.34
Model Accuracy for Month 5: 0.53 - F1 Score: 0.43
Model Accuracy for Month 6: 0.52 - F1 Score: 0.42
Model Accuracy for Month 7: 0.53 - F1 Score: 0.42
Model Accuracy for Month 8: 0.52 - F1 Score: 0.39
Model Accuracy for Month 9: 0.53 - F1 Score: 0.40
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.53 - F1 Score: 0.35
Model Accuracy for Month 12: 0.52 - F1 Score: 0.37
Average Coefficients: [-2.20798825e-02 -2.20937305e-02 -2.20749204e-02 -2.21149212e-02
  1.87567104e-07 -9.39045112e-08  1.62591629e-04 -9.35953503e-08
 -2.88208617e-07]
Validation Accuracy: 0.527284634665762
Validation F1 Macro Score: 0.42406341485567456




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.55948
Test F1 Macro Score: 0.48980


In [None]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")




Predictions saved to submission.csv


### Weekly

In [None]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the day of the week
data['day_of_week'] = data.index.dayofweek

# List to store models and coefficients
models_day = []
coefficients_day = []

# Iterate through each day of the week to train individual models
for day in range(7):
    daily_data = data[data['day_of_week'] == day]
    if len(daily_data) < 10:  # Skip days with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = daily_data[train_df.columns.drop(['timestamp', 'target'])]
    y = daily_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_day.append(model)
    coefficients_day.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Day {day}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_day, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'day_of_week'])
y = data['target']

# Split data into train and test 
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Day 0: 0.53 - F1 Score: 0.38
Model Accuracy for Day 1: 0.53 - F1 Score: 0.37
Model Accuracy for Day 2: 0.53 - F1 Score: 0.38
Model Accuracy for Day 3: 0.53 - F1 Score: 0.39
Model Accuracy for Day 4: 0.53 - F1 Score: 0.37
Model Accuracy for Day 5: 0.53 - F1 Score: 0.39
Model Accuracy for Day 6: 0.54 - F1 Score: 0.37
Average Coefficients: [-2.67803531e-02 -2.67929706e-02 -2.67741270e-02 -2.68146135e-02
  8.92224306e-08 -1.36138100e-07  1.86447191e-04 -1.65047824e-07
  7.87169978e-08]
Validation Accuracy: 0.528999641921562
Validation F1 Macro Score: 0.40930593103597596




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57720
Test F1 Macro Score: 0.42154


### Hourly

In [None]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the hour of the day
data['hour_of_day'] = data.index.hour

# List to store models and coefficients
models_hour = []
coefficients_hour = []

# Iterate through each hour of the day to train individual models
for hour in range(24):
    hourly_data = data[data['hour_of_day'] == hour]
    if len(hourly_data) < 10:  # Skip hours with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = hourly_data[train_df.columns.drop(['timestamp', 'target'])]
    y = hourly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_hour.append(model)
    coefficients_hour.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Hour {hour}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_hour, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'hour_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Hour 0: 0.52 - F1 Score: 0.37
Model Accuracy for Hour 1: 0.53 - F1 Score: 0.36
Model Accuracy for Hour 2: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 3: 0.53 - F1 Score: 0.40
Model Accuracy for Hour 4: 0.53 - F1 Score: 0.39
Model Accuracy for Hour 5: 0.53 - F1 Score: 0.39
Model Accuracy for Hour 6: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 7: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 8: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 9: 0.54 - F1 Score: 0.39
Model Accuracy for Hour 10: 0.53 - F1 Score: 0.45
Model Accuracy for Hour 11: 0.54 - F1 Score: 0.38
Model Accuracy for Hour 12: 0.52 - F1 Score: 0.37
Model Accuracy for Hour 13: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 14: 0.53 - F1 Score: 0.37
Model Accuracy for Hour 15: 0.53 - F1 Score: 0.39
Model Accuracy for Hour 16: 0.51 - F1 Score: 0.48
Model Accuracy for Hour 17: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 18: 0.53 - F1 Score: 0.38
Model Accuracy for Hour 19: 0.53 - F1 Score: 0.39
Model Accu



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57682
Test F1 Macro Score: 0.42455


### Minute by minute


In [None]:
# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 0: 0.54 - F1 Score: 0.47
Model Accuracy for minute 1: 0.54 - F1 Score: 0.38
Model Accuracy for minute 2: 0.44 - F1 Score: 0.44
Model Accuracy for minute 3: 0.52 - F1 Score: 0.40
Model Accuracy for minute 4: 0.53 - F1 Score: 0.39
Model Accuracy for minute 5: 0.54 - F1 Score: 0.41
Model Accuracy for minute 6: 0.54 - F1 Score: 0.53
Model Accuracy for minute 7: 0.45 - F1 Score: 0.39
Model Accuracy for minute 8: 0.46 - F1 Score: 0.38
Model Accuracy for minute 9: 0.49 - F1 Score: 0.47
Model Accuracy for minute 10: 0.55 - F1 Score: 0.47
Model Accuracy for minute 11: 0.53 - F1 Score: 0.49
Model Accuracy for minute 12: 0.52 - F1 Score: 0.41
Model Accuracy for minute 13: 0.49 - F1 Score: 0.34
Model Accuracy for minute 14: 0.48 - F1 Score: 0.48
Model Accuracy for minute 15: 0.54 - F1 Score: 0.49
Model Accuracy for minute 16: 0.56 - F1 Score: 0.37
Model Accuracy for minute 17: 0.45 - F1 Score: 0.35
Model Accuracy for minute 18: 0.52 - F1 Score: 0.36
Model Accuracy for min



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.57288
Test F1 Macro Score: 0.45064


In [None]:
del train_df
del test_df

## Ensamble paper + added features

### Add new features

In [None]:
treated_train_df = pd.read_csv('data/treated_train.csv')
treated_test_df = pd.read_csv('data/treated_test.csv')

#### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 12: 0.65 - F1 Score: 0.56
Average Coefficients: [ 0.76786527  1.64612283  0.93417381 -0.74266632 -2.14876822 -2.1954007
 -0.69863162  3.54668062  3.38198174 -0.30911377 -0.37091024 -0.46972609
 -0.49055077 -0.47053439  0.11840211  0.01933911 -0.01949126  0.07342921
 -0.04222488  0.09950946 -1.04553658]
Validation Accuracy: 0.6250944822373394
Validation F1 Macro Score: 0.4473241524392755




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)


# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.53195
Test F1 Macro Score: 0.50219


In [None]:
# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': treated_test_df['row_id'],
    'target': test_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")




Predictions saved to submission.csv


#### Minute by Minute

In [None]:
# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 1439: 0.65 - F1 Score: 0.56
Average Coefficients: [ 0.76786527  1.64612283  0.93417381 -0.74266632 -2.14876822 -2.1954007
 -0.69863162  3.54668062  3.38198174 -0.30911377 -0.37091024 -0.46972609
 -0.49055077 -0.47053439  0.11840211  0.01933911 -0.01949126  0.07342921
 -0.04222488  0.09950946 -1.04553658]
Validation Accuracy: 0.6250944822373394
Validation F1 Macro Score: 0.4473241524392755




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)


# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.53195
Test F1 Macro Score: 0.50219


In [None]:
del treated_train_df
del treated_test_df

### Only New Features

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)
new_features_train_df = pd.read_csv('data/new_features_train.csv')
new_features_test_df = pd.read_csv('data/new_features_test.csv')

treated_train_df = new_features_train_df
treated_train_df['timestamp'] = train_df['timestamp']
treated_test_df = new_features_test_df
treated_test_df['timestamp'] = test_df['timestamp']

del train_df
del test_df
del new_features_test_df
del new_features_train_df

#### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'month'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.55 - F1 Score: 0.37
Model Accuracy for Month 2: 0.53 - F1 Score: 0.43
Model Accuracy for Month 3: 0.54 - F1 Score: 0.45
Model Accuracy for Month 4: 0.55 - F1 Score: 0.36
Model Accuracy for Month 5: 0.52 - F1 Score: 0.44
Model Accuracy for Month 6: 0.52 - F1 Score: 0.47
Model Accuracy for Month 7: 0.54 - F1 Score: 0.42
Model Accuracy for Month 8: 0.52 - F1 Score: 0.44
Model Accuracy for Month 9: 0.53 - F1 Score: 0.44
Model Accuracy for Month 10: 0.53 - F1 Score: 0.45
Model Accuracy for Month 11: 0.52 - F1 Score: 0.42
Model Accuracy for Month 12: 0.53 - F1 Score: 0.52
Average Coefficients: [-0.11264195 -0.00542045 -0.00309648 -0.00599481 -0.00426288  0.04635641
 -0.01585908  0.00769711 -0.10185071  0.06415371  0.00035575 -0.00658856]
Validation Accuracy: 0.5232053856147396
Validation F1 Macro Score: 0.494442422311948




In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.55508
Test F1 Macro Score: 0.46790


#### Minute by minute

In [None]:
# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the minute of the day
data['minute_of_day'] = data.index.hour * 60 + data.index.minute

# List to store models and coefficients
models_minute = []
coefficients_minute = []

# Iterate through each minute of the day to train individual models
for minute in range(24*60):
    minutely_data = data[data['minute_of_day'] == minute]
    if len(minutely_data) < 10:  # Skip minutes with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = minutely_data[treated_train_df.columns.drop(['timestamp', 'target'])]
    y = minutely_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models_minute.append(model)
    coefficients_minute.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for minute {minute}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")


# Average the coefficients from all models
avg_coefficients = np.mean(coefficients_minute, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target', 'minute_of_day'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for minute 0: 0.52 - F1 Score: 0.44
Model Accuracy for minute 1: 0.54 - F1 Score: 0.47
Model Accuracy for minute 2: 0.53 - F1 Score: 0.36
Model Accuracy for minute 3: 0.43 - F1 Score: 0.42
Model Accuracy for minute 4: 0.55 - F1 Score: 0.55
Model Accuracy for minute 5: 0.52 - F1 Score: 0.50
Model Accuracy for minute 6: 0.48 - F1 Score: 0.46
Model Accuracy for minute 7: 0.51 - F1 Score: 0.43
Model Accuracy for minute 8: 0.52 - F1 Score: 0.47
Model Accuracy for minute 9: 0.48 - F1 Score: 0.36
Model Accuracy for minute 10: 0.51 - F1 Score: 0.37
Model Accuracy for minute 11: 0.55 - F1 Score: 0.41
Model Accuracy for minute 12: 0.54 - F1 Score: 0.42
Model Accuracy for minute 13: 0.47 - F1 Score: 0.45
Model Accuracy for minute 14: 0.53 - F1 Score: 0.40
Model Accuracy for minute 15: 0.50 - F1 Score: 0.50
Model Accuracy for minute 16: 0.54 - F1 Score: 0.48
Model Accuracy for minute 17: 0.49 - F1 Score: 0.49
Model Accuracy for minute 18: 0.53 - F1 Score: 0.48
Model Accuracy for min



In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id', 'timestamp'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56928
Test F1 Macro Score: 0.42746


In [None]:
del treated_train_df
del treated_test_df


### Add Month, Day of Week and Hour of Day

In [55]:
treated_train_df = pd.read_csv('data/treated_train.csv')
treated_test_df = pd.read_csv('data/treated_test.csv')

MemoryError: Unable to allocate 371. MiB for an array with shape (23, 2116749) and data type float64

In [None]:
# Load your data set
treated_train_df['timestamp'] = pd.to_datetime(treated_train_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_train_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_train_df['month'] = treated_train_df.index.month
treated_train_df['day_of_week'] = treated_train_df.index.dayofweek
treated_train_df['time_of_day'] = treated_train_df.index.hour + treated_train_df.index.minute / 60.0

# Do the same for test_df
treated_test_df['timestamp'] = pd.to_datetime(treated_test_df['timestamp'], unit='s')  # Ensure timestamp is a datetime type
treated_test_df.set_index('timestamp', inplace=True)

# Create columns for the month, day of the week, and time of day
treated_test_df['month'] = treated_test_df.index.month
treated_test_df['day_of_week'] = treated_test_df.index.dayofweek
treated_test_df['time_of_day'] = treated_test_df.index.hour + treated_test_df.index.minute / 60.0



#### Monthly

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


# List to store models and coefficients
models = []
coefficients = []

data = treated_train_df.copy()

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[treated_train_df.columns.drop(['target'])]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data[treated_train_df.columns.drop(['target'])]
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.54 - F1 Score: 0.39
Model Accuracy for Month 12: 0.52 - F1 Score: 0.50
Average Coefficients: [-0.01348372  0.02774667  0.02593455 -0.02146273 -0.47699987 -0.21132929
  0.25599516  0.35172388  0.26051835 -0.12290384 -0.01543034 -0.01382774
 -0.01669801 -0.01563002  0.02937047 -0.03344775  0.00094728 -0.04094232
  0.01621888 -0.00247344 -0.02531282 -0.00380967 -0.01002696 -0.00098899]
Validation Accuracy: 0.5244714775008857
Validation F1 Macro Score: 0.46561388353980715




In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5312719971654659
Validation F1 Macro Score: 0.4199515252872523


In [None]:
X_test.shape, treated_test_df.shape, targets_for_test_df.shape

((909529, 24), (909529, 25), (909616, 1))

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)



# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions)	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56273
Test F1 Macro Score: 0.44168


In [None]:
del treated_train_df
del treated_test_df


#### Minute by minute

In [None]:
train_df = pd.read_csv(train_datapath)
train_columns = train_df.columns
del train_df


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load your dataset
data = treated_train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    # features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data.drop(columns='target')
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.54 - F1 Score: 0.39
Model Accuracy for Month 12: 0.52 - F1 Score: 0.50
Average Coefficients: [-1.08110857e-02  2.88110554e-02  2.84550899e-02 -2.16252352e-02
 -4.85571153e-01 -2.07658943e-01  2.57830464e-01  3.58289760e-01
  2.56803085e-01 -1.23483976e-01 -1.62770523e-02 -1.51091958e-02
 -1.72865311e-02 -1.65263044e-02  2.98890350e-02 -3.41665935e-02
  3.54190904e-06 -3.93155637e-02  1.45242608e-02 -2.79197427e-03
 -2.75682810e-02 -2.07150169e-02]
Validation Accuracy: 0.5239777961497579
Validation F1 Macro Score: 0.47273670961604886




In [None]:
# Features and target
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Validation Accuracy: 0.5294177394590764
Validation F1 Macro Score: 0.4496506149121075


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

ValueError: Found input variables with inconsistent numbers of samples: [909616, 909529]

In [None]:
del treated_train_df
del treated_test_df

## Ensamble paper + month + day + hour 

In [7]:
train_df    = pd.read_csv(train_datapath)
test_df     = pd.read_csv(test_datapath)

In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month
data['day_of_week'] = data.index.dayofweek
data['hour_of_day'] = data.index.hour

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[data.columns.drop('target')]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data.drop(columns=['target'])
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')

Model Accuracy for Month 1: 0.55 - F1 Score: 0.38
Model Accuracy for Month 2: 0.52 - F1 Score: 0.43
Model Accuracy for Month 3: 0.53 - F1 Score: 0.37
Model Accuracy for Month 4: 0.55 - F1 Score: 0.40
Model Accuracy for Month 5: 0.52 - F1 Score: 0.46
Model Accuracy for Month 6: 0.52 - F1 Score: 0.43
Model Accuracy for Month 7: 0.53 - F1 Score: 0.42
Model Accuracy for Month 8: 0.52 - F1 Score: 0.39
Model Accuracy for Month 9: 0.53 - F1 Score: 0.40
Model Accuracy for Month 10: 0.53 - F1 Score: 0.36
Model Accuracy for Month 11: 0.53 - F1 Score: 0.35
Model Accuracy for Month 12: 0.52 - F1 Score: 0.39
Average Coefficients: [-1.07156651e-03 -1.07260306e-03 -1.07244135e-03 -1.08228842e-03
  2.47907463e-07 -2.40155123e-07  1.99280807e-04 -9.58989820e-08
 -2.83978137e-07 -1.35848302e-02 -3.63994434e-03  4.59313754e-04]




Validation Accuracy: 0.5246485177437289
Validation F1 Macro Score: 0.4339989360996589


In [9]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

# Create a column for the month
X_test['month'] = X_test.index.month
X_test['day_of_week'] = X_test.index.dayofweek
X_test['hour_of_day'] = X_test.index.hour

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.56702
Test F1 Macro Score: 0.45276


## Paper but only OHLCV
 

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load your dataset
data = train_df.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')  # Ensure timestamp is a datetime type
data.set_index('timestamp', inplace=True)

# Create a column for the month
data['month'] = data.index.month

# List to store models and coefficients
models = []
coefficients = []

# Iterate through each month to train individual models
for month in range(1, 13):
    monthly_data = data[data['month'] == month]
    if len(monthly_data) < 10:  # Skip months with insufficient data
        continue

    # Select relevant features and the target variable
    features = ['open', 'high', 'low', 'close', 'volume']
    X = monthly_data[features]
    y = monthly_data['target']  # Target: 1 for uptrend, 0 for downtrend

    # Split data into train and test sets
    X_train = X.iloc[:len(X)*4//5]
    y_train = y.iloc[:len(y)*4//5]
    X_val= X.iloc[len(X)*4//5:]
    y_val = y.iloc[len(y)*4//5:]

    # Initialize and train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Store model and coefficients
    models.append(model)
    coefficients.append(model.coef_[0])

    # Evaluate model performance
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Model Accuracy for Month {month}: {accuracy:.2f} - F1 Score: {f1_score(y_val, y_pred, average='macro'):.2f}")
    

# Average the coefficients from all models
avg_coefficients = np.mean(coefficients, axis=0)
print("Average Coefficients:", avg_coefficients)

# Final model can use averaged coefficients for predictions
final_model = LogisticRegression()
final_model.coef_ = np.array([avg_coefficients])
final_model.intercept_ = np.array([0])  # Adjust this as necessary

# Adjust the intercept to be the mean of the intercepts of the individual models
# final_model.intercept_ = np.mean([model.intercept_ for model in models], axis=0)

# Set classes_ attribute manually to match your binary classification (0 and 1)
final_model.classes_ = np.array([0, 1])

X = data[features]
y = data['target']

# Split data into train and test sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Predict on the validation set
y_pred = final_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')


Model Accuracy for Month 1: 0.55 - F1 Score: 0.36
Model Accuracy for Month 2: 0.48 - F1 Score: 0.32
Model Accuracy for Month 3: 0.53 - F1 Score: 0.35
Model Accuracy for Month 4: 0.55 - F1 Score: 0.36
Model Accuracy for Month 5: 0.53 - F1 Score: 0.43
Model Accuracy for Month 6: 0.52 - F1 Score: 0.43
Model Accuracy for Month 7: 0.53 - F1 Score: 0.41
Model Accuracy for Month 8: 0.52 - F1 Score: 0.43
Model Accuracy for Month 9: 0.53 - F1 Score: 0.40
Model Accuracy for Month 10: 0.53 - F1 Score: 0.37
Model Accuracy for Month 11: 0.48 - F1 Score: 0.32
Model Accuracy for Month 12: 0.52 - F1 Score: 0.34
Average Coefficients: [-2.05803470e-02 -2.05937735e-02 -2.05780030e-02 -2.06122967e-02
  1.20081905e-07]
Validation Accuracy: 0.5272657884321819
Validation F1 Macro Score: 0.4220523234932381




In [11]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data
X_test = test_df.drop(columns=['row_id'])
X_test['timestamp'] = pd.to_datetime(X_test['timestamp'], unit='s')  # Ensure timestamp is a datetime type
X_test.set_index('timestamp', inplace=True)

features = ['open', 'high', 'low', 'close', 'volume']
X_test = X_test[features]

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')



Test Accuracy: 0.55724
Test F1 Macro Score: 0.48794


In [12]:
del train_df
del test_df