In [1]:
import time

# Data preprocessing
import pandas as pd
import numpy as np

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
from utils import train_val_split
from utils import accuracy_f1_score
from utils import train_datapath, test_datapath

In [3]:
targets_for_test_df = pd.read_csv('data/targets_for_test.csv', index_col=0)['close']


def evaluate_model_performance(model, X_val_seq, y_val_seq):
    # Predict probabilities
    y_pred_prob = model.predict(X_val_seq)

    # Convert probabilities to binary predictions
    y_pred = (y_pred_prob > 0.5).astype(int)
    
    y_val = y_val_seq.copy()
    filled_y_pred = y_pred.copy()
    if len(y_val_seq) == len(y_pred):       # If the lengths are the same, do nothing
        print('Lengths are the same')
        pass
    elif len(y_val_seq) > len(y_pred):      # If the target is longer than the prediction
        print('Target is longer than prediction')
        difference = len(y_val_seq) - len(y_pred)
        filled_y_pred = np.concatenate([np.zeros(difference), filled_y_pred])
    else:                                   # If the prediction is longer than the target 
        print('Prediction is longer than target')
        y_val = np.concatenate([np.zeros(1), y_val])
        
        
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, filled_y_pred)
    print(f'Validation Accuracy: {accuracy:.5f}')

    # Calculate F1 macro score
    f1_macro = f1_score(y_val, filled_y_pred, average='macro')
    print(f'Validation F1 Macro Score: {f1_macro:.5f}')

    return y_pred, y_pred_prob


def save_submission(test_df, filled_test_predictions, filename='submission.csv'):
    filename = 'submissions/' + filename
    # Create a new DataFrame for the submission
    submission_df = pd.DataFrame({
        'row_id': test_df['row_id'],
        'target': [0] + filled_test_predictions
    })

    # Save the submission file
    submission_df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")





In [4]:
test_df = pd.read_csv(test_datapath, usecols=['timestamp', 'row_id'], index_col=['row_id']) 
seasonal_submission = pd.read_csv('submission/mean_seasonal_predictions.csv', index_col=0)
svd_submission = pd.read_csv('submission/svd_dt_submission.csv', index_col=0)
svd_submission['timestamp']= test_df['timestamp']


In [5]:
seasonal_submission

Unnamed: 0,0
0,1
1,1
2,0
3,0
4,0
...,...
909611,0
909612,0
909613,1
909614,1


In [6]:
import numpy as np
import tqdm
predictions = []
for i in tqdm.tqdm(range(len(test_df))):
    svd_pred = np.random.choice([0, 1])
    seasonal_pred = np.random.choice([0, 1])
    if i in svd_submission.index:
        svd_pred = svd_submission.loc[i]['target']
    if i in seasonal_submission.index:
        seasonal_pred = seasonal_submission.loc[i][0]

    prediction = np.random.choice([svd_pred, seasonal_pred])
    predictions.append(prediction)

accuracy_f1_score(targets_for_test_df, predictions)

  seasonal_pred = seasonal_submission.loc[i][0]
100%|██████████| 909617/909617 [02:38<00:00, 5735.42it/s]


Target is longer than prediction
Validation Accuracy: 0.49503
Validation F1 Macro Score: 0.49071


(0.49502812722277617, 0.4907147303991576)

In [7]:
pd.DataFrame(predictions).to_csv('submission/ensemble_submission.csv', index=False)