In [67]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import torch

In [74]:
def preprocessing(
    df: pd.DataFrame,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Turn a bar DataFrame into scaled train/test tensors.

    The label for each row is the change in ``vwap`` relative to the
    previous row. The first row has no predecessor and is discarded.

    Args:
        df: Frame containing the columns ``symbol``, ``timestamp``, ``open``,
            ``high``, ``low``, ``close``, ``volume``, ``trade_count`` and
            ``vwap``. The caller's frame is not modified.

    Returns:
        ``(train_X, test_X, train_Y, test_Y)`` as float32 tensors. The
        feature tensors are scaled with a ``RobustScaler`` fitted on the
        training split only.

    Raises:
        KeyError: If ``symbol``/``timestamp`` or any feature column is
            missing from ``df``.
    """
    # Drop irrelevant values (returns a new frame, so the input is untouched)
    df = df.drop(columns=['symbol', 'timestamp'])

    # Check for NaN values; this is only a warning, processing continues.
    if df.isnull().values.any():
        print("Error in dataframe, missing value")

    # Label: difference between consecutive vwap values. ``diff`` is the
    # vectorized form of vwap[i] - vwap[i-1]; row 0 becomes NaN.
    df['results'] = df['vwap'].diff()

    # Remove the first row (it has no valid label) and renumber from zero.
    # ``drop=True`` avoids adding the old index as an extra column.
    df = df.iloc[1:].reset_index(drop=True)

    features = df[['open', 'high', 'low', 'close', 'volume', 'trade_count', 'vwap']]
    labels = df['results']

    # Split the data into training and testing sets.
    # NOTE(review): train_test_split shuffles rows by default; if the rows
    # are time-ordered the split is not chronological -- confirm intended.
    train_X, test_X, train_Y, test_Y = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    # Fit the scaler on the training set only, so test statistics do not
    # leak into the scaling parameters.
    scaler = RobustScaler().fit(train_X)

    # Apply the scaling to the sets
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    # Convert the scaled data into float32 tensors
    train_X = torch.tensor(train_X_scaled).float()
    test_X = torch.tensor(test_X_scaled).float()
    train_Y = torch.tensor(train_Y.values).float()
    test_Y = torch.tensor(test_Y.values).float()

    return train_X, test_X, train_Y, test_Y