In [99]:
import os
import pandas as pd
from model_config import Path
import plotly.graph_objects as go 
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import pandas_ta as ta
from sklearn.preprocessing import LabelEncoder

- All features

In [100]:
# Scaler that scale_data() fits on the numerical features of each asset.
scaler = StandardScaler()
# timeframe and data_num are only forwarded to Path["dataset"](...) when the output file is named.
timeframe = 1
data_num = 1
# Folder that is scanned for the raw CSV files (one Asset_ID is derived from each file name).
folder_path = Path["raw"]

In [101]:
# Map the configured scaler to the short label that is later passed to
# Path["dataset"](...) when the output file is named.
#
# isinstance() replaces `type(scaler) == type(Cls())`, which had to build a
# throwaway instance of every scaler class just to compare types.
if isinstance(scaler, StandardScaler):
    # Spelling kept as-is: changing the label would change the value handed to
    # Path["dataset"] and therefore (presumably) the output file name.
    scaler_name = "Standart"
elif isinstance(scaler, MinMaxScaler):
    scaler_name = "MinMax"
elif isinstance(scaler, RobustScaler):
    scaler_name = "Robust"
elif isinstance(scaler, str) and scaler == "log":
    # scale_data() also accepts the string "log"; give it a label of its own.
    scaler_name = "Log"
else:
    # Without this branch scaler_name stayed undefined on a fresh kernel, or
    # silently kept the label of a previously configured scaler on a re-run.
    raise ValueError(f"Unsupported scaler: {scaler!r}")

In [102]:
# Columns that scale_data() transforms (per asset)
numerical_features = [
    # price / volume columns
    "Close",
    "Open",
    "High",
    "Low",
    "Average",
    "Change",
    "Volume",
    "Volume Change",
    # technical-indicator columns
    "EMA",
    "SMA",
    "MACD",
    "BB_upper",
    "BB_middle",
    "BB_lower",
]

In [103]:
# os.listdir() returns entries in arbitrary order; sorting makes the load
# order (and everything derived from row order) reproducible between runs.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
dataframes = []

In [None]:
# Load every raw CSV into its own DataFrame and stack them into combined_df.
# The list is rebuilt here on every execution so that re-running this cell
# does not append the same files a second time.
dataframes = []
for csv_file in csv_files:
    # Construct full file path
    file_path = os.path.join(folder_path, csv_file)

    # Read the file and discard its first data row.
    # NOTE(review): presumably that row is a secondary header or placeholder - confirm.
    df = pd.read_csv(file_path).iloc[1:].reset_index(drop=True)

    # Asset_ID is the part of the file name before the first underscore
    # (the extension is stripped first).
    asset_id = os.path.splitext(csv_file)[0].split("_")[0]

    # Tag every row with the asset it belongs to
    df['Asset_ID'] = asset_id

    dataframes.append(df)

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder holds no CSV files.
if not dataframes:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
combined_df.head()

In [None]:
# Handle missing values: forward-fill first, then back-fill whatever is still
# missing at the top of the frame.
# DataFrame.ffill()/bfill() replace fillna(method=...), which is deprecated in
# recent pandas releases.
# NOTE(review): all assets are still stacked in one frame at this point, so a
# gap on the first/last rows of one file can be filled from the neighbouring
# file - confirm that this is acceptable.
combined_df.ffill(inplace=True)
combined_df.bfill(inplace=True)

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the combined frame.
combined_df.info()

In [107]:
# Parse 'Date' into datetimes, order the rows by asset and then by time,
# and renumber the index from zero.
combined_df = (
    combined_df
    .assign(Date=pd.to_datetime(combined_df['Date']))
    .sort_values(by=['Asset_ID', 'Date'], ignore_index=True)
)

In [108]:
# NOTE(review): from here until the next cell's .apply(...), combined_df is a
# GroupBy object rather than a DataFrame, so this cell cannot be re-run on its
# own (a GroupBy has no .groupby) and DataFrame methods are unavailable on it.
combined_df = combined_df.groupby('Asset_ID', group_keys=False)

In [None]:
def add_technical_indicators(group):
    """Return a copy of one asset's rows with indicator columns added.

    Intended to be used as the function passed to ``groupby(...).apply``.
    All indicators are computed by pandas_ta from the ``Close`` column:

    * ``EMA``, ``SMA``, ``RSI`` with ``length=14``
    * ``MACD`` - the ``MACD_12_26_9`` column of ``ta.macd``
    * ``BB_upper`` / ``BB_middle`` / ``BB_lower`` - the ``BBU_20_2.0`` /
      ``BBM_20_2.0`` / ``BBL_20_2.0`` columns of ``ta.bbands(length=20)``

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single asset; must contain a ``Close`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the seven indicator columns appended. ``group``
        itself is left untouched, so callers must use the return value.
    """
    # groupby.apply does not support functions that mutate the object they are
    # handed, so all new columns are written to a copy.
    enriched = group.copy()
    close = enriched['Close']

    enriched['EMA'] = ta.ema(close, length=14)
    enriched['SMA'] = ta.sma(close, length=14)
    enriched['RSI'] = ta.rsi(close, length=14)

    # NOTE(review): presumably pandas_ta yields no result for series shorter
    # than the indicator window, which would make the lookups below fail -
    # verify for assets with very few rows.
    enriched['MACD'] = ta.macd(close)['MACD_12_26_9']

    bbands = ta.bbands(close, length=20)
    enriched['BB_upper'] = bbands['BBU_20_2.0']
    enriched['BB_middle'] = bbands['BBM_20_2.0']
    enriched['BB_lower'] = bbands['BBL_20_2.0']
    return enriched

# Apply the function to each group (asset). combined_df is expected to be the
# per-asset GroupBy object at this point; the result is a DataFrame again.
combined_df = combined_df.apply(add_technical_indicators)
# Back-fill the rows the indicators leave empty.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
combined_df.bfill(inplace=True)

In [110]:
# Calendar components taken from the 'Date' column
date_parts = combined_df['Date'].dt
combined_df['Hour'] = date_parts.hour
combined_df['Day'] = date_parts.dayofweek  # 0 = Monday
combined_df['Month'] = date_parts.month

# Cyclical encoding: each component becomes a sine/cosine pair over the
# divisor used for it (24 for Hour, 7 for Day, 12 for Month).
for part, period in (('Hour', 24), ('Day', 7), ('Month', 12)):
    angle = 2 * np.pi * combined_df[part] / period
    combined_df[f'{part}_sin'] = np.sin(angle)
    combined_df[f'{part}_cos'] = np.cos(angle)

In [111]:
# Give every Asset_ID an integer code; the fitted encoder is kept in
# label_encoder.
label_encoder = LabelEncoder()
label_encoder.fit(combined_df['Asset_ID'])
combined_df['Asset_ID_encoded'] = label_encoder.transform(combined_df['Asset_ID'])

In [112]:
# Keep a copy of 'Close' under a name that is not in numerical_features.
combined_df = combined_df.assign(Close_orig=combined_df["Close"])

In [113]:
def scale_data(df, scaler, features=None):
    """Scale the numerical feature columns separately for every asset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Asset_ID`` column and every column in ``features``.
    scaler : object or str
        Either the string ``"log"`` - each feature is replaced by
        ``log(value + 1e-6)`` - or an object with a ``fit_transform`` method,
        which is re-fitted on each asset's rows.
    features : sequence of str, optional
        Columns to transform. Defaults to the module-level
        ``numerical_features`` list.

    Returns
    -------
    pandas.DataFrame
        The transformed rows with a fresh 0..n-1 index. ``df`` is not modified.
    """
    if features is None:
        features = numerical_features
    features = list(features)

    if isinstance(scaler, str) and scaler == "log":
        def transform(values):
            # The small offset avoids log(0).
            # NOTE(review): negative inputs still produce NaN - confirm that
            # every feature is non-negative before using "log".
            return np.log(values + 1e-6)
    else:
        def transform(values):
            # The same scaler object is re-fitted for each asset, so the fitted
            # parameters of earlier assets are not retained.
            return scaler.fit_transform(values)

    def scale_group(group):
        # groupby.apply does not support functions that mutate the object they
        # are handed, so the scaled values are written to a copy.
        scaled = group.copy()
        scaled[features] = transform(scaled[features])
        return scaled

    return (
        df.groupby('Asset_ID', group_keys=False)
        .apply(scale_group)
        .reset_index(drop=True)
    )

In [None]:
# Scale the numerical features of every asset with the configured scaler.
combined_df = scale_data(combined_df, scaler)

In [115]:
def replace_zeros_with_mean(df, columns):
    """Replace exact zeros in ``columns`` with values taken from neighbouring rows.

    Despite the name, no mean is computed. For each column, zeros are turned
    into missing values, filled by linear interpolation, and any values still
    missing are back-filled and then forward-filled. A column that consists
    only of zeros ends up entirely NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``columns``.
    columns : iterable of str
        Columns to clean.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``. The input is left untouched, so callers must
        use the return value.
    """
    cleaned = df.copy()
    for column in columns:
        # Results are assigned back explicitly: `df[column].method(inplace=True)`
        # acts on a temporary under pandas Copy-on-Write and would change nothing.
        # .bfill()/.ffill() replace the deprecated fillna(method=...) spelling.
        cleaned[column] = (
            cleaned[column]
            .replace(0, np.nan)
            .interpolate(method='linear')
            .bfill()
            .ffill()
        )
    return cleaned

# Columns in which zeros are to be replaced
columns_to_replace = [
    "Close_orig",
    # price / volume columns
    "Close",
    "Open",
    "High",
    "Low",
    "Average",
    "Change",
    "Volume",
    "Volume Change",
    # technical-indicator columns
    "EMA",
    "SMA",
    "RSI",
    "MACD",
    "BB_upper",
    "BB_middle",
    "BB_lower",
]

In [None]:
# Replace zero values by interpolation, separately for each Asset_ID
combined_df = (
    combined_df
    .groupby('Asset_ID', group_keys=False)
    .apply(replace_zeros_with_mean, columns_to_replace)
    .reset_index(drop=True)
)

In [None]:
# Rows whose 'Close' is still exactly zero, and how many there are
zero_close_prices = combined_df.loc[combined_df['Close'].eq(0)]
print(f"Number of zero 'Close' prices after scaling: {len(zero_close_prices)}")

In [118]:
# Order rows by asset and time, then remove the two columns used for ordering.
combined_df = (
    combined_df
    .sort_values(by=['Asset_ID', 'Date'])
    .drop(columns=['Date', 'Asset_ID'])
)

1) scaler
2) columns cnt
3) tokens cnt
4) timeframe

In [119]:
# Number of columns in the final frame and number of distinct encoded assets
columns_cnt = len(combined_df.columns)
tokens_cnt = combined_df['Asset_ID_encoded'].nunique(dropna=False)

In [None]:
# Resolve the output location once, so the path that is reported is guaranteed
# to be the path that was written.
dataset_path = Path["dataset"](data_num, scaler_name, columns_cnt, tokens_cnt, timeframe)
combined_df.to_csv(dataset_path, index=False)
print(f"File saved to {dataset_path}")

In [122]:
# Read the dataset back from the same Path["dataset"](...) location it was saved to.
# Note that this rebinds `df`, the name used for the per-file frames in the loading loop.
df = pd.read_csv(Path["dataset"](data_num, scaler_name, columns_cnt, tokens_cnt, timeframe))

In [None]:
# Column names of the reloaded dataset
df.columns

In [None]:
# Display the reloaded dataset
df