In [19]:
import pandas as pd
import numpy as np
import tqdm
from collections import defaultdict
from pathlib import Path


from backend.ml_pipeline.preprocess_for_inference import calculate_features_for_inference 
from backend.ml_pipeline.configs.feature_params import one_day_params

import os

# Every relative path used in this notebook ('./data/...', './backend/...') is
# resolved against this directory. Set the PROJECT_ROOT environment variable to
# run on another machine; the fallback is the original hard-coded location.
PROJECT_ROOT = os.environ.get('PROJECT_ROOT', '/home/evgenii/work/itmo/coming_soon_ai_product_hack/')
os.chdir(PROJECT_ROOT)

In [2]:
# Read the four CSV inputs; the order of the paths matches the order of the
# names on the left-hand side.
# NOTE(review): `sku` is rebound inside the per-item loop of the
# dataset-building cell, so the frame read from item_064.csv is only
# available until that cell runs.
shop_sales, shop_sales_dates, clusters, sku = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/shop_sales.csv',
        './data/shop_sales_dates.csv',
        './data/clusters.csv',
        './data/item_064.csv',
    )
)

In [3]:
# Attach the columns of shop_sales_dates to each sales row via `date_id`.
# The left join keeps every row of shop_sales, matched or not.
data = pd.merge(shop_sales, shop_sales_dates, how='left', on='date_id')

In [16]:
def split_dataset(data: pd.DataFrame, train_rate = 0.7, val_rate = 0.2, test_rate = 0.1):
    """Cut ``data`` into three consecutive row ranges: train, validation, test.

    Rows are taken in their existing order; nothing is shuffled.

    Args:
        data: Frame to split.
        train_rate: Fraction of the rows that goes to the leading (train) part.
        val_rate: Fraction of the rows that goes to the middle (validation) part.
        test_rate: Accepted but never read by the body; the test part is
            whatever remains after the train and validation rows.

    Returns:
        Tuple ``(train_data, val_data, test_data)``.
    """
    n_rows = len(data)
    # Row counts are truncated toward zero, so any rounding leftovers end up
    # in the final (test) slice.
    train_end = int(n_rows * train_rate)
    val_end = train_end + int(n_rows * val_rate)

    return data[:train_end], data[train_end:val_end], data[val_end:]

def calculate_features(data: pd.DataFrame, granularity: str):
    match granularity:
        case 'one_day':
            granularity_params = one_day_params
            shift = 1
        case 'seven_days':
            pass
        case 'thirty_days':
            pass
    
    data_with_features = calculate_features_for_inference(data=data, params=granularity_params)
    data_with_features = data_with_features.copy()
    data_with_features.loc[:, 'Y'] = data_with_features['cnt'].shift(shift)
    data_with_features = data_with_features.drop(columns=['date'])
    data_with_features = data_with_features.dropna()


    return data_with_features

In [12]:
# cluster_id -> {'train' | 'val' | 'test' -> list of per-item frames}
cluster_datasets = defaultdict(lambda: defaultdict(list))

unique_clusters = np.sort(clusters['cluster'].unique())
for cluster_id in unique_clusters:
    item_ids_for_given_cluster = clusters[clusters['cluster'] == cluster_id]['item_id'].unique()
    for item_id in tqdm.tqdm(item_ids_for_given_cluster):
        # Take an explicit copy so the date conversion writes to an independent
        # frame rather than to a filtered slice of `data`. Plain column
        # assignment (instead of `.loc[:, 'date'] = ...`) replaces the column,
        # so it takes the datetime64 dtype returned by `pd.to_datetime`.
        sku = data[data['item_id'] == item_id].copy()
        sku['date'] = pd.to_datetime(sku['date'])
        sku_sorted = sku.sort_values(by='date')

        # Calculate features:
        sku_features_data = calculate_features(sku_sorted[['cnt', 'date']].copy(), 'one_day')

        # Split dataset:
        # Each item's rows are split separately, before being pooled per cluster.
        train_data, val_data, test_data = split_dataset(sku_features_data)

        cluster_datasets[cluster_id]['train'].append(train_data)
        cluster_datasets[cluster_id]['val'].append(val_data)
        cluster_datasets[cluster_id]['test'].append(test_data)

100%|██████████| 1/1 [00:00<00:00,  8.87it/s]
100%|██████████| 21/21 [00:00<00:00, 32.80it/s]
100%|██████████| 14/14 [00:00<00:00, 32.58it/s]
100%|██████████| 1/1 [00:00<00:00, 27.00it/s]
100%|██████████| 2/2 [00:00<00:00, 31.75it/s]
100%|██████████| 6/6 [00:00<00:00, 28.21it/s]


In [13]:
PATH_TO_SAVE = Path("./backend/ml_pipeline/datasets")
# `to_csv` does not create missing directories, so make sure the target exists.
PATH_TO_SAVE.mkdir(parents=True, exist_ok=True)

# The loop variable is deliberately not named `data`: that name holds the
# merged sales frame read by the dataset-building cell, and rebinding it here
# would break that cell when it is re-run.
for cluster_num, cluster_splits in cluster_datasets.items():
    # Concatenate DataFrames for 'train', 'val', and 'test'
    train_df = pd.concat(cluster_splits['train'], ignore_index=True)
    val_df = pd.concat(cluster_splits['val'], ignore_index=True)
    test_df = pd.concat(cluster_splits['test'], ignore_index=True)

    train_df.to_csv(PATH_TO_SAVE / f'cluster_{cluster_num}_train.csv', index=False)
    val_df.to_csv(PATH_TO_SAVE / f'cluster_{cluster_num}_val.csv', index=False)
    test_df.to_csv(PATH_TO_SAVE / f'cluster_{cluster_num}_test.csv', index=False)