# Extend the data from the one_store setup to the one_warehouse_stores setup

In [1]:
# NOTE(review): wildcard import - the names used by the cells below (e.g. torch, pd)
# presumably come from shared_imports; confirm and prefer explicit imports.
from shared_imports import *
# Repeat factor handed to extend_and_save below (applied along dim 1 of the tensor).
n_stores = 3

In [2]:
def extend_and_save(path, n_stores, output_path=None):
    """Tile a saved tensor along dim 1 and write the result to disk.

    Loads the tensor stored at ``path``, repeats it ``n_stores`` times along
    dim 1 (dims 0 and 2 are repeated once, i.e. left as they are) and saves
    the result.

    Args:
        path: File to read with ``torch.load``.
        n_stores: Number of repetitions along dim 1; must be at least 1.
        output_path: Destination file. Defaults to ``path``, in which case the
            input file is overwritten and calling the function again repeats
            the already-extended tensor once more. Pass a different path to
            keep the source file intact and make the call re-runnable.

    Raises:
        ValueError: If ``n_stores`` is smaller than 1. A value of 0 would
            otherwise replace the file with an empty tensor.
    """
    if n_stores < 1:
        raise ValueError(f"n_stores must be at least 1, got {n_stores}")

    data = torch.load(path)
    data_extended = data.repeat(1, n_stores, 1)
    torch.save(data_extended, path if output_path is None else output_path)

In [8]:
# NOTE: extend_and_save writes back to the same path, so running this cell again
# repeats the already-extended tensor another n_stores times along dim 1.
extend_and_save('data_files/favorita_one_warehouse/weekly_sales.pt', n_stores)

# Analysis of how many stores each item is sold in

In [18]:
# Load the CSV file
file_path = 'data_files/favorita_one_warehouse/tensors_row_info.csv'  # Update the file path
df = pd.read_csv(file_path)

# Number of distinct stores each item appears in, indexed by item_nbr.
store_counts = df.groupby('item_nbr')['store_nbr'].nunique()

# Largest store count of any item. Falls back to 0 for a CSV without rows, so the
# report below is simply empty instead of failing on a NaN maximum.
max_stores = int(store_counts.max()) if len(store_counts) else 0

# Histogram: store count -> number of items sold in exactly that many stores.
# value_counts() tallies everything in one pass (instead of rescanning the Series
# for every candidate count); reindex() puts back the counts that no item has, so
# every value from 1 to max_stores is still reported, with 0 where applicable.
store_distribution = (
    store_counts.value_counts()
    .reindex(range(1, max_stores + 1), fill_value=0)
    .to_dict()
)

# Total number of distinct (item, store) pairs, i.e. the histogram's weighted sum.
ct = sum(stores * num_items for stores, num_items in store_distribution.items())

# Printing the results
for stores, num_items in store_distribution.items():
    print(f"Number of items sold in exactly {stores} store(s): {num_items}")

Number of items sold in exactly 1 store(s): 10
Number of items sold in exactly 2 store(s): 25
Number of items sold in exactly 3 store(s): 46
Number of items sold in exactly 4 store(s): 78
Number of items sold in exactly 5 store(s): 117
Number of items sold in exactly 6 store(s): 161
Number of items sold in exactly 7 store(s): 174
Number of items sold in exactly 8 store(s): 190
Number of items sold in exactly 9 store(s): 234
Number of items sold in exactly 10 store(s): 221
Number of items sold in exactly 11 store(s): 270
Number of items sold in exactly 12 store(s): 292
Number of items sold in exactly 13 store(s): 315
Number of items sold in exactly 14 store(s): 265
Number of items sold in exactly 15 store(s): 226
Number of items sold in exactly 16 store(s): 149
Number of items sold in exactly 17 store(s): 122
Number of items sold in exactly 18 store(s): 82
Number of items sold in exactly 19 store(s): 41
Number of items sold in exactly 20 store(s): 16
Number of items sold in exactly 21 s

# Construct a dataset from items sold in at least 16 stores

In [32]:
import pandas as pd
import torch

# Row metadata; the cells below read its 'item_nbr', 'store_nbr' and 'family' columns.
df = pd.read_csv('data_files/favorita/tensors_row_info.csv')
# Sales tensor stored next to the CSV.
# NOTE(review): later cells index the tensor's first dimension with this DataFrame's
# row labels, so row i of the CSV presumably describes tensor[i] - confirm.
tensor = torch.load('data_files/favorita/weekly_sales.pt')

In [33]:
# Count the distinct stores each item appears in (result is indexed by item_nbr).
store_counts = df.groupby('item_nbr')['store_nbr'].nunique()
# item_nbr values of the items that appear in at least 16 distinct stores.
items_with_many_stores = store_counts[store_counts >= 16].index

In [35]:
# Number of stores kept per item; this is the middle dimension of new_tensor.
n_selected_stores = 16
# Length of each series, read from the loaded tensor instead of hard-coding 240.
n_periods = tensor.shape[-1]

filtered_df = df[df['item_nbr'].isin(items_with_many_stores)]

# The pivot is only needed for its column index: the sorted store numbers that occur
# among the qualifying items. Duplicate (item, store) rows are dropped first because
# DataFrame.pivot raises on duplicate index/column pairs.
pivot_df = (
    filtered_df
    .drop_duplicates(subset=['item_nbr', 'store_nbr'])
    .pivot(index='item_nbr', columns='store_nbr', values='family')
)
selected_stores = pivot_df.columns[:n_selected_stores]

# Keep only rows of the selected stores, ordered by (item, store) - the same order the
# CSV written afterwards uses. sort_values keeps the original row labels, which are
# used below as row numbers into `tensor` (df comes from read_csv with a default index).
filtered_df = (
    filtered_df[filtered_df['store_nbr'].isin(selected_stores)]
    .sort_values(by=['item_nbr', 'store_nbr'])
)
new_tensor_shape = [len(items_with_many_stores), n_selected_stores, n_periods]
new_tensor = torch.zeros(new_tensor_shape)

# Group once instead of re-filtering the whole frame for every item.
rows_by_item = filtered_df.groupby('item_nbr').groups

# Populate the new tensor
# NOTE(review): rows are packed to the left and unused slots stay zero, so when an item
# is missing from some of the selected stores, slot j does not refer to the same store
# for every item.
for i, item in enumerate(items_with_many_stores):
    row_labels = rows_by_item.get(item)
    if row_labels is None:
        continue  # item is not sold in any selected store; its slice stays all-zero
    item_data = tensor[row_labels[:n_selected_stores].tolist(), 0, :]
    new_tensor[i, :item_data.size(0), :] = item_data

In [36]:
# Output the new tensor
torch.save(new_tensor, 'data_files/favorita_one_warehouse/weekly_sales.pt')

# Re-create the CSV for the new structure
new_csv_data = filtered_df.sort_values(by=['item_nbr', 'store_nbr'])
# Cap the row count at the number of (item, store) slots in the tensor. The count is
# taken from the tensor's first two dimensions so it does not depend on the series
# length being exactly 240.
# NOTE(review): the CSV can hold fewer rows than there are slots when an item was
# zero-padded in the tensor, so CSV row k is not guaranteed to describe flattened
# tensor slot k.
n_slots = new_tensor.shape[0] * new_tensor.shape[1]
new_csv_data = new_csv_data.iloc[:n_slots]

# Save the new CSV
new_csv_data.to_csv('data_files/favorita_one_warehouse/tensors_row_info.csv', index=False)

In [87]:
import pandas as pd
import torch

def filter_and_construct_data(csv_file_path, tensor_file_path, output_csv_path, output_tensor_path,
                              n_stores=16, min_stores=None):
    """Build an [items, n_stores, series_length] dataset from per-row series.

    Reads the row-info CSV and the tensor, keeps the items that are sold in
    enough distinct stores, picks the first ``n_stores`` stores of each such
    item (in file order) and writes the matching CSV rows and tensor rows out.

    The CSV row labels are used as row numbers into the tensor, so row ``i``
    of the CSV must describe ``data_tensor[i]``.
    NOTE(review): assumes the tensor is shaped [rows, 1, series_length] - the
    final reshape fails with a clear error otherwise; confirm against the data.

    Args:
        csv_file_path: CSV with at least 'item_nbr' and 'store_nbr' columns.
        tensor_file_path: Tensor file readable by ``torch.load``.
        output_csv_path: Where the filtered CSV is written (without index).
        output_tensor_path: Where the new tensor is written.
        n_stores: Number of stores kept per item (default 16).
        min_stores: Minimum number of distinct stores an item must be sold in
            to be kept. Defaults to ``n_stores + 1``, i.e. strictly more than
            ``n_stores`` stores, which is the original behaviour.

    Raises:
        ValueError: If ``n_stores`` is below 1 or ``min_stores`` is below
            ``n_stores`` (an item could then not fill all store slots).
    """
    if n_stores < 1:
        raise ValueError(f"n_stores must be at least 1, got {n_stores}")
    if min_stores is None:
        min_stores = n_stores + 1
    if min_stores < n_stores:
        raise ValueError(
            f"min_stores ({min_stores}) must not be smaller than n_stores ({n_stores})"
        )

    # Load the CSV file
    df = pd.read_csv(csv_file_path)

    # Load the tensor
    data_tensor = torch.load(tensor_file_path)

    # Find items sold in enough different stores (index is sorted by item_nbr)
    store_counts = df.groupby('item_nbr')['store_nbr'].nunique()
    qualifying_items = store_counts[store_counts >= min_stores].index

    # One row per (item, store): a repeated pair would otherwise give an item more
    # than n_stores rows and shift every following item in the reshaped tensor.
    candidates = (
        df[df['item_nbr'].isin(qualifying_items)]
        .drop_duplicates(subset=['item_nbr', 'store_nbr'])
    )

    # First n_stores stores of every item in file order, then grouped by item.
    # The stable sort keeps the file order inside each item. This replaces the
    # per-item filtering and repeated pd.concat, which were quadratic.
    final_df = (
        candidates.groupby('item_nbr')
        .head(n_stores)
        .sort_values('item_nbr', kind='stable')
    )

    # Select corresponding tensor rows; an explicit long tensor also handles the
    # case where no item qualifies.
    row_numbers = torch.tensor(final_df.index.tolist(), dtype=torch.long)
    selected = data_tensor[row_numbers]

    # Every kept item contributes exactly n_stores rows, so the leading dimension is
    # known; spelling it out avoids an ambiguous -1 when nothing was selected.
    final_tensor = selected.reshape(len(qualifying_items), n_stores, data_tensor.shape[-1])

    # Save the new DataFrame and tensor
    final_df.to_csv(output_csv_path, index=False)
    torch.save(final_tensor, output_tensor_path)

    print("Filtered data has been saved successfully.")

# Usage: read the CSV/tensor pair under data_files/favorita and write the filtered
# pair under data_files/favorita_one_warehouse.
filter_and_construct_data(
    'data_files/favorita/tensors_row_info.csv',
    'data_files/favorita/weekly_sales.pt',
    'data_files/favorita_one_warehouse/tensors_row_info.csv',
    'data_files/favorita_one_warehouse/weekly_sales.pt',
)

Filtered data has been saved successfully.
