# Activity Data DF:
Create a unified table of all arrived shipments and supplied items over time:

In [None]:
import pandas as pd
from datetime import datetime as dt
from functools import reduce

def process_two_columns_per_date_file(input_path) -> pd.DataFrame:
    """Reshape one wide activity export into a long table keyed by id and date.

    The file is read as CSV *without* a header row and is expected to be laid
    out as follows:

    * row 0   - a date string in ``DD/MM/YYYY`` format above each data column;
    * row 1   - the data-type (parameter) name of each data column;
    * rows 2+ - one record per row, with the record identifier in column 0.

    Args:
        input_path: Location of the CSV file (anything ``pandas.read_csv``
            accepts).

    Returns:
        A DataFrame with the columns ``id``, ``date`` (``YYYY-MM-DD`` string)
        and one column per data type found in row 1. Rows are the outer join
        of all data types on ``(id, date)``; combinations that have no value
        are filled with ``0``. Cell values are returned as read from the file
        (no numeric conversion is performed here).

    Raises:
        ValueError: If an identifier occurs more than once in column 0, or if
            the date header of a used column does not match ``DD/MM/YYYY``.
        TypeError: If the date header of a used column is empty.
    """
    # The input is a CSV file; every cell is kept as read (no header inference).
    df = pd.read_csv(input_path, header=None, low_memory=False)

    # Identifiers come from the record rows only and are kept as a Series, so
    # they align with each column's records by row label instead of relying on
    # a positional array built from the whole column having the right length.
    identifiers = df.iloc[2:, 0]
    if identifiers.dropna().duplicated().any():
        # Repeated ids would make the (id, date) merge below multiply rows.
        raise ValueError(
            f"Duplicate identifiers in the first column of {input_path!r}"
        )

    # data type -> {"id": identifiers, "<YYYY-MM-DD>": records, ...}
    data_dict = {}

    for column in df.columns[1:]:
        data_type = df[column].iloc[1]

        # Columns without a data type carry no data; skip them before touching
        # the date header, which may be blank for such columns as well.
        if pd.isna(data_type):
            continue

        # Normalise the header date from DD/MM/YYYY to YYYY-MM-DD.
        date = dt.strptime(df[column].iloc[0], '%d/%m/%Y')
        string_date = date.strftime("%Y-%m-%d")

        records = df[column].iloc[2:]
        # Every data type - not only the first two - starts with the id
        # column, which the melt step below requires.
        data_dict.setdefault(data_type, {"id": identifiers})[string_date] = records

    dfs = []
    for parameter, data in data_dict.items():
        data_frame = pd.DataFrame(data)
        date_cols = [name for name in data_frame.columns if name != 'id']
        # Wide (one column per date) -> long (one row per id/date); rows with
        # a missing id or a missing value are dropped.
        action_rows_df = pd.melt(
            data_frame,
            id_vars=['id'],
            value_vars=date_cols,
            var_name='date',
            value_name=parameter
        ).dropna()
        dfs.append(action_rows_df)

    if not dfs:
        # No usable data columns: return an empty table with the key columns
        # instead of letting reduce() fail on an empty sequence.
        return pd.DataFrame(columns=['id', 'date'])

    united_df = reduce(
        lambda df1, df2: pd.merge(df1, df2, on=['id', 'date'], how='outer'),
        dfs,
    ).fillna(0)
    return united_df

In [None]:
# get all the files in the directory '../data/raw/activity':
import os
import re

def get_files_with_pattern(path, pattern):
    """Return the names of the entries of *path* whose name matches *pattern*.

    ``pattern`` is a regular expression applied with ``re.search``, so it may
    match anywhere in the name. The bare entry names are returned (not joined
    with ``path``), in the order produced by ``os.listdir``.
    """
    matching = []
    for entry in os.listdir(path):
        if re.search(pattern, entry) is not None:
            matching.append(entry)
    return matching

# Directory holding the raw activity exports.
path = '../data/raw/activity/'

# Collect the file names containing 'a_' and 'b_' as two separate groups.
a_files, b_files = (get_files_with_pattern(path, tag) for tag in ('a_', 'b_'))

In [None]:
# Process every raw file and collect the resulting frames, keeping one list
# per file group ('a' files and 'b' files).
a_dfs = []
b_dfs = []
for dfs, files in [(a_dfs, a_files), (b_dfs, b_files)]:
    for file in files:
        print(f"Processing file {file}")
        # Read from `path` - the directory the names were listed from - rather
        # than a repeated literal, so listing and reading cannot drift apart.
        df = process_two_columns_per_date_file(os.path.join(path, file))
        dfs.append(df)

In [None]:
# Stack the per-file frames (all 'a' frames first, then the 'b' frames) into
# one long table and display it.
united_list = [*a_dfs, *b_dfs]
united_df = pd.concat(objs=united_list)
united_df

In [None]:
# Collapse rows that share (id, date) by summing their columns, order the
# result by date and turn the group keys back into ordinary columns.
# NOTE(review): commas are stripped from the values only in a later step, so
# values may still be text when summed here - confirm the sum is numeric.
grouped_df = (
    united_df
    .groupby(by=['id', 'date'])
    .sum()
    .sort_values(by='date')
    .reset_index()
)
grouped_df

In [None]:
# Normalise the column labels: lower case, underscores instead of spaces, and
# the (lower-cased) 'id' label becomes 'uuid'.
grouped_df.columns = [
    'uuid' if tidy == 'id' else tidy
    for tidy in (label.lower().replace(' ', '_') for label in grouped_df.columns)
]
# Strip every ',' from the values of the DF:
grouped_df = grouped_df.replace(to_replace=',', value='', regex=True)

In [None]:
# save the result to a csv file under '../data/processed/activity_data.csv':
import os

output_path = '../data/processed/activity_data.csv'
# to_csv does not create missing directories, so make sure the target
# directory exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
grouped_df.to_csv(output_path, index=False)