In [1]:
# --- Standard library imports ---
from os import listdir                    # To list files in a directory
from os.path import isfile, join          # To check if path is a file and to join paths
import os                                 # General operating system interfaces (path handling, environment, etc.)
import datetime                           # Date and time handling
from datetime import date, datetime       # Specific date and datetime classes for easier usage
import calendar                           # To work with calendar dates, weeks, and months
from calendar import month_abbr, month_name  # Provides month abbreviations (e.g., Jan) and full names (e.g., January)
import re                                 # Regular expressions for text pattern matching
from dateutil.relativedelta import relativedelta  # Advanced date manipulations (e.g., add/subtract months)

# --- Third-party library imports ---
import pandas as pd                       # Data manipulation and analysis (DataFrames, Excel, CSV handling)
import matplotlib.pyplot as plt           # Plotting library for charts and visualizations
from openpyxl import load_workbook        # Excel file reader/writer (used with pandas for multiple sheets)
from google.colab import drive            # For mounting Google Drive when running in Google Colab


In [2]:
# Mount Google Drive to access files
drive.mount('/content/drive')
# Get the current working directory (useful for debugging file paths)
os.getcwd()

# Disable pandas "chained assignment" warning messages
# these occur when modifying a DataFrame slice
# These warnings may flood the output, consume RAM, and even cause your code to crash.
pd.options.mode.chained_assignment = None


Mounted at /content/drive


# Getting all the sales files

In [3]:
"""
Our sales data is organized in a main folder called 'Sales'.

Inside the 'Sales' folder, there are three subfolders:
1. A folder containing the full dataset with all columns.
2. A folder containing the summary dataset with fewer columns.
3. A folder containing Japan-specific data.

Inside each of these subfolders, there are two more folders,
which contain the actual sales files.
"""



# Define the main directory that contains all sales data
directory = r'/content/drive/My Drive/Sales Data/Sales'

# Get a list of sub-folders inside the main directory
main_folder = os.listdir(directory)

# Loop through each item in the main folder
for folder in main_folder:
 # Construct the full path for the current folder
  folder_path = os.path.join(directory, folder)
  # Check if the item is actually a folder
  if os.path.isdir(folder_path):

       # List all files/folders inside this subfolder
       files = os.listdir(folder_path)
       print(f"Files in {folder}:")

       # Loop through each item inside the subfolder
       for file in files:

        # Build the full path for the item inside the subfolder
        small_folder_path = os.path.join(folder_path, file)

        # List files inside the nested folder
        files = os.listdir(small_folder_path)
        print(f"Folder in {small_folder_path}:")
        print(f"Files in {files}:")


Files in JP:
Folder in /content/drive/My Drive/Sales Data/Sales/JP/All Info:
Files in ['October 2024', 'November 2024', 'December 2024', 'January 2025', 'March 2025', 'July 2024', 'June 2024', 'August 2024', 'February 2025', 'September 2024', 'May 2025', 'April 2025', 'June 2025', 'July 2025']:
Folder in /content/drive/My Drive/Sales Data/Sales/JP/Sales:
Files in ['August 2024', 'Feb 2025', 'Nov 2024', 'December 2024', 'July 2024', 'Jan 2025', 'June 2024', 'March 2025', 'Sep 2024', 'Oct 2024', 'May 2025', 'April 2025', 'June 2025', 'July 2025']:
Files in Sales Full:
Folder in /content/drive/My Drive/Sales Data/Sales/Sales Full/US:
Files in ['May 2023.xlsx', 'Sep 2023.xlsx', 'March 2023.xlsx', 'June 2023.xlsx', 'July 2023.xlsx', 'Jan 2023.xlsx', 'Oct 2023.xlsx', 'Feb 2023.xlsx', 'Aug 2023.xlsx', 'Nov 2023.xlsx', 'Dec 2023.xlsx', 'April 2023.xlsx', 'Dec 2024.xlsx', 'Jan 2025.xlsx', 'Feb 2025.xlsx', 'April 2025.xlsx', 'Nov 2024.xlsx', 'Oct 2024.xlsx', 'Sep 2024.xlsx', 'Aug 2024.xlsx', 'Ju

# **Visually verify that all folders and their files are present**

In [4]:
# Define the root directory that contains all sales data
root_dir = r'/content/drive/My Drive/Sales Data/Sales'
# Dictionary to store results in the format:
# { "MainFolder > Subfolder" : [list of files] }
result_dict = {}

# Get a sorted list of top-level folders
main_folders = sorted(os.listdir(root_dir))

# Loop through each top-level folder
for main in main_folders:
    main_path = os.path.join(root_dir, main)
    # Ensure the item is a folder
    if os.path.isdir(main_path):
        print(f"\n📁 Top-level folder: {main}")

        # Get a sorted list of subfolders inside the current main folder
        subfolders = sorted(os.listdir(main_path))

        # Loop through each subfolder
        for sub in subfolders:
            sub_path = os.path.join(main_path, sub)
            if os.path.isdir(sub_path):
                # List all files inside this subfolder
                files = sorted(os.listdir(sub_path))

                # Print structure for readability
                print(f"  📂 Subfolder: {sub}")
                print(f"    📄 Files: {files}")

                # Save results to dictionary with "Main > Sub" as the key
                result_dict[(main + ' > '+ sub)] = files



📁 Top-level folder: JP
  📂 Subfolder: All Info
    📄 Files: ['April 2025', 'August 2024', 'December 2024', 'February 2025', 'January 2025', 'July 2024', 'July 2025', 'June 2024', 'June 2025', 'March 2025', 'May 2025', 'November 2024', 'October 2024', 'September 2024']
  📂 Subfolder: Sales
    📄 Files: ['April 2025', 'August 2024', 'December 2024', 'Feb 2025', 'Jan 2025', 'July 2024', 'July 2025', 'June 2024', 'June 2025', 'March 2025', 'May 2025', 'Nov 2024', 'Oct 2024', 'Sep 2024']

📁 Top-level folder: Sales Full
  📂 Subfolder: EU
    📄 Files: ['April 2024.xlsx', 'April 2025.xlsx', 'Aug 2024.xlsx', 'August 2025.xlsx', 'December 2024.xlsx', 'Feb 2024.xlsx', 'Feb 2025.xlsx', 'Jan 2024.xlsx', 'Jan 2025.xlsx', 'July 2024.xlsx', 'July 2025.xlsx', 'June 2024.xlsx', 'June 2025.xlsx', 'March 2024.xlsx', 'March 2025.xlsx', 'May 2024.xlsx', 'May 2025.xlsx', 'Nov 2024.xlsx', 'Oct 2024.xlsx', 'Sep 2024.xlsx']
  📂 Subfolder: US
    📄 Files: ['April 2023.xlsx', 'April 2024.xlsx', 'April 2025.xlsx'

In [5]:
# ---------------------------------------------------------
# Step 1: Build a mapping of month names and abbreviations
#         to their corresponding month numbers.
#   Example: "january" → 1, "jan" → 1, "february" → 2, etc.
# ---------------------------------------------------------
month_map = {name.lower(): i for i, name in enumerate(month_name) if name}
month_map.update({abbr.lower(): i for i, abbr in enumerate(month_abbr) if abbr})


# ---------------------------------------------------------
# Step 2: Function to extract (year, month) from a filename.
#         - Removes ".xlsx"
#         - Splits on spaces
#         - Searches tokens for a year (YYYY) and month name/abbr
# ---------------------------------------------------------
def extract_year_month(filename):
    # Remove extension and extra spaces
    name = filename.replace('.xlsx', '').strip()
    tokens = re.split(r'\s+', name)

    year = None
    month = None

    # Inspect each token
    for token in tokens:
        token_lower = token.lower()
        if token.isdigit() and len(token) == 4:    # Token looks like a year
            year = int(token)
        elif token_lower in month_map:             # Token matches a month name/abbr
            month = month_map[token_lower]

    # Defaults if missing
    if year is None:
        year = 0    # You might prefer datetime.now().year here
    if month is None:
        month = 0   # Could also default to 1 (January)

    return (year, month)


# ---------------------------------------------------------
# Step 3: Sort all files in result_dict by (year, month).
#         Uses extract_year_month() as sort key.
#         reverse=True → newest first.
# ---------------------------------------------------------
sorted_result_dict = {}

for folder, files in result_dict.items():
    sorted_files = sorted(
        files,
        key=lambda x: extract_year_month(x),
        reverse=True  # Sort descending (latest year/month first)
    )
    sorted_result_dict[folder] = sorted_files


# ---------------------------------------------------------
# Step 4: Print the sorted structure in a readable format
# ---------------------------------------------------------
for folder, files in sorted_result_dict.items():
    print(f"\n📁 {folder}")
    for f in files:
        print(f"  {f}")



📁 JP > All Info
  July 2025
  June 2025
  May 2025
  April 2025
  March 2025
  February 2025
  January 2025
  December 2024
  November 2024
  October 2024
  September 2024
  August 2024
  July 2024
  June 2024

📁 JP > Sales
  July 2025
  June 2025
  May 2025
  April 2025
  March 2025
  Feb 2025
  Jan 2025
  December 2024
  Nov 2024
  Oct 2024
  Sep 2024
  August 2024
  July 2024
  June 2024

📁 Sales Full > EU
  August 2025.xlsx
  July 2025.xlsx
  June 2025.xlsx
  May 2025.xlsx
  April 2025.xlsx
  March 2025.xlsx
  Feb 2025.xlsx
  Jan 2025.xlsx
  December 2024.xlsx
  Nov 2024.xlsx
  Oct 2024.xlsx
  Sep 2024.xlsx
  Aug 2024.xlsx
  July 2024.xlsx
  June 2024.xlsx
  May 2024.xlsx
  April 2024.xlsx
  March 2024.xlsx
  Feb 2024.xlsx
  Jan 2024.xlsx

📁 Sales Full > US
  August 2025.xlsx
  July 2025.xlsx
  June 2025.xlsx
  May 2025.xlsx
  April 2025.xlsx
  March 2025.xlsx
  Feb 2025.xlsx
  Jan 2025.xlsx
  Dec 2024.xlsx
  Nov 2024.xlsx
  Oct 2024.xlsx
  Sep 2024.xlsx
  Aug 2024.xlsx
  July 202

# Getting Units Sold, Gross Revenue, Net Profit, and Refund cost

### Getting the Sales Data

#### Sales Full

In [6]:
# Define the parent directory where "Sales" and its subfolders are stored
parent_folder = '/content/drive/My Drive/Sales Data'

# Switch working directory to the parent folder
os.chdir(parent_folder)

# Collect only subfolders (ignore files) within the parent folder
folders = [f for f in os.listdir(parent_folder) if os.path.isdir(os.path.join(parent_folder, f))]
folders


['Sales',
 'Product Prices',
 'Final Shape of Files',
 'Sales Analysis',
 'Price Adjustments',
 'Power BI',
 'Products Historical Prices',
 'Product Description',
 'Gaps',
 'MB',
 'Stuck Stocked']

In [7]:
"""
📂 Sales Full
-------------
This is one of the subfolders inside the main "Sales" folder.
It contains the **complete dataset with all columns** (raw/full version),
unlike the summary or Japan-specific folders which store filtered data.
However, it doesn't contain Japen data and the calculation operation over it is harder
"""


"""
Building Sales Full DataFrame
--------------------------------
We want to merge sales data from multiple Excel files stored in the
`Sales Full` folder into a single DataFrame (`df_sf`).

- The folder `Sales Full` contains two main subfolders: "US" and "EU".
- Each of these contains multiple Excel files with detailed sales records for each month.
- The script loops through both subfolders, reads all Excel files, and
  concatenates them into one unified DataFrame.

Result: `df_sf` will hold all sales records (US + EU) combined.
"""
# Initialize an empty DataFrame to hold merged results
list_sale = ["US", 'EU']

# Initialize an empty DataFrame to hold merged results
df_sf = pd.DataFrame()

# Loop through each region (subfolder inside "Sales Full")
for i in list_sale:
    # Define path for the current region
    root_path = r'/content/drive/My Drive/Sales Data/Sales/Sales Full'
    root_path  = os.path.join(root_path, i)

    # Change working directory to the current region folder
    os.chdir(root_path)

    # Collect all Excel files inside the region folder
    onlyfiles = [f for f in listdir(root_path) if isfile(join(root_path, f))]

    # Loop through each file and read data
    for f in onlyfiles:
        # Read current Excel file into a temporary DataFrame
        df_t = pd.read_excel(f)
        # Concatenate with the master DataFrame
        df_sf = pd.concat([df_t, df_sf])

# Reset index for better indexing
df_sf.reset_index(inplace = True, drop = True)

>> Now, we have **df_sf** that contains all full version sales records

In [8]:
"""
Saving the Merged Sales Data
-------------------------------
After merging all sales files into `df_sf`, we now save it as a CSV file.

- The output will be stored in the folder: "Final Shape of Files".
- File is named "Sales_full.csv".
- We drop the index column since it’s not needed in the output.
"""

# Define the target folder where the CSV will be saved
root_path = r'/content/drive/My Drive/Sales Data/Final Shape of Files'

# Change working directory to the target folder
os.chdir(root_path)

# Save DataFrame to CSV
df_sf.to_csv('Sales_full', index = False)

In [9]:
"""
Preparing Sales Data for Power BI (In case we needed to make a dashboard)
------------------------------------
Steps performed:
1. Replace all NaN values with 0 (to avoid calculation errors).
2. Create new calculated columns:
   - Gross Revenue = Organic + Sponsored Products + Sponsored Display
   - Units Sold    = Organic Units + PPC Units
   - FBA Storage Fee = Long-Term + Standard + Per-Unit Fulfillment
3. Export only the essential columns for Power BI analysis:
   Date, Marketplace, ASIN, Gross Revenue, Units Sold, FBA Storage Fee
"""

# Fill missing values with 0
df_sf.fillna(0, inplace = True)

# Create calculated metrics
df_sf['Gross Revenue'] =df_sf['SalesOrganic'] +  df_sf['SalesSponsoredProducts'] + df_sf['SalesSponsoredDisplay']
df_sf['Units Sold'] = df_sf['UnitsOrganic'] + df_sf['UnitsPPC']
df_sf['FBA storage fee'] = df_sf['FBALongTermStorageFee'] + df_sf['FBAStorageFee'] +  df_sf['FBAPerUnitFulfillmentFee']

# Select only the useful columns
df_sf[['Date', 'Marketplace', 'ASIN', 'Gross Revenue', 'Units Sold', 'FBA storage fee']].to_csv('Sales_full_power_BI.csv', index = False)

#### Sales Mini


In [10]:
"""
📂 Sales Mini
-------------
This is one of the subfolders inside the main "Sales" folder.
It contains the summary dataset with fewer columns.
⚠️ Note: It does not include Japan data, and because of the reduced size,
calculation operations are easier, faster, and more accurate compared to 'Sales Full'.
However, it doesn't contain all the needed columns for our many reports
"""

"""
📂 Load Sales Mini Dataset
---------------------------
Steps performed:
1. Define the sales regions of interest ("US" and "EU").
2. Navigate into each regional subfolder inside "Sales Mini".
3. Read all Excel files inside each region.
4. Concatenate them into one master DataFrame: df_sm.
"""

# Define regions available in Sales Mini
list_sale = ["US", 'EU']

# Initialize empty DataFrame for the final combined dataset
df_sm = pd.DataFrame()

# Loop through each region folder
for i in list_sale:
    # Construct full path to the region folder
    root_path = r"/content/drive/My Drive/Sales Data/Sales/Sales Mini"
    root_path  = os.path.join(root_path, i)
    os.chdir(root_path)

    # Get all file names inside the region folder
    onlyfiles = [f for f in listdir(root_path) if isfile(join(root_path, f))]

    # Read and merge each file into the master DataFrame
    for f in onlyfiles:
        df_t = pd.read_excel(f)
        df_sm = pd.concat([df_t, df_sm])


# Reset index for better indexing
df_sm.reset_index(inplace = True, drop = True)

>> Now, we have **df_sm** that contains the summary data version sales records

In [11]:
"""
Saving the Merged Sales Data
-------------------------------
After merging all sales files into `df_sm`, we now save it as a CSV file.

- The output will be stored in the folder: "Final Shape of Files".
- File is named "Sales_full.csv".
- We drop the index column since it’s not needed in the output.
"""

# Define the target folder where the CSV will be saved
root_path = r'/content/drive/My Drive/Sales Data/Final Shape of Files'

# Change working directory to the target folder
os.chdir(root_path)

# Fill missing values with 0
df_sm.fillna(0, inplace = True)

# Save a trimmed version for Power BI
df_sm[['Date', 'Marketplace', 'ASIN', 'NetProfit', 'RefundCost']].to_csv('Sales_Mini_power_BI.csv', index = False)

# Save the full version
df_sm.to_csv('Sales_mini', index = False)

#### Sales Japan

In [12]:
"""
📂 Japan
-------------
This is one of the subfolders inside the main "Sales" folder.
It contains the Japan sales data as we get them from Helium 10, not Sellerboard.
⚠️ Note: Any data from Helium 10 is just an estimation and is unreliable as it contains many problems.
"""

"""
📂 Japan Sales
-------------
This is one of the subfolders inside the main "Japan" folder.
It contains the summary sales data for Japan.
"""

"""
📂 Load Sales JP Dataset
---------------------------
Steps performed:
1. Define the path to the "Japan Sales" folder.
2. Loop through all subfolders (e.g., by month/year).
3. Inside each subfolder, read all Excel files.
4. Concatenate them into one master DataFrame: df_jp.
5. Reset the index and save the final dataset.
"""



# Initialize empty DataFrame
df_jp = pd.DataFrame()

# Path to Japan sales data
path = r"/content/drive/My Drive/Sales Data/Sales/JP/Sales"

# Collect subfolders
entries = os.listdir(path)
folders = [entry for entry in entries if os.path.isdir(os.path.join(path, entry))]

# Loop through subfolders
for folder in folders:
    root_path  = os.path.join(path, folder)
    os.chdir(root_path)

    # Collect Excel files names
    onlyfiles = [f for f in listdir(root_path) if isfile(join(root_path, f))]

    # Read each file and append to df_jp
    for f in onlyfiles:
        df_t = pd.read_excel(f)
        df_jp = pd.concat([df_t, df_jp])


# Reset index for better indexing
df_jp.reset_index(inplace = True, drop = True)

# Define the target folder where the CSV will be saved
root_path = r'/content/drive/My Drive/Sales Data/Final Shape of Files'

# Change working directory to the target folder
os.chdir(root_path)

# Save JP dataset
df_jp.to_csv('Sales_JP', index = False)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn

In [13]:
"""
📂 Japan All Info
-------------
This is one of the subfolders inside the main "Japan" folder.
It contains the full version of sales data for Japan, organized by year/month/day.
⚠️ Note: This dataset is more detailed compared to the summary sales data, however, the inaccuracy problems still the same
"""

"""
📂 Load All Info JP Dataset
---------------------------
Steps performed:
1. Define the path to the "Japan All Info" folder.
2. Loop through all subfolders (e.g., month-year).
3. For each Excel file inside, extract the day from the filename.
4. Add "Day", "Month", and "Year" columns to each DataFrame.
5. Concatenate into one master DataFrame: df_jp.
6. Save the final dataset as 'Sales_JP_all_information.csv'.
"""




# Initialize empty DataFrame
df_jp = pd.DataFrame()

# Path to Japan All Info data
path = r"/content/drive/My Drive/Sales Data/Sales/JP/All Info"

# Collect subfolders
entries = os.listdir(path)
folders = [entry for entry in entries if os.path.isdir(os.path.join(path, entry))]

# Loop through subfolders
for folder in folders:
    month = folder[:-5].strip()
    year = folder[-5:].strip()
    root_path  = os.path.join(path, folder)
    os.chdir(root_path)

    # Collect Excel files
    onlyfiles = [f for f in listdir(root_path) if isfile(join(root_path, f))]

    # Read each file and append to df_jp
    for f in onlyfiles:
        Day = f[:2] # Extract day from filename
        df_t = pd.read_excel(f)
        df_t['Day'] = Day
        df_t['month'] = month
        df_t['year'] = year
        df_jp = pd.concat([df_t, df_jp])

# Reset index for better indexing
df_jp.reset_index(inplace = True, drop = True)

# Define the target folder where the CSV will be saved
root_path = r'/content/drive/My Drive/Sales Data/Final Shape of Files'

# Change working directory to the target folder
os.chdir(root_path)

# Save JP All Info dataset
df_jp.to_csv('Sales_JP_all_information', index = False)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn

# Getting the product price

In [14]:
"""
📂 Product Prices
-----------------
This folder contains products' prices datasets.
We have one file for each country. The data comes from our Keepa API.
Goal: Load all Excel files, combine them, and create one clean master DataFrame.
"""
# Initialize empty DataFrame
df_cp = pd.DataFrame()

# Path to product prices
root_path= r'/content/drive/My Drive/Sales Data/Product Prices'
os.chdir(root_path)

# Collect Excel files
onlyfiles = [f for f in listdir(root_path) if isfile(join(root_path, f))]

# Loop through files
for i in onlyfiles:
    df_tt = pd.read_excel(i)
    df_cp = pd.concat([df_cp, df_tt])

# Drop unwanted columns if exist
df_cp.reset_index(inplace = True)
df_cp.drop(columns = {'index', 'Unnamed: 0'}, inplace = True)


# 🛠 Fix Japan Price Format
# In Japan, product prices are stored in a wronge format (12.99 instead of 1299)
# Multiply by 100 to align with other countries.
df_cp.loc[df_cp['Region'] == 'JP', 'Price'] = (
    df_cp.loc[df_cp['Region'] == 'JP', 'Price'] * 100
)


In [15]:
# Mapping dictionary: Region -> Marketplace
region_to_marketplace = {
    'US': 'Amazon.com',
    'CA': 'Amazon.ca',
    'GB': 'Amazon.co.uk',
    'DE': 'Amazon.de',
    'FR': 'Amazon.fr',
    'IT': 'Amazon.it',
    'ES': 'Amazon.es',
    'JP': 'Amazon.co.jp',
    'MX': 'Amazon.com.mx'
}


# Apply mapping
df_cp['Marketplace'] = df_cp['Region'].map(region_to_marketplace)

# Check for unmapped regions
unmapped = df_cp[df_cp['Marketplace'].isna()]['Region'].unique()
if len(unmapped) > 0:
    print("⚠️ Unmapped Regions found:", unmapped)


# Save final output
root_path = r'/content/drive/My Drive/Sales Data/Final Shape of Files'
os.chdir(root_path)
df_cp.to_csv('Current_price', index = False)


# List of All products update

> **This stage of aggregation uses a differential backup approach — we remove all products with no sales in the past six months and add any new products not currently in our list. These newly added products will then be included in the next run of the 'Getting Price' notebook to retrieve their prices**

> **The data pipeline for Japan is based on the Business Report from Amazon Seller Central, as we cannot rely on Helium10—it omits products that have no sales activity**

> **On the other hand, we can rely on SellerBoard to obtain an accurate list of active products**

> **We retrieve product names from the most recent product list, so some newly added products may initially appear without a name. In such cases, the names must be updated manually — but this is done only once for each new product. However, the absence of a product name does not affect or prevent the 'Getting Price' notebook from running and retrieving prices for those products**

In [16]:
def filter_df_by_previous_months(df, num_of_previous_months = 3):
    """
    Filters the input DataFrame to include only rows from the previous three months.

    Args:
        df_full (pd.DataFrame): The full dataset containing at least 'Month' and 'Year' columns.

    Returns:
        pd.DataFrame: Filtered DataFrame with rows from the previous three months.
    """
    dict_month_year = Get_Previous_Months_With_Year(num_of_previous_months)
    df_previous_months = pd.DataFrame()

    for month, year in dict_month_year.items():
        df_t = df[(df['Month'] == month) & (df['Year'] == year)]
        df_previous_months = pd.concat([df_previous_months, df_t], ignore_index=True)

    return df_previous_months




def Get_Previous_Months_With_Year(num_of_previous_months=3):
    """
    Returns a dictionary of previous months with their corresponding years.

    Parameters:
    -----------
    num_of_previous_months : int, optional (default=3)
        The number of months to go back from the current date.

    Returns:
    --------
    dict
        A dictionary where the keys are month names (e.g., 'April') and the values are their corresponding years (e.g., 2025).
        For example: {'June': 2025, 'May': 2025, 'April': 2025}
    """
    # Get the current date and time
    now = datetime.now()

    # Initialize an empty dictionary to store results
    months = {}

    # Loop through the specified number of previous months
    for i in range(1, num_of_previous_months + 1):
        # Calculate the past date by subtracting i months
        past_date = now - relativedelta(months=i)

        # Extract the full month name (e.g., "March")
        month_name = past_date.strftime("%B")

        # Extract the year of that month
        year = past_date.year

        # Add the month and year to the dictionary
        months[month_name] = year

    return months


In [17]:

# Define the root path and change the working directory
root_path = r"/content/drive/My Drive/Stock Products/JP"
os.chdir(root_path)

# Initialize the master DataFrame
df = pd.DataFrame()

# Optional: define country identifier if you're looping per country folder
country = "US"  # or dynamically passed as variable `i`

# Get a list of files in the current folder
onlyfiles = [f for f in listdir(root_path) if isfile(join(root_path, f)) and f.endswith('.xlsx')]

# Process each Excel file
for f in onlyfiles:
    try:
        # Read file
        df_t = pd.read_excel(f)

        # Add metadata
        df_t['Country'] = country
        parts = f.strip().split()

        if len(parts) >= 2:
            df_t['Month'] = parts[0]
            df_t['Year'] = parts[1][:4]
        else:
            df_t['Month'] = 'Unknown'
            df_t['Year'] = 'Unknown'

        # Clean column names
        df_t.columns = (
            df_t.columns
            .str.replace('-', '', regex=False)
            .str.replace('–', '', regex=False)
            .str.strip()
            .str.title()
        )

        # Append to master DataFrame
        df = pd.concat([df, df_t], ignore_index=True)
        df['Year'] = df['Year'].astype(int)

    except Exception as e:
        print(f"⚠️ Error processing {f}: {e}")



In [18]:
# Constants
COL_LIST = ['Date', 'Marketplace', 'Country', 'ASIN', 'SKU', 'Units Sold']
MKT_COUNTRY_MAP = {
    'Amazon.co.uk': "GB", 'Amazon.de': 'DE', 'Amazon.fr': 'FR',
    'Amazon.es': 'ES', 'Amazon.it': 'IT', 'Amazon.com': 'US',
    'Amazon.ca': 'CA', 'Amazon.com.mx': 'MX'
}
ROOT_PATH = '/content/drive/My Drive/Sales Data/Final Shape of Files'
PRODUCT_FILE = "List_Of_All_Products.xlsx"

# Step 1: Filter DataFrame by last 6 months and calculate Units Sold
df_sf['Date'] = pd.to_datetime(df_sf['Date'], format = "%d/%m/%Y")
df_sf['Year'] = df_sf['Date'].dt.year
df_sf['Month'] = df_sf['Date'].dt.month_name()

filter_df_by_previous_months(df_sf, num_of_previous_months=6)
df_sf['Units Sold'] = df_sf['UnitsOrganic'] + df_sf['UnitsPPC']
df_sf['Country'] = df_sf['Marketplace'].map(MKT_COUNTRY_MAP).fillna(0)

# Step 2: Pivot to summarize total units sold
pivot = (
    pd.pivot_table(
        df_sf[COL_LIST],
        index=['Marketplace', 'Country', 'ASIN'],
        values='Units Sold',
        aggfunc='sum'
    )
    .reset_index()
)

# Step 3: Filter out invalid entries
pivot = pivot[(pivot['Units Sold'] > 0) & (pivot['Country'] != 0)].reset_index(drop=True)

# Step 4: Create dictionary of active ASINs per marketplace
products_dict = {
    marketplace: asin_list['ASIN'].unique().tolist()
    for marketplace, asin_list in pivot.groupby('Marketplace')
}

# Step 5: Merge with master product list
os.chdir(ROOT_PATH)
df_tl = pd.read_excel(PRODUCT_FILE)
df_tl.rename(columns={'Region': 'Country'}, inplace=True)
df_tl.replace({'UK': 'GB'}, inplace=True)
df_jp = filter_df_by_previous_months(df, num_of_previous_months =6)
df_jp['Country'] = 'JP'
df_jp['Marketplace'] = 'amazon.co.jp'
df_jp = df_jp[['(Child) Asin', 'Country', 'Marketplace', 'Units Ordered']]
df_jp.columns =['ASIN', 'Country', 'Marketplace', 'Units Sold']
df_jp.drop_duplicates(subset = ['ASIN', 'Country'])
pivot = pd.concat([pivot, df_jp])


pivot = pivot.merge(df_tl, on=['ASIN', 'Country'], how='left').drop_duplicates(subset = ['ASIN', 'Country']) # To overcome the problem of the "unknown" values
pivot.fillna('Unknown', inplace=True)

# Step 6: Print summary of unknown product names
unknown_counts = pivot[pivot['Product Name'] == 'Unknown'].groupby('Marketplace')['ASIN'].nunique()
print('number of products without a product name \n\n',unknown_counts)

# Step 7: Save updated product list
output_df = pivot[['Product Name', 'ASIN', 'Country']].rename(columns={'Country': 'Region'})
output_df.to_excel(PRODUCT_FILE, index=False)


number of products without a product name 

 Marketplace
Amazon.ca        55
Amazon.co.uk     24
Amazon.com       51
Amazon.com.mx    27
Amazon.de        31
Amazon.es        27
Amazon.fr        31
Amazon.it        30
amazon.co.jp     32
Name: ASIN, dtype: int64


#Making Incremental Backup for the price files

In [19]:
# Initialize an empty DataFrame to hold historical price data
df_hp = pd.DataFrame()

# Define the main directory containing the product price folders
Main_path = '/content/drive/My Drive/Sales Data/Products Historical Prices'

# List all folders within the main directory (each folder is expected to contain Excel files)
all_folders = [f for f in os.listdir(Main_path) if os.path.isdir(os.path.join(Main_path, f))]
print(f"\n📁 Main Folder: 'Products Historical Prices'")

# Loop through each folder and read all Excel files
for folder in all_folders:
    folder_path = os.path.join(Main_path, folder)
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    os.chdir(folder_path)
    print(f"\n  📁 Folder: {folder}")
    for file in files:
      print(f"\n     📄 file: {file}")

      # Read and append each Excel file to the main DataFrame
      file_path = os.path.join(folder_path, file)
      df_hp = pd.concat([df_hp, pd.read_excel(file_path) ])

      # Drop index column if present
      try:
        df_hp.drop(columns = 'Unnamed: 0', inplace = True)
      except:
        pass



# Assuming df_cp exists and contains current product prices to merge with historical data including the current backup (as this gives us the flexibility to remove some historical folder data to save space with keeping the data in the backup)
# Drop 'Marketplace' column from df_cp before merging
df_backup = pd.concat([df_hp, df_cp.drop(columns = 'Marketplace')])

# Remove duplicates
df_backup.drop_duplicates(inplace = True)

# Save the backup file in the designated backup folder
backup_path = '/content/drive/My Drive/Sales Data/Products Historical Prices/Backup Folder'
backup_file = os.path.join(backup_path, 'Products price Backup.xlsx')
df_backup.to_excel(backup_file, index=False)
print(f"\n✅ Backup saved successfully to: {backup_file}")


📁 Main Folder: 'Products Historical Prices'

  📁 Folder: 2025-07-28

     📄 file: CA.xlsx

     📄 file: GB.xlsx

     📄 file: DE.xlsx

  📁 Folder: 2025-07-29

     📄 file: US.xlsx

  📁 Folder: Backup Folder

     📄 file: Products price Backup.xlsx

  📁 Folder: 2025-07-30

     📄 file: CA.xlsx

     📄 file: GB.xlsx

  📁 Folder: 2025-07-31

     📄 file: DE.xlsx

     📄 file: FR.xlsx

     📄 file: IT.xlsx

     📄 file: ES.xlsx

  📁 Folder: 2025-08-04

     📄 file: CA.xlsx

     📄 file: GB.xlsx

  📁 Folder: 2025-08-05

     📄 file: US.xlsx

     📄 file: DE.xlsx

     📄 file: ES.xlsx

  📁 Folder: 2025-08-06

     📄 file: FR.xlsx

     📄 file: IT.xlsx

  📁 Folder: 2025-08-08

     📄 file: CA.xlsx

     📄 file: GB.xlsx

  📁 Folder: 2025-08-10

     📄 file: CA.xlsx

  📁 Folder: 2025-08-11

     📄 file: GB.xlsx

     📄 file: US.xlsx

     📄 file: DE.xlsx

  📁 Folder: 2025-08-12

     📄 file: ES.xlsx

     📄 file: FR.xlsx

     📄 file: IT.xlsx

  📁 Folder: 2025-08-16

     📄 file: JP.xlsx

    