# Load Files

In [62]:
# Enable automatic reloading of modules.
# (Re)load IPython's autoreload extension, then select mode 2 so that all
# modules not marked to skip are reloaded automatically.
%reload_ext autoreload
%autoreload 2

# Alternative manual reload function for when autoreload doesn't work
import importlib
import sys

def reload_modules():
    """Re-import the project's helper modules that are already loaded.

    Modules that have not been imported yet (absent from ``sys.modules``) are
    skipped silently. Each module that is reloaded is reported on stdout, and
    a completion message is printed at the end.
    """
    project_modules = (
        'Python.Inventory',
        'Python.Filtering',
        'Python.Duplicates',
        'Python.Read_Files',
    )

    for name in project_modules:
        # Only a module that is currently loaded can be reloaded.
        if name not in sys.modules:
            continue
        importlib.reload(sys.modules[name])
        print(f"Reloaded: {name}")

    print("Manual reload complete!")

print("Autoreload enabled - modules will refresh automatically when files change")
print("If autoreload doesn't work, run: reload_modules()")

# Test autoreload status: with no arguments, %aimport lists the modules
# currently set to be reloaded and the modules set to be skipped.
%aimport

Autoreload enabled - modules will refresh automatically when files change
If autoreload doesn't work, run: reload_modules()
Modules to reload:
all-except-skipped

Modules to skip:



In [63]:
# Imports

# Reload any project modules already loaded in this kernel before importing from them.
reload_modules()
from Python.Read_Files import read_csv_dataframe, read_excel_dataframe, save_excel_dataframe, save_csv_dataframe
from Python.Filtering import quantity_outliers_improved

Reloaded: Python.Inventory
Reloaded: Python.Filtering
Reloaded: Python.Read_Files
Manual reload complete!


In [64]:
from pathlib import Path
from Python.Read_Files import read_csv_dataframe

# Path of the fair-market-value (FMV) CSV, relative to the working directory.
input_dir = Path("Input")
file_name_FMV = "Part numbers with fair market value.csv"
csv_path = input_dir / file_name_FMV
# Load the file and print its column / dtype / non-null summary.
# NOTE(review): FMV_DF is not referenced by any later cell shown here — confirm it is still needed.
FMV_DF = read_csv_dataframe(csv_path)
FMV_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79800 entries, 0 to 79799
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PN                            79800 non-null  object 
 1   CONDITION_CODE                79800 non-null  object 
 2   PNM_AUTO_KEY                  79800 non-null  int64  
 3   PCC_AUTO_KEY                  79800 non-null  int64  
 4   COUNT_DATA_POINTS_PN_PC       79705 non-null  float64
 5   LATEST_DATA_POINT_DATE_PN_PC  79705 non-null  object 
 6   NATTY_FMV                     79705 non-null  float64
 7   SYNTH_FMV                     1120 non-null   float64
 8   FMV                           79800 non-null  float64
 9   CONFIDENCE_PN_PC              79800 non-null  object 
 10  FMV_SOURCE                    79800 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 6.7+ MB


In [65]:
# Read and combine all report CSV files from Input/Data/
import pandas as pd
import glob
from pathlib import Path


def read_report_csvs(csv_paths):
    """Read each report CSV into a DataFrame tagged with its source file name.

    Parameters
    ----------
    csv_paths : iterable of str or Path
        Files to read, in the order they should later be concatenated.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file that could be read, each with an extra
        ``source_file`` column holding the file's base name. Files that fail
        to load are reported on stdout and skipped.
    """
    frames = []
    for csv_file in csv_paths:
        file_name = Path(csv_file).name
        try:
            # low_memory=False makes pandas infer each column's dtype from the
            # whole file instead of chunk by chunk, which avoids mixed-type
            # columns (and the warning pandas emits about them).
            df = pd.read_csv(csv_file, low_memory=False)
            # Add a column to track which file this data came from
            df['source_file'] = file_name
        except Exception as e:
            # Best effort: one unreadable export should not abort the others.
            print(f"Error reading {file_name}: {e}")
            continue
        frames.append(df)
        print(f"Successfully read {file_name}: {len(df)} rows, {len(df.columns)} columns")
    return frames


# Define the path to the data directory
data_dir = Path("Input/Data")

# Find all CSV files that start with "report_2025-09-29".
# glob.glob returns matches in a filesystem-dependent order, so sort them to
# keep the row order of the combined dataframe reproducible across machines.
csv_pattern = str(data_dir / "report_2025-09-29*.csv")
csv_files = sorted(glob.glob(csv_pattern))

print(f"Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {Path(file).name}")

# Read all CSV files (unreadable ones are reported and skipped)
combined_dataframes = read_report_csvs(csv_files)

# Combine all dataframes
if combined_dataframes:
    combined_report_df = pd.concat(combined_dataframes, ignore_index=True)
    print(f"\nCombined dataframe created:")
    print(f"Total rows: {len(combined_report_df)}")
    print(f"Total columns: {len(combined_report_df.columns)}")
    print(f"Files combined: {combined_report_df['source_file'].nunique()}")
    
    # Show basic info about the combined dataframe
    print(f"\nColumn names:")
    for col in combined_report_df.columns:
        print(f"  - {col}")
    
    # Display first few rows
    print(f"\nFirst 5 rows:")
    display(combined_report_df.head())
else:
    print("No CSV files were successfully read!")
    combined_report_df = pd.DataFrame()  # Create empty dataframe as fallback

Found 7 CSV files:
  - report_2025-09-29T145010.csv
  - report_2025-09-29T145248.csv
  - report_2025-09-29T145327.csv
  - report_2025-09-29T145804.csv
  - report_2025-09-29T150053.csv
  - report_2025-09-29T150112.csv
  - report_2025-09-29T150138.csv


  df = pd.read_csv(csv_file)


Successfully read report_2025-09-29T145010.csv: 373550 rows, 23 columns
Successfully read report_2025-09-29T145248.csv: 203259 rows, 23 columns


  df = pd.read_csv(csv_file)


Successfully read report_2025-09-29T145327.csv: 257270 rows, 23 columns


  df = pd.read_csv(csv_file)


Successfully read report_2025-09-29T145804.csv: 176143 rows, 23 columns
Successfully read report_2025-09-29T150053.csv: 227130 rows, 23 columns
Successfully read report_2025-09-29T150112.csv: 211036 rows, 23 columns
Successfully read report_2025-09-29T150138.csv: 112691 rows, 23 columns

Combined dataframe created:
Total rows: 1561079
Total columns: 23
Files combined: 7

Column names:
  - Rotabull RFQ ID
  - Source RFQ ID
  - Received At (UTC)
  - Priority
  - Buyer Company Name
  - Buyer Company Address
  - Buyer Company Country
  - Buyer Industry
  - Buyer Contact Name
  - Buyer Contact Email
  - RFQ Status
  - RFQ Source
  - RFQ Type
  - Part Number
  - Condition Code
  - Quantity
  - Alternate Part Number
  - Description
  - ILS Flag Description
  - Service Requested
  - Assigned User
  - Assigned Team
  - source_file

First 5 rows:


Unnamed: 0,Rotabull RFQ ID,Source RFQ ID,Received At (UTC),Priority,Buyer Company Name,Buyer Company Address,Buyer Company Country,Buyer Industry,Buyer Contact Name,Buyer Contact Email,...,Part Number,Condition Code,Quantity,Alternate Part Number,Description,ILS Flag Description,Service Requested,Assigned User,Assigned Team,source_file
0,27824322,ILSCAWF250328192447,2025-03-29 00:27:23,Routine,AVTRADE LIMITED,"WEST SUSSEX, ENG, UNITED KINGDOM",UNITED KINGDOM,,Harry Chalfont,harry.chalfont@avtrade.com,...,3215302-4,OH,1,3215302-5,"VALVE, HIGH PRESSURE",,,,,report_2025-09-29T145010.csv
1,27824322,ILSCAWF250328192447,2025-03-29 00:27:23,Routine,AVTRADE LIMITED,"WEST SUSSEX, ENG, UNITED KINGDOM",UNITED KINGDOM,,Harry Chalfont,harry.chalfont@avtrade.com,...,3215302-5,OH,1,3215302-4,"VALVE, HIGH PRESSURE",,,,,report_2025-09-29T145010.csv
2,27824730,M27,2025-03-29 03:36:46,Expedite,PIONEER AERO SUPPLY,"CHICAGO, IL, UNITED STATES",UNITED STATES,,Patrick Armstrong,parmstrong@pioneer-aero.com,...,326975,SV,1,No Alt. PN,"VALVE, ON/OFF, ANTI-",,,,North America,report_2025-09-29T145010.csv
3,27824796,ILSCGK0250328224116,2025-03-29 03:43:17,Expedite,Airbay,"Solomejos Neries 9-54, Vilnius, 06317, Lithuania",Lithuania,,Adam Kupcevic,operations@airbayaviation.com,...,754D0000-01,OH,1,754A0000-03,"HEAT EXCHANGER, MAIN",Yes,,,,report_2025-09-29T145010.csv
4,27824796,ILSCGK0250328224116,2025-03-29 03:43:17,Expedite,Airbay,"Solomejos Neries 9-54, Vilnius, 06317, Lithuania",Lithuania,,Adam Kupcevic,operations@airbayaviation.com,...,755C0000-01,OH,1,754D0000-01,"REHEATER ASSY, AIR C",Yes,,,,report_2025-09-29T145010.csv


In [66]:
# Work on a copy so combined_report_df keeps the rows exactly as loaded.
Rotabull_df = combined_report_df.copy()

# Remove duplicates within a 2-day span and remove unrealistic quantities

In [67]:
from Python.Filtering import day2_duplicates
from pathlib import Path

# The de-duplicated CSV under Output/ acts as a cache: it is loaded when
# present, otherwise day2_duplicates is run on Rotabull_df.
filename_duplicates = "Rotabull_DeDuplicates.csv"
output_path = Path("Output", filename_duplicates)

if not output_path.exists():
    print(f"File {output_path} does not exist. Running day2_duplicates...")
    Rotabull_df_dedup = day2_duplicates(Rotabull_df, filename_duplicates)
else:
    print(f"File {output_path} already exists. Loading existing file...")
    Rotabull_df_dedup = read_csv_dataframe(output_path)

File Output\Rotabull_DeDuplicates.csv already exists. Loading existing file...


  df = pd.read_csv(p)


In [68]:
# Save the de-duplicated rows, then make them the working dataframe for the next stage.
# NOTE(review): the recorded output suggests save_csv_dataframe skips the write when the
# target file already exists — confirm in Python.Read_Files.
save_csv_dataframe(Rotabull_df_dedup, filename_duplicates)
Rotabull_df = Rotabull_df_dedup.copy()

File already exists, skipping: c:\Users\mmarek\Documents\Project_Files\Rotabull Data Export 1 Year\Data_Pre_Processes\Output\Rotabull_DeDuplicates.csv


In [69]:
from pathlib import Path

# The outlier-filtered CSV under Output/ acts as a cache: it is loaded when
# present, otherwise quantity_outliers_improved is run on Rotabull_df.
filename_nonoutliers = "Rotabull_NonOutliers.csv"
output_path = Path("Output", filename_nonoutliers)

if not output_path.exists():
    print(f"File {output_path} does not exist. Running quantity_outliers_improved...")
    Rotabull_df_nonoutliers = quantity_outliers_improved(Rotabull_df)
else:
    print(f"File {output_path} already exists. Loading existing file...")
    Rotabull_df_nonoutliers = read_csv_dataframe(output_path)

File Output\Rotabull_NonOutliers.csv does not exist. Running quantity_outliers_improved...
Starting improved quantity outlier detection...
Global quantity statistics:
  Median: 1
  95th percentile: 12
  99th percentile: 80
  99.9th percentile: 500
Progressive outlier detection results:
  Original rows: 1,299,632
  Removed outliers: 1,964
  Final rows: 1,297,668
  Removal rate: 0.15%

Parts with outliers removed (669 parts):
  002A0003-45 (LOW): removed 1 quantities [40000]
    → threshold: ≤ 5.0 | median=1, iqr=19999.5
  024147-000 (LOW): removed 1 quantities [70]
    → threshold: ≤ 50.0 | median=1, iqr=1.0
  025-1156-001 (LOW): removed 2 quantities [100, 108]
    → threshold: ≤ 85.0 | median=2, iqr=14.0
  025-1157-001 (LOW): removed 1 quantities [200]
    → threshold: ≤ 50.0 | median=1, iqr=7.8
  02607025AIR (LOW): removed 1 quantities [250]
    → threshold: ≤ 80.0 | median=4, iqr=14.2
  0320KPU01 (LOW): removed 2 quantities [5000, 5000]
    → threshold: ≤ 80.0 | median=1, iqr=0.0
  0

In [70]:
# Save the outlier-filtered rows, then make them the working dataframe for the next stage.
save_csv_dataframe(Rotabull_df_nonoutliers, filename_nonoutliers)
Rotabull_df = Rotabull_df_nonoutliers.copy()

Saving DataFrame to: Rotabull_NonOutliers.csv
Saved DataFrame to: c:\Users\mmarek\Documents\Project_Files\Rotabull Data Export 1 Year\Data_Pre_Processes\Output\Rotabull_NonOutliers.csv


# Finding Inventory

In [71]:
from Python.Inventory import findInventory

# The inventory CSV under Output/ acts as a cache: it is loaded when present,
# otherwise findInventory is run on Rotabull_df.
filename_inventory = "DF_Inventory_Snowflake.csv"
output_path = Path("Output", filename_inventory)

if not output_path.exists():
    print(f"File {output_path} does not exist. Running findInventory...")
    DF_Inventory = findInventory(Rotabull_df, filename_inventory)
else:
    print(f"File {output_path} already exists. Loading existing file...")
    DF_Inventory = read_csv_dataframe(output_path)

File Output\DF_Inventory_Snowflake.csv already exists. Loading existing file...


In [72]:
# Save the inventory dataframe under the same file name the cache check above looks for.
save_csv_dataframe(DF_Inventory, filename_inventory)

File already exists, skipping: c:\Users\mmarek\Documents\Project_Files\Rotabull Data Export 1 Year\Data_Pre_Processes\Output\DF_Inventory_Snowflake.csv


In [73]:
# Take independent copies so the cells below can add helper columns
# without modifying Rotabull_df or DF_Inventory themselves.
Rotabull_df_copy = Rotabull_df.copy()
DF_Inventory_copy = DF_Inventory.copy()

# Finding Inventory based on Date 

In [74]:
def to_date_only(series):
    """Convert a Series of date-like values to plain calendar dates.

    Parameters
    ----------
    series : pandas.Series
        Values that ``pd.to_datetime`` can parse (e.g. timestamp strings).

    Returns
    -------
    pandas.Series
        ``datetime.date`` objects with the time of day dropped; entries that
        cannot be parsed become ``NaT``.
    """
    # `infer_datetime_format` is deprecated: pandas 2.0 made consistent format
    # inference the default, so passing it only produced a warning per call.
    dt = pd.to_datetime(series, errors="coerce")
    return dt.dt.date

# Rotabull copy: derive the date-only column 'Received_Date' from 'Received At (UTC)'
if "Received At (UTC)" not in Rotabull_df_copy.columns:
    print("Rotabull: column 'Received At (UTC)' not found")
else:
    received_dates = to_date_only(Rotabull_df_copy["Received At (UTC)"])
    Rotabull_df_copy["Received_Date"] = received_dates
    print("Rotabull: Created 'Received_Date' (date only). Nulls:", received_dates.isna().sum())

# DF_Inventory copy: derive the date-only column 'SNAP_DATE_date' from 'SNAP_DATE'
if "SNAP_DATE" not in DF_Inventory_copy.columns:
    print("DF_Inventory: column 'SNAP_DATE' not found")
else:
    snapshot_dates = to_date_only(DF_Inventory_copy["SNAP_DATE"])
    DF_Inventory_copy["SNAP_DATE_date"] = snapshot_dates
    print("DF_Inventory: Created 'SNAP_DATE_date' (date only). Nulls:", snapshot_dates.isna().sum())

# Optional sanity checks (uncomment to preview a few parsed dates)
# print("Rotabull sample dates:", Rotabull_df_copy["Received_Date"].dropna().unique()[:5])
# print("DF_Inventory sample dates:", DF_Inventory_copy["SNAP_DATE_date"].dropna().unique()[:5])

  dt = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)


Rotabull: Created 'Received_Date' (date only). Nulls: 0


  dt = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)


DF_Inventory: Created 'SNAP_DATE_date' (date only). Nulls: 0


In [75]:
def _normalize_part_number(series):
    """Return part numbers as stripped strings, keeping missing values missing.

    ``astype(str)`` on its own turns NaN/None into the literal text "nan"/"None".
    Such rows can no longer be removed by ``dropna``, and a blank part number in
    one frame would match a blank part number in the other during the merge.
    """
    return series.astype(str).str.strip().where(series.notna())


Rotabull_df_copy["Part_norm"] = _normalize_part_number(Rotabull_df_copy["Part Number"])
DF_Inventory_copy["PN_norm"] = _normalize_part_number(DF_Inventory_copy["P/N"])

# ensure date-only columns exist and are actual date objects
# (assumes the to_date_only step ran earlier and created Received_Date and SNAP_DATE_date)
# coerce Qty Available to numeric for aggregation
DF_Inventory_copy["Qty Available"] = pd.to_numeric(DF_Inventory_copy.get("Qty Available"), errors="coerce")

# aggregate inventory by part and date (sum quantity available for that day);
# rows without a part number or snapshot date cannot be matched and are dropped
inv_daily = (
    DF_Inventory_copy
    .dropna(subset=["PN_norm", "SNAP_DATE_date"])
    .groupby(["PN_norm", "SNAP_DATE_date"], as_index=False)["Qty Available"]
    .sum()
)

# merge inventory totals into rotabull rows on part + date
merged = Rotabull_df_copy.merge(
    inv_daily,
    left_on=["Part_norm", "Received_Date"],
    right_on=["PN_norm", "SNAP_DATE_date"],
    how="left",
    suffixes=("", "_inv")
)

# inv_daily holds one row per (part, date), so the left merge must keep one
# output row per Rotabull row; the positional copy of Stock below relies on it.
assert len(merged) == len(Rotabull_df_copy), "inventory merge changed the number of Rotabull rows"

# format Stock: numeric where found, 'NS' where not
def _format_stock(x):
    """Return the quantity as an int (or a float rounded to 2 places), or 'NS' if missing/non-numeric."""
    if pd.isna(x):
        return "NS"
    try:
        f = float(x)
    except Exception:
        return "NS"
    if f.is_integer():
        return int(f)
    return round(f, 2)

merged["Stock"] = merged["Qty Available"].apply(_format_stock)

# copy Stock back to Rotabull_df_copy (by position, since merged has a fresh index)
Rotabull_df_copy["Stock"] = merged["Stock"].values

# optional: drop helper normalization columns if not needed
Rotabull_df_copy.drop(columns=["Part_norm"], inplace=True, errors="ignore")
DF_Inventory_copy.drop(columns=["PN_norm"], inplace=True, errors="ignore")

print(f"Added 'Stock' column to Rotabull_df_copy ({Rotabull_df_copy.shape[0]} rows). Sample:")
print(Rotabull_df_copy[["Part Number", "Received_Date", "Stock"]].head(10))

Added 'Stock' column to Rotabull_df_copy (1297668 rows). Sample:
       Part Number Received_Date Stock
0        3215302-4    2025-03-29    NS
1        3215302-5    2025-03-29    NS
2           326975    2025-03-29    NS
3      754D0000-01    2025-03-29    NS
4      755C0000-01    2025-03-29    NS
5          767669B    2025-03-29    NS
6   D5311047700007    2025-03-29    NS
7     D53132010000    2025-03-29    NS
8     664700500A4D    2025-03-29    NS
9  80-178-03-88013    2025-03-29    NS


# Whether each part is in stock or not (NS = no stock)

In [76]:
df_temp1 = Rotabull_df_copy  # same object, shorter name

if "Stock" in df_temp1.columns:
    # candidate names for the quantity column (extend if needed)
    qty_candidates = ["Quantity", "Qty", "Quantity Received", "Qty Received", "Qty Received (EA)"]
    cols = df_temp1.columns.tolist()

    # first candidate that is actually a column, or None
    qty_col = next((c for c in qty_candidates if c in cols), None)

    # Choose the column 'Stock' should follow: the quantity column when one
    # exists, otherwise 'Part Number', otherwise nothing (order left as is).
    if qty_col:
        anchor = qty_col
        message = f"Moved 'Stock' to follow '{qty_col}'."
    elif "Part Number" in cols:
        anchor = "Part Number"
        message = "Quantity column not found — placed 'Stock' after 'Part Number' instead."
    else:
        anchor = None
        message = "No suitable column found to place 'Stock' next to. Leaving column order unchanged."

    if anchor is not None:
        # take 'Stock' out first so the anchor's index reflects the final layout
        cols.remove("Stock")
        insert_idx = cols.index(anchor) + 1
        cols.insert(insert_idx, "Stock")
        df_temp1 = df_temp1.loc[:, cols]

    print(message)
else:
    print("Column 'Stock' not found in dataframe.")



Moved 'Stock' to follow 'Quantity'.


In [77]:

# Make the frame from the column-reordering step the working dataframe, then preview the first rows.
Rotabull_df = df_temp1.copy()
Rotabull_df.head(5)


Unnamed: 0,Rotabull RFQ ID,Source RFQ ID,Received At (UTC),Priority,Buyer Company Name,Buyer Company Address,Buyer Company Country,Buyer Industry,Buyer Contact Name,Buyer Contact Email,...,Quantity,Stock,Alternate Part Number,Description,ILS Flag Description,Service Requested,Assigned User,Assigned Team,source_file,Received_Date
0,27824322,ILSCAWF250328192447,2025-03-29 00:27:23,Routine,AVTRADE LIMITED,"WEST SUSSEX, ENG, UNITED KINGDOM",UNITED KINGDOM,,Harry Chalfont,harry.chalfont@avtrade.com,...,1,NS,3215302-5,"VALVE, HIGH PRESSURE",,,,,report_2025-09-29T145010.csv,2025-03-29
1,27824322,ILSCAWF250328192447,2025-03-29 00:27:23,Routine,AVTRADE LIMITED,"WEST SUSSEX, ENG, UNITED KINGDOM",UNITED KINGDOM,,Harry Chalfont,harry.chalfont@avtrade.com,...,1,NS,3215302-4,"VALVE, HIGH PRESSURE",,,,,report_2025-09-29T145010.csv,2025-03-29
2,27824730,M27,2025-03-29 03:36:46,Expedite,PIONEER AERO SUPPLY,"CHICAGO, IL, UNITED STATES",UNITED STATES,,Patrick Armstrong,parmstrong@pioneer-aero.com,...,1,NS,No Alt. PN,"VALVE, ON/OFF, ANTI-",,,,North America,report_2025-09-29T145010.csv,2025-03-29
3,27824796,ILSCGK0250328224116,2025-03-29 03:43:17,Expedite,Airbay,"Solomejos Neries 9-54, Vilnius, 06317, Lithuania",Lithuania,,Adam Kupcevic,operations@airbayaviation.com,...,1,NS,754A0000-03,"HEAT EXCHANGER, MAIN",Yes,,,,report_2025-09-29T145010.csv,2025-03-29
4,27824796,ILSCGK0250328224116,2025-03-29 03:43:17,Expedite,Airbay,"Solomejos Neries 9-54, Vilnius, 06317, Lithuania",Lithuania,,Adam Kupcevic,operations@airbayaviation.com,...,1,NS,754D0000-01,"REHEATER ASSY, AIR C",Yes,,,,report_2025-09-29T145010.csv,2025-03-29


# Customer data (implementation still uncertain)

In [78]:
# Load the customer workbook from Input/.
# NOTE(review): customer_df is not referenced by any later cell shown here — confirm how it should be used.
customer_path = Path("Input/customer_data.xlsx")
customer_df = read_excel_dataframe(customer_path)

  warn("Workbook contains no default style, apply openpyxl's default")


# FINAL FILE SAVING

In [82]:
# Write the processed data only when no export exists yet; an existing file is left untouched.
output_path = Path("Output", "Rotabull_Data_Processed.csv")

if not output_path.exists():
    save_csv_dataframe(Rotabull_df, "Rotabull_Data_Processed.csv")
    print(f"File saved to {output_path}")
else:
    print(f"File {output_path} already exists. Skipping save operation.")

File Output\Rotabull_Data_Processed.csv already exists. Skipping save operation.
