In [2]:
import pandas as pd

# Output path helper function
from pathlib import Path
import os

def get_output_path(filename):
    """Return the output path for ``filename`` and create its parent directory.

    The destination directory is chosen by the first rule that matches,
    checked in this order:

    1. name contains a processed-data marker (case-insensitive)
       -> ``../../01_data/processed``
    2. name contains 'prediction' or 'ensemble' (case-insensitive)
       -> ``../../04_outputs/predictions``
    3. name contains 'metric', 'summary', 'quantile', 'winkler' or 'segment'
       (case-insensitive) -> ``../../04_outputs/metrics``
    4. ``.pkl``  -> ``../../03_models/saved_models``
    5. ``.json`` -> ``../../03_models/model_artifacts``
    6. image / html extension -> ``../../04_outputs/visualizations``
    7. ``.xlsx`` or ``.csv`` -> ``../../01_data/processed``
    8. anything else -> ``../../04_outputs``

    Parameters
    ----------
    filename : str
        Bare file name (not a path) of the artifact to be written.

    Returns
    -------
    pathlib.Path
        Relative path (resolved against the current working directory when
        used) whose parent directory is guaranteed to exist.
    """
    base_path = Path('../../04_outputs')
    processed_dir = Path('../../01_data/processed')

    # Markers are upper-cased before comparison. The previous version compared
    # filename.upper() against mixed/lower-case markers ('IBA_Fleet',
    # 'merged_iba', 'market_tightness'), which could never match.
    processed_markers = ('FINAL_', 'IBA_FAMILY', 'IBA_Fleet', 'merged_iba', 'market_tightness')
    name_upper = filename.upper()
    name_lower = filename.lower()

    # Processed data files (should go to 01_data/processed)
    if any(marker.upper() in name_upper for marker in processed_markers):
        path = processed_dir / filename
    # Predictions CSVs
    elif 'prediction' in name_lower or 'ensemble' in name_lower:
        path = base_path / 'predictions' / filename
    # Metrics CSVs
    elif any(x in name_lower for x in ['metric', 'summary', 'quantile', 'winkler', 'segment']):
        path = base_path / 'metrics' / filename
    # Models (pkl files)
    elif filename.endswith('.pkl'):
        path = Path('../../03_models/saved_models') / filename
    # JSON mappings
    elif filename.endswith('.json'):
        path = Path('../../03_models/model_artifacts') / filename
    # Visualizations (all image files and html)
    elif filename.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.html')):
        path = base_path / 'visualizations' / filename
    # Excel files and any remaining CSVs are treated as processed data
    elif filename.endswith(('.xlsx', '.csv')):
        path = processed_dir / filename
    else:
        path = base_path / filename

    # Create directory if it doesn't exist
    path.parent.mkdir(parents=True, exist_ok=True)
    return path

In [3]:
# Read the aggregated part data CSV from the processed-data folder
agg_csv_path = '../../01_data/processed/MVP_Aggregated_Part_Data_with_Classification.csv'
df_agg = pd.read_csv(agg_csv_path)

In [4]:
# Show the loaded frame as the cell output (the notebook renders a truncated preview)
df_agg

Unnamed: 0,PN,year,month,Part Date,End User Companies,End User Inquiries,Non-End User Companies,Non-End User Inquiries,Total Sources,Total Quantity,...,Quantity NS,Quantity OH,Quantity SV,Quantity AR,entry_month,Demand_Type,ADI,CV²,P/N Type Code,P/N Application Code
0,019-012-001,2021,1,2021-01-01,10,12,33,60,77,236,...,7,18,65,141,2021-01,Erratic,1.0509,0.8883,ASG,B767
1,019-012-001,2021,2,2021-02-01,5,7,45,106,76,222,...,7,19,61,128,2021-02,Erratic,1.0509,0.8883,ASG,B767
2,019-012-001,2021,3,2021-03-01,4,5,15,43,74,216,...,7,17,60,125,2021-03,Erratic,1.0509,0.8883,ASG,B767
3,019-012-001,2021,4,2021-04-01,9,12,42,106,69,201,...,8,16,50,122,2021-04,Erratic,1.0509,0.8883,ASG,B767
4,019-012-001,2021,5,2021-05-01,6,7,33,88,71,198,...,7,16,50,120,2021-05,Erratic,1.0509,0.8883,ASG,B767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27197,Z014H000333B,2025,6,2025-06-01,20,25,136,320,35,83,...,0,1,46,30,2025-06,Smooth,1.0140,0.4036,ASG,A320
27198,Z014H000333B,2025,7,2025-07-01,33,57,189,455,37,90,...,0,1,51,32,2025-07,Smooth,1.0140,0.4036,ASG,A320
27199,Z014H000333B,2025,8,2025-08-01,33,46,359,821,37,80,...,0,1,43,28,2025-08,Smooth,1.0140,0.4036,ASG,A320
27200,Z014H000333B,2025,9,2025-09-01,18,28,148,329,37,84,...,0,1,41,34,2025-09,Smooth,1.0140,0.4036,ASG,A320


In [5]:
# Read the IBA aircraft-family feature table from the processed-data folder
iba_csv_path = '../../01_data/processed/IBA_Family_Features_FULL.csv'
IBA_df = pd.read_csv(iba_csv_path)

In [6]:
# Inspect df_agg: dimensions, column names, and the ten most frequent
# values of the column used as the join key later on
print("df_agg shape:", df_agg.shape)
agg_column_names = df_agg.columns.tolist()
print("\ndf_agg columns:", agg_column_names)
print("\nSample P/N Application Code values:")
top_application_codes = df_agg['P/N Application Code'].value_counts().head(10)
print(top_application_codes)

df_agg shape: (27202, 21)

df_agg columns: ['PN', 'year', 'month', 'Part Date', 'End User Companies', 'End User Inquiries', 'Non-End User Companies', 'Non-End User Inquiries', 'Total Sources', 'Total Quantity', 'Quantity NE', 'Quantity NS', 'Quantity OH', 'Quantity SV', 'Quantity AR', 'entry_month', 'Demand_Type', 'ADI', 'CV²', 'P/N Type Code', 'P/N Application Code']

Sample P/N Application Code values:
P/N Application Code
CF6-80C2      5568
A320          4466
CFM56-7B      4350
B737-NG       3944
CFM56-5B      1856
PW4000 94"    1276
B767          1044
V2500-A5       870
GTCP131-9B     812
B777           754
Name: count, dtype: int64


In [7]:
# Inspect IBA_df: dimensions, column names, and the ten most frequent
# AircraftFamily values
print("IBA_df shape:", IBA_df.shape)
iba_column_names = IBA_df.columns.tolist()
print("\nIBA_df columns:", iba_column_names)
print("\nSample AircraftFamily values:")
top_aircraft_families = IBA_df['AircraftFamily'].value_counts().head(10)
print(top_aircraft_families)

IBA_df shape: (83, 36)

IBA_df columns: ['AircraftFamily', 'fleet_count', 'age_mean', 'age_median', 'age_min', 'age_max', 'build_year_min', 'build_year_max', 'build_year_mean', 'order_year_min', 'order_year_max', 'order_year_mean', 'lease_start_year_median', 'num_operators', 'num_owners', 'num_countries_operated_for', 'age_0_8_count', 'age_0_8_share', 'age_8_15_count', 'age_8_15_share', 'age_15_20_count', 'age_15_20_share', 'age_20_30_count', 'age_20_30_share', 'age_30_plus_count', 'age_30_plus_share', 'age_15_25_count', 'age_15_25_share', 'age_25_plus_count', 'age_25_plus_share', 'share_owned', 'share_leased', 'share_operating_lease', 'share_finance_lease', 'share_gov_mil', 'share_commercial']

Sample AircraftFamily values:
AircraftFamily
144 Family                1
146 Family                1
2000 Family               1
204 Family                1
328 Family (Jet)          1
328 Family (Turboprop)    1
428 Family                1
614 Family                1
707 Family                

In [8]:
# List every distinct key on both sides so the code -> family mapping
# can be written by hand
print("All unique P/N Application Code values:")
application_codes = df_agg['P/N Application Code'].dropna().unique()
print(sorted(application_codes))
section_divider = "\n" + "="*50 + "\n"
print(section_divider)
print("All unique AircraftFamily values:")
aircraft_families = IBA_df['AircraftFamily'].unique()
print(sorted(aircraft_families))

All unique P/N Application Code values:
['737', 'A320', 'A320-NEO', 'A330', 'A340-600', 'A350', 'B737-NG', 'B747', 'B757', 'B767', 'B777', 'B787', 'CF6-80A', 'CF6-80C2', 'CFM56-5A', 'CFM56-5B', 'CFM56-7', 'CFM56-7B', 'GTCP131-9A', 'GTCP131-9B', 'GTCP331-200', 'GTCP331-350C', 'PW2000', 'PW4000', 'PW4000 94"', 'PW4168 100"', 'TRENT 700', 'V2500-A5']


All unique AircraftFamily values:
['144 Family', '146 Family', '2000 Family', '204 Family', '328 Family (Jet)', '328 Family (Turboprop)', '428 Family', '614 Family', '707 Family', '717 Family', '727 Family', '728 Family', '737 Family', '747 Family', '757 Family', '767 Family', '777 Family', '787 Family', '928 Family', 'A220 Family', 'A300 Family', 'A310 Family', 'A320 Family', 'A330 Family', 'A340 Family', 'A350 Family', 'A380 Family', 'A400 Family', 'AG-600 Family', 'AN-124 Family', 'AN-140 Family', 'AN-148 Family', 'AN-22 Family', 'AN-225 Family', 'AN72/74 Family', 'ARJ21 / C909 Family', 'ATP Family', 'ATR Family', 'BE-200 Family', 'C-390

In [9]:
# One-to-one lookup from P/N Application Code to a single (most popular)
# IBA aircraft family, assembled from three groups of codes.

# Airframe codes
_airframe_to_family = {
    '737': '737 Family',
    'A320': 'A320 Family',
    'A320-NEO': 'A320 Family',
    'A330': 'A330 Family',
    'A340-600': 'A340 Family',
    'A350': 'A350 Family',
    'B737-NG': '737 Family',
    'B747': '747 Family',
    'B757': '757 Family',
    'B767': '767 Family',
    'B777': '777 Family',
    'B787': '787 Family',
}

# Engine codes
_engine_to_family = {
    'CF6-80A': '767 Family',
    'CF6-80C2': '767 Family',  # Also 747/MD-11
    'CFM56-5A': 'A320 Family',
    'CFM56-5B': 'A320 Family',
    'CFM56-7': '737 Family',
    'CFM56-7B': '737 Family',
    'PW2000': '757 Family',
    'PW4000': '767 Family',    # Also 747/MD-11
    'PW4000 94"': '767 Family',  # Also 747/MD-11
    'PW4168 100"': 'A330 Family',
    'TRENT 700': 'A330 Family',
    'V2500-A5': 'A320 Family',
}

# APU codes
_apu_to_family = {
    'GTCP131-9A': 'A320 Family',
    'GTCP131-9B': '737 Family',
    'GTCP331-200': '757 Family',  # Also 767
    'GTCP331-350C': 'A330 Family',
}

# Unpacking keeps the key order: airframes, then engines, then APUs
mapping = {**_airframe_to_family, **_engine_to_family, **_apu_to_family}

print("Mapping created with", len(mapping), "entries")

Mapping created with 28 entries


In [10]:
# Simplified function to merge IBA data (one-to-one mapping)
def merge_iba_data(df_agg, IBA_df, mapping):
    """
    Left-join IBA aircraft-family features onto df_agg.

    Each row's 'P/N Application Code' is translated to an aircraft family
    through ``mapping`` (stored in a new 'Matched_AircraftFamily' column) and
    the matching IBA_df row is attached. Every IBA feature column is added
    with an 'IBA_' prefix; IBA_df's own 'AircraftFamily' key column is not
    kept in the result.

    Parameters
    ----------
    df_agg : pandas.DataFrame
        Must contain a 'P/N Application Code' column. Not modified.
    IBA_df : pandas.DataFrame
        Must contain an 'AircraftFamily' column; all other columns are
        treated as features. Not modified.
    mapping : dict
        Application code -> aircraft family name. Codes missing from the
        mapping produce NaN in 'Matched_AircraftFamily' and in all IBA_
        columns.

    Returns
    -------
    pandas.DataFrame
        Columns of df_agg, then 'Matched_AircraftFamily', then the
        'IBA_'-prefixed feature columns. The row count equals len(df_agg)
        as long as 'AircraftFamily' is unique in IBA_df.
    """
    merge_key = '_IBA_merge_key'

    # Create a copy to avoid modifying original
    result_df = df_agg.copy()

    # Map P/N Application Code to Aircraft Family
    result_df['Matched_AircraftFamily'] = result_df['P/N Application Code'].map(mapping)

    # Prefix the IBA feature columns BEFORE merging. Renaming after the merge
    # is unsafe: if an IBA column shares a name with a df_agg column, the merge
    # suffixes the IBA one and the rename would relabel df_agg's own column.
    # The key is renamed too, so a df_agg column called 'AircraftFamily' is
    # never dropped by mistake.
    rename_dict = {col: f'IBA_{col}' for col in IBA_df.columns if col != 'AircraftFamily'}
    rename_dict['AircraftFamily'] = merge_key
    iba_prefixed = IBA_df.rename(columns=rename_dict)

    # Merge with IBA data
    result_df = result_df.merge(
        iba_prefixed,
        left_on='Matched_AircraftFamily',
        right_on=merge_key,
        how='left',
    )

    # The temporary key duplicates 'Matched_AircraftFamily'; drop it
    result_df = result_df.drop(columns=[merge_key])

    return result_df

print("Merge function created")

Merge function created


In [11]:
# Execute the merge
df_merged = merge_iba_data(df_agg, IBA_df, mapping)

print(f"Original df_agg shape: {df_agg.shape}")
print(f"Merged df shape: {df_merged.shape}")
print(f"\nNumber of rows expanded due to multiple aircraft families: {df_merged.shape[0] - df_agg.shape[0]}")

# The mapping is one-to-one, so the left join must not add rows. Fail loudly
# (after printing the diagnostics above) if IBA_df has duplicate
# AircraftFamily keys, rather than carrying duplicated rows downstream.
assert df_merged.shape[0] == df_agg.shape[0], (
    "Merge changed the row count; check IBA_df for duplicate AircraftFamily values"
)

Original df_agg shape: (27202, 21)
Merged df shape: (27202, 57)

Number of rows expanded due to multiple aircraft families: 0


In [12]:
# Check the results
print("Sample of merged data:")
print(df_merged[['PN', 'P/N Application Code', 'Matched_AircraftFamily', 'IBA_fleet_count', 'IBA_age_mean']].head(20))
print("\n" + "="*80 + "\n")

# Spot-check an engine code that flies on several airframes. The mapping is
# one-to-one, so every CF6-80C2 row should show the same single family.
print("Example: CF6-80C2 engine (one-to-one mapping, expect a single aircraft family):")
cf6_example = df_merged[df_merged['P/N Application Code'] == 'CF6-80C2'][['PN', 'P/N Application Code', 'Matched_AircraftFamily', 'IBA_fleet_count']].head(10)
print(cf6_example)

Sample of merged data:
             PN P/N Application Code Matched_AircraftFamily  IBA_fleet_count  \
0   019-012-001                 B767             767 Family             1348   
1   019-012-001                 B767             767 Family             1348   
2   019-012-001                 B767             767 Family             1348   
3   019-012-001                 B767             767 Family             1348   
4   019-012-001                 B767             767 Family             1348   
5   019-012-001                 B767             767 Family             1348   
6   019-012-001                 B767             767 Family             1348   
7   019-012-001                 B767             767 Family             1348   
8   019-012-001                 B767             767 Family             1348   
9   019-012-001                 B767             767 Family             1348   
10  019-012-001                 B767             767 Family             1348   
11  019-012-001  

In [13]:
# Summary statistics: how many rows received an aircraft family from the
# mapping, how many did not, and how the rows are spread across families.
print("Matching statistics:")
# notna()/isna() count rows whose application code was / was not found in the mapping
print(f"Rows with matched aircraft family: {df_merged['Matched_AircraftFamily'].notna().sum()}")
print(f"Rows without matched aircraft family: {df_merged['Matched_AircraftFamily'].isna().sum()}")
print("\nMatched aircraft families distribution:")
# value_counts() skips NaN by default, so unmatched rows are not listed here
print(df_merged['Matched_AircraftFamily'].value_counts())

Matching statistics:
Rows with matched aircraft family: 27202
Rows without matched aircraft family: 0

Matched aircraft families distribution:
Matched_AircraftFamily
737 Family     9222
767 Family     8004
A320 Family    7598
A330 Family     812
777 Family      754
747 Family      290
757 Family      232
A340 Family     116
787 Family      116
A350 Family      58
Name: count, dtype: int64


In [14]:
# Display the merged dataframe as the cell output (the notebook renders a truncated preview)
df_merged

Unnamed: 0,PN,year,month,Part Date,End User Companies,End User Inquiries,Non-End User Companies,Non-End User Inquiries,Total Sources,Total Quantity,...,IBA_age_15_25_count,IBA_age_15_25_share,IBA_age_25_plus_count,IBA_age_25_plus_share,IBA_share_owned,IBA_share_leased,IBA_share_operating_lease,IBA_share_finance_lease,IBA_share_gov_mil,IBA_share_commercial
0,019-012-001,2021,1,2021-01-01,10,12,33,60,77,236,...,182,0.135015,811,0.601632,0.773739,0.226261,0.167656,0.020772,0.09273,0.688427
1,019-012-001,2021,2,2021-02-01,5,7,45,106,76,222,...,182,0.135015,811,0.601632,0.773739,0.226261,0.167656,0.020772,0.09273,0.688427
2,019-012-001,2021,3,2021-03-01,4,5,15,43,74,216,...,182,0.135015,811,0.601632,0.773739,0.226261,0.167656,0.020772,0.09273,0.688427
3,019-012-001,2021,4,2021-04-01,9,12,42,106,69,201,...,182,0.135015,811,0.601632,0.773739,0.226261,0.167656,0.020772,0.09273,0.688427
4,019-012-001,2021,5,2021-05-01,6,7,33,88,71,198,...,182,0.135015,811,0.601632,0.773739,0.226261,0.167656,0.020772,0.09273,0.688427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27197,Z014H000333B,2025,6,2025-06-01,20,25,136,320,35,83,...,3113,0.242691,1358,0.105870,0.436189,0.563811,0.480861,0.034303,0.00460,0.872612
27198,Z014H000333B,2025,7,2025-07-01,33,57,189,455,37,90,...,3113,0.242691,1358,0.105870,0.436189,0.563811,0.480861,0.034303,0.00460,0.872612
27199,Z014H000333B,2025,8,2025-08-01,33,46,359,821,37,80,...,3113,0.242691,1358,0.105870,0.436189,0.563811,0.480861,0.034303,0.00460,0.872612
27200,Z014H000333B,2025,9,2025-09-01,18,28,148,329,37,84,...,3113,0.242691,1358,0.105870,0.436189,0.563811,0.480861,0.034303,0.00460,0.872612


In [15]:
# Save the merged dataframe to CSV. get_output_path picks the destination
# folder from the file name and creates it if needed.
merged_output_path = get_output_path('FINAL_merged_iba_data.csv')
df_merged.to_csv(merged_output_path, index=False)
# Report the path actually written; the old message named a different file
print(f"Merged data saved to '{merged_output_path}'")

Merged data saved to 'merged_iba_data.csv'
