# In [None]:
import pandas as pd
import numpy as np
import math
from itertools import cycle
import csv

# Load the raw flight records.
flight_df = pd.read_csv("flight_data.csv")

# Categorical columns that are re-encoded as integer codes in this cell.
discrete_columns = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']

# Occurrence counts per distinct value (value_counts() lists the most frequent value first).
from_country_freq = flight_df['from_country'].value_counts()
dest_country_freq = flight_df['dest_country'].value_counts()
from_airport_freq = flight_df['from_airport_code'].value_counts()
dest_airport_freq = flight_df['dest_airport_code'].value_counts()

# Rank encodings: each value is mapped to its position in the corresponding frequency table.
from_country_mapping = dict(zip(from_country_freq.index, range(len(from_country_freq))))
dest_country_mapping = dict(zip(dest_country_freq.index, range(len(dest_country_freq))))
from_airport_mapping = dict(zip(from_airport_freq.index, range(len(from_airport_freq))))
dest_airport_mapping = dict(zip(dest_airport_freq.index, range(len(dest_airport_freq))))

# 'stops' is a single standalone column, so factorize() codes are used directly.
stops_codes, _ = pd.factorize(flight_df['stops'])
flight_df['stops'] = stops_codes

# Number of distinct values per discrete column, in discrete_columns order.
# NOTE: dim_nums is computed here but is not written to the CSV produced below.
dim_nums = [flight_df[name].nunique() for name in discrete_columns[:4]]
dim_nums.append(len(pd.unique(flight_df['stops'])))

# Replace the original labels with their integer codes.
for column_name, code_by_value in (
    ('from_airport_code', from_airport_mapping),
    ('from_country', from_country_mapping),
    ('dest_airport_code', dest_airport_mapping),
    ('dest_country', dest_country_mapping),
):
    flight_df[column_name] = flight_df[column_name].map(code_by_value)

# Kept for reference: the discrete part of the header row.
header = discrete_columns

# Write the encoded discrete columns plus price and the two timestamps to a new CSV.
csv_filename_with_dim = 'flight_data_discrete_with_price_time.csv'
output_columns = discrete_columns + ['price'] + ['departure_time'] + ['arrival_time']
with open(csv_filename_with_dim, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(output_columns)  # Header row
    for _, record in flight_df.iterrows():
        extras = [record['price'], record['departure_time'], record['arrival_time']]
        writer.writerow(record[discrete_columns].tolist() + extras)

# In [None]:
import pandas as pd
import numpy as np
import math
from itertools import cycle

def init_tuples_and_data_info(filename, primary_cols, secondary_cols, exclude_col_index):
    """Load a CSV, preprocess it, and fold secondary columns into primary columns.

    Args:
        filename: Path of the CSV file to read.
        primary_cols: List of positional column indices (positions after
            preprocessing) that receive the combined value.
        secondary_cols: Positional column indices paired one-to-one with
            primary_cols; -1 means "no secondary column for this primary".
        exclude_col_index: Primary column index that is never combined. Its
            paired secondary column is still dropped at the end.

    Returns:
        The preprocessed DataFrame with secondary columns removed.

    Raises:
        ValueError: If any referenced column index is beyond the last column.
    """
    df = pd.read_csv(filename)

    # Dataset-specific preprocessing (type conversions and derived columns).
    df = type_preproc(df)

    column_count = len(df.columns.tolist())
    used_secondaries = [idx for idx in secondary_cols if idx != -1]
    if max(primary_cols + used_secondaries) >= column_count:
        raise ValueError("One or more column indices are out of range. Please check the input indices.")

    for primary, secondary in zip(primary_cols, secondary_cols):
        if secondary == -1 or primary == exclude_col_index:
            continue

        primary_name = df.columns[primary]
        secondary_name = df.columns[secondary]

        # Clamp negative secondary values to zero before they are embedded.
        df[secondary_name] = df[secondary_name].apply(lambda v: max(0, v))

        def combine(record, left=primary_name, right=secondary_name):
            # Two-digit zero-padded primary followed by three-digit zero-padded secondary.
            return int(f"{int(record[left]):02d}{int(record[right]):03d}")

        df[primary_name] = df.apply(combine, axis=1)

    # Secondary columns are no longer needed once they have been folded in.
    droplist = [df.columns[idx] for idx in secondary_cols if idx != -1 and idx < len(df.columns)]
    df.drop(columns=droplist, inplace=True)

    return df

def type_preproc(df):
    """Convert flight columns to the integer representation used downstream.

    Parses the two timestamp columns, adds a 'duration' column (whole hours,
    rounded up), replaces each timestamp with its time-of-day interval index,
    and casts the five discrete columns to int64. The frame is modified in
    place and also returned.
    """
    time_columns = ('departure_time', 'arrival_time')
    for time_col in time_columns:
        df[time_col] = pd.to_datetime(df[time_col])

    # Flight duration in hours, then rounded up to a whole number of hours.
    elapsed = df['arrival_time'] - df['departure_time']
    df['duration'] = elapsed.dt.total_seconds() / 3600
    df['duration'] = np.ceil(df['duration']).astype(int)

    # Replace each timestamp by the index of the interval its hour falls into.
    for time_col in time_columns:
        df[time_col] = df[time_col].dt.hour.apply(map_time_interval)

    code_columns = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']
    df[code_columns] = df[code_columns].astype('int64')

    return df

def map_time_interval(hour):
    """Return the index (0-5) of the four-hour interval that contains `hour`.

    Interval i covers hours below the i-th upper bound (4, 8, 12, 16, 20, 24);
    anything not below 24 falls back to the last interval, 5.
    """
    upper_bounds = (4, 8, 12, 16, 20, 24)
    return next((slot for slot, bound in enumerate(upper_bounds) if hour < bound), 5)

def apply_bucket_mapping(df, bucket_details, column_names):
    """Add a '<column>_bucket' column for every listed column.

    Args:
        df: DataFrame to annotate; it is modified in place and returned.
        bucket_details: Mapping column name -> list of dicts, where the dict at
            position b maps a column value to how many rows holding that value
            belong to bucket b.
        column_names: Columns to process.

    Returns:
        The same DataFrame with the bucket columns filled in. Rows that no
        bucket claims keep None in the bucket column.
    """
    for column in column_names:
        target = f"{column}_bucket"
        df[target] = None  # None marks rows not yet placed in a bucket

        source = df[column]
        quotas = bucket_details[column]

        # Rows actually assigned per value, used for the sanity check below.
        assigned = {}

        for bucket_no, quota in enumerate(quotas):
            for val, wanted in quota.items():
                # First `wanted` rows holding `val` that are still unassigned.
                still_free = (source == val) & (df[target].isnull())
                chosen = df[still_free].index[:wanted]
                if chosen.empty:
                    continue

                df.loc[chosen, target] = bucket_no
                assigned[val] = assigned.get(val, 0) + len(chosen)

        # Report values whose assigned row count differs from the requested total.
        for val, got in assigned.items():
            requested = sum(quota.get(val, 0) for quota in quotas)
            if got != requested:
                print(f"Warning: Mismatch in counts for value {val} in column {column}: expected {requested}, got {got}")

    return df

def calculate_bucket_allocations(grouped_data, num_buckets):
    """Distribute value frequencies over equally sized buckets, in row order.

    Buckets are filled one after another up to ceil(total / num_buckets)
    tuples each; a value whose frequency does not fit in the current bucket
    spills over into the following bucket(s).

    Args:
        grouped_data: DataFrame with a 'value' column and an integer 'freq'
            column (one row per distinct value).
        num_buckets: Number of buckets to fill; must be positive.

    Returns:
        A pair (bucket_allocations, bucket_details):
        - bucket_allocations maps each value to (first_bucket, last_bucket),
          the inclusive range of buckets that actually received its tuples.
        - bucket_details is a list with one dict per bucket mapping
          value -> number of tuples of that value placed in the bucket.

    Raises:
        ValueError: If num_buckets is not positive.
    """
    if num_buckets <= 0:
        raise ValueError("num_buckets must be a positive integer")

    total_tuples = grouped_data['freq'].sum()
    ideal_tuples_per_bucket = math.ceil(total_tuples / num_buckets)

    bucket_details = [{} for _ in range(num_buckets)]
    bucket_allocations = {}
    current_bucket = 0
    remaining_space_in_bucket = ideal_tuples_per_bucket

    # Iterate the two columns directly instead of iterrows(): iterrows()
    # coerces each row to one dtype, which turns integer frequencies into
    # floats whenever the value column is float.
    for value, freq in zip(grouped_data['value'], grouped_data['freq']):
        start_bucket = current_bucket
        # Last bucket that really received tuples of this value. Tracking it
        # explicitly avoids reporting the next (still empty) bucket when the
        # value ends exactly on a bucket boundary.
        end_bucket = current_bucket

        while freq > 0:
            space_used = min(freq, remaining_space_in_bucket)
            freq -= space_used
            remaining_space_in_bucket -= space_used

            details = bucket_details[current_bucket]
            details[value] = details.get(value, 0) + space_used
            end_bucket = current_bucket

            if remaining_space_in_bucket == 0:
                current_bucket += 1
                if current_bucket < num_buckets:
                    remaining_space_in_bucket = ideal_tuples_per_bucket

        bucket_allocations[value] = (start_bucket, end_bucket)

    return bucket_allocations, bucket_details

# Integrate with existing workflow
def process_columns(df, num_buckets, column_names):
    """Compute bucket allocations for each column and tag the rows with them.

    Args:
        df: DataFrame holding the columns to bucket.
        num_buckets: Number of buckets per column.
        column_names: Columns to bucket.

    Returns:
        A pair (df, all_allocations): the DataFrame returned by
        apply_bucket_mapping, and a dict mapping each column name to the
        allocations produced by calculate_bucket_allocations.
    """
    all_allocations = {}
    all_bucket_details = {}

    for column in column_names:
        # One row per distinct value with its number of occurrences.
        frequency_table = (
            df.groupby(column)
            .size()
            .reset_index(name='freq')
            .rename(columns={column: 'value'})
        )
        allocations, details = calculate_bucket_allocations(frequency_table, num_buckets)
        all_allocations[column] = allocations
        all_bucket_details[column] = details

    df = apply_bucket_mapping(df, all_bucket_details, column_names)
    return df, all_allocations

# Example usage
# primary_cols / secondary_cols are positional column indices; -1 means the
# primary column has no secondary column to be combined with.
primary_cols = [0, 1, 2, 3, 4]
secondary_cols = [-1, -1, -1, -1, -1]
# Alternative pairings (indices must be within the range of actual columns):
#secondary_cols = [6, 6, 7, 7, 8]
#secondary_cols = [6, -1, 7, -1, 8]
exclude_col_index = 5  # Exclude column 5 from processing

try:
    df = init_tuples_and_data_info('flight_data_discrete_with_price_time.csv', primary_cols, secondary_cols, exclude_col_index)
except Exception as e:
    # Report the problem, then stop: everything below needs a freshly built
    # `df`, and continuing would either fail with a NameError or silently
    # reuse a stale `df` left over from an earlier run.
    print(e)
    raise
print(df.head())

column_names = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']
num_buckets = 128
df_mapped, bucket_allocations = process_columns(df, num_buckets, column_names)
df_mapped.to_csv('tuples_with_buckets.csv', index=False)
print("Mapped DataFrame with bucket assignments has been saved to 'tuples_with_buckets.csv'.")


# In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

def plot_interactions(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Build a two-panel figure of row-count heatmaps for a pair of columns.

    The left panel counts rows per (bucket of col1, bucket of col2) on a fixed
    num_buckets x num_buckets grid; the right panel shows the crosstab of the
    raw col1 / col2 values.

    Args:
        df: DataFrame containing col1, col2 and their bucket columns.
        col1, col2: Names of the two columns to compare.
        secondary1, secondary2: Optional secondary column names, used only in
            the panel titles (falsy values are omitted).
        dataset_name, distribution_type, scale: Text inserted in the page title.
        bucket_suffix: Suffix appended to a column name to get its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The created matplotlib figure.

    Raises:
        ValueError: If either bucket column is missing from df.
    """
    row_bucket_col = col1 + bucket_suffix
    col_bucket_col = col2 + bucket_suffix
    if not (row_bucket_col in df.columns and col_bucket_col in df.columns):
        raise ValueError("One or both bucket columns are not in the DataFrame")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: number of rows for every pair of bucket indices.
    bucket_grid = np.zeros((num_buckets, num_buckets), dtype=int)
    pair_sizes = df.groupby([row_bucket_col, col_bucket_col]).size().reset_index(name='counts')
    for raw_row, raw_col, n_rows in pair_sizes.itertuples(index=False, name=None):
        r, c = int(raw_row), int(raw_col)
        # Bucket indices at or beyond the grid size are ignored.
        if r < num_buckets and c < num_buckets:
            bucket_grid[r, c] += n_rows
    sns.heatmap(bucket_grid, cmap='hot', annot=False, ax=ax1)

    first_label = col1 + " + " + secondary1 if secondary1 else col1
    second_label = col2 + " + " + secondary2 if secondary2 else col2
    ax1.set_title(f'Bucket Heatmap between {first_label} and {second_label}', pad=12)
    ax1.set_ylabel(row_bucket_col)
    ax1.set_xlabel(col_bucket_col)

    # Right panel: number of rows for every pair of raw values.
    value_grid = pd.crosstab(df[col1], df[col2]).values
    sns.heatmap(value_grid, cmap='hot', annot=False, ax=ax2)
    ax2.set_title(f'Value Heatmap between {first_label} and {second_label}', pad=12)
    ax2.set_ylabel(col1)
    ax2.set_xlabel(col2)

    # Page-level title.
    fig.suptitle(f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale', fontsize=16)

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Leave room at the top for the page title
    plt.subplots_adjust(top=0.9, wspace=0.008)  # Final top margin and horizontal gap between panels
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='flight_freq_minmax_sec.pdf'):
    """Write one page of count heatmaps per unordered column pair to a PDF.

    Args:
        df: DataFrame passed through to plot_interactions.
        columns: Column names; every pair (i < j) gets its own page.
        secondary_columns: Secondary column name (or None) for each entry of
            columns, indexed in parallel with it.
        dataset_name, distribution_type, scale: Text for the page titles.
        num_buckets: Bucket grid size forwarded to plot_interactions.
        output_path: Destination PDF file; defaults to the file name this
            function has always written.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                fig = plot_interactions(
                    df, columns[i], columns[j], secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale, num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving the page fails.
                    plt.close(fig)

# Usage example: secondary_columns holds, per entry of `columns`, either the
# name of its secondary column (shown in the plot titles) or None.
columns = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']
secondary_columns = ['departure_time', None, 'arrival_time', None, 'duration']
# Variant without any secondary columns:
#secondary_columns = [None, None, None, None, None]
dataset_name = "Flight"
distribution_type = "frequency"
scale = "absolute"
generate_interaction_plots(
    df,
    columns=columns,
    secondary_columns=secondary_columns,
    dataset_name=dataset_name,
    distribution_type=distribution_type,
    scale=scale,
)


# In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import TwoSlopeNorm, LinearSegmentedColormap
from matplotlib.backends.backend_pdf import PdfPages

def plot_interactions_ideal(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Build a two-panel figure of row-count heatmaps coloured relative to the mean.

    Both panels use a green-black-red colour scale centred on the average
    count per cell (black), running from 0 (green) to twice the average (red).
    The left panel works on bucket pairs, the right panel on raw value pairs.

    Args:
        df: DataFrame containing col1, col2 and their bucket columns.
        col1, col2: Names of the two columns to compare.
        secondary1, secondary2: Optional secondary column names, used only in
            the panel titles (falsy values are omitted).
        dataset_name, distribution_type, scale: Text inserted in the page title.
        bucket_suffix: Suffix appended to a column name to get its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The created matplotlib figure.

    Raises:
        ValueError: If either bucket column is missing from df.
    """
    row_bucket_col = col1 + bucket_suffix
    col_bucket_col = col2 + bucket_suffix
    if not (row_bucket_col in df.columns and col_bucket_col in df.columns):
        raise ValueError("One or both bucket columns are not in the DataFrame")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel data: number of rows for every pair of bucket indices.
    bucket_grid = np.zeros((num_buckets, num_buckets), dtype=int)
    pair_sizes = df.groupby([row_bucket_col, col_bucket_col]).size().reset_index(name='counts')
    for raw_row, raw_col, n_rows in pair_sizes.itertuples(index=False, name=None):
        r, c = int(raw_row), int(raw_col)
        # Bucket indices at or beyond the grid size are ignored.
        if r < num_buckets and c < num_buckets:
            bucket_grid[r, c] += n_rows

    # Mean count per grid cell; it is the centre of the colour scale.
    total_counts = bucket_grid.sum()
    average_frequency_per_bucket = total_counts / (num_buckets ** 2)

    # Green (below average) -> black (average) -> red (above average).
    cmap = LinearSegmentedColormap.from_list("custom_cmap", ["green", "black", "red"])
    bucket_norm = TwoSlopeNorm(vmin=0, vcenter=average_frequency_per_bucket, vmax=2 * average_frequency_per_bucket)

    sns.heatmap(bucket_grid, cmap=cmap, norm=bucket_norm, annot=False, ax=ax1)
    first_label = col1 + " + " + secondary1 if secondary1 else col1
    second_label = col2 + " + " + secondary2 if secondary2 else col2
    ax1.set_title(f'Bucket Heatmap between {first_label} and {second_label}', pad=12)
    ax1.set_ylabel(row_bucket_col)
    ax1.set_xlabel(col_bucket_col)

    # Right panel data: number of rows for every pair of raw values.
    value_grid = pd.crosstab(df[col1], df[col2]).values

    # Mean count per value pair, over all combinations of distinct values.
    distinct_first = df[col1].unique()
    distinct_second = df[col2].unique()
    total_counts_values = value_grid.sum()
    average_frequency_per_value_pair = total_counts_values / (len(distinct_first) * len(distinct_second))

    value_norm = TwoSlopeNorm(vmin=0, vcenter=average_frequency_per_value_pair, vmax=2 * average_frequency_per_value_pair)

    sns.heatmap(value_grid, cmap=cmap, norm=value_norm, annot=False, ax=ax2)
    ax2.set_title(f'Value Heatmap between {first_label} and {second_label}', pad=12)
    ax2.set_ylabel(col1)
    ax2.set_xlabel(col2)

    # Page-level title and a footer with the bucket average.
    fig.suptitle(f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale', fontsize=16)
    fig.text(0.5, 0.01, f'Average Frequency per Bucket: {average_frequency_per_bucket:.2f}', ha='center', fontsize=12)
    plt.tight_layout(rect=[0, 0, 1, 0.93])
    plt.subplots_adjust(top=0.9, wspace=0.011)
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots_ideal(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='flight_freq_ideal_sec.pdf'):
    """Write one page of mean-centred count heatmaps per column pair to a PDF.

    Args:
        df: DataFrame passed through to plot_interactions_ideal.
        columns: Column names; every pair (i < j) gets its own page.
        secondary_columns: Secondary column name (or None) for each entry of
            columns, indexed in parallel with it.
        dataset_name, distribution_type, scale: Text for the page titles.
        num_buckets: Bucket grid size forwarded to plot_interactions_ideal.
        output_path: Destination PDF file; defaults to the file name this
            function has always written.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                fig = plot_interactions_ideal(
                    df, columns[i], columns[j], secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale, num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving the page fails.
                    plt.close(fig)

# Usage example: secondary_columns holds, per entry of `columns`, either the
# name of its secondary column (shown in the plot titles) or None.
columns = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']
secondary_columns = ['departure_time', None, 'arrival_time', None, 'duration']
# Variant without any secondary columns:
# secondary_columns = [None, None, None, None, None]
dataset_name = "Flight"
distribution_type = "frequency"
scale = "normalized"
generate_interaction_plots_ideal(
    df,
    columns=columns,
    secondary_columns=secondary_columns,
    dataset_name=dataset_name,
    distribution_type=distribution_type,
    scale=scale,
)


# In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

def plot_interactions_price(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Build a two-panel figure of summed-price heatmaps for a pair of columns.

    The left panel sums df['price'] per (bucket of col1, bucket of col2) on a
    fixed num_buckets x num_buckets grid; the right panel sums the price per
    pair of raw col1 / col2 values (pairs with no rows are shown as 0).

    Args:
        df: DataFrame containing 'price', col1, col2 and their bucket columns.
        col1, col2: Names of the two columns to compare.
        secondary1, secondary2: Optional secondary column names, used only in
            the panel titles (falsy values are omitted).
        dataset_name, distribution_type, scale: Text inserted in the page title.
        bucket_suffix: Suffix appended to a column name to get its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The created matplotlib figure.

    Raises:
        ValueError: If either bucket column is missing from df.
    """
    row_bucket_col = col1 + bucket_suffix
    col_bucket_col = col2 + bucket_suffix
    if not (row_bucket_col in df.columns and col_bucket_col in df.columns):
        raise ValueError("One or both bucket columns are not in the DataFrame")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: total price for every pair of bucket indices.
    bucket_grid = np.zeros((num_buckets, num_buckets), dtype=float)
    pair_totals = df.groupby([row_bucket_col, col_bucket_col])['price'].sum().reset_index(name='total_price')
    for raw_row, raw_col, price_sum in pair_totals.itertuples(index=False, name=None):
        r, c = int(raw_row), int(raw_col)
        # Bucket indices at or beyond the grid size are ignored.
        if r < num_buckets and c < num_buckets:
            bucket_grid[r, c] += price_sum
    sns.heatmap(bucket_grid, cmap='hot', annot=False, ax=ax1)

    first_label = col1 + " + " + secondary1 if secondary1 else col1
    second_label = col2 + " + " + secondary2 if secondary2 else col2
    ax1.set_title(f'Bucket Heatmap between {first_label} and {second_label}', pad=12)
    ax1.set_ylabel(row_bucket_col)
    ax1.set_xlabel(col_bucket_col)

    # Right panel: total price for every pair of raw values.
    price_by_values = pd.crosstab(df[col1], df[col2], values=df['price'], aggfunc='sum')
    value_grid = price_by_values.fillna(0).values
    sns.heatmap(value_grid, cmap='hot', annot=False, ax=ax2)
    ax2.set_title(f'Value Heatmap between {first_label} and {second_label}', pad=12)
    ax2.set_ylabel(col1)
    ax2.set_xlabel(col2)

    # Page-level title.
    fig.suptitle(f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale', fontsize=16)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.subplots_adjust(top=0.9, wspace=0.008)
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots_price(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='flight_price_minmax_sec.pdf'):
    """Write one page of summed-price heatmaps per unordered column pair to a PDF.

    Args:
        df: DataFrame passed through to plot_interactions_price.
        columns: Column names; every pair (i < j) gets its own page.
        secondary_columns: Secondary column name (or None) for each entry of
            columns, indexed in parallel with it.
        dataset_name, distribution_type, scale: Text for the page titles.
        num_buckets: Bucket grid size forwarded to plot_interactions_price.
        output_path: Destination PDF file; defaults to the file name this
            function has always written.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                fig = plot_interactions_price(
                    df, columns[i], columns[j], secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale, num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving the page fails.
                    plt.close(fig)

# Usage example: secondary_columns holds, per entry of `columns`, either the
# name of its secondary column (shown in the plot titles) or None.
columns = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']
secondary_columns = ['departure_time', None, 'arrival_time', None, 'duration']
# Variant without any secondary columns:
# secondary_columns = [None, None, None, None, None]
dataset_name = "Flight"
distribution_type = "price"
scale = "absolute"
generate_interaction_plots_price(
    df,
    columns=columns,
    secondary_columns=secondary_columns,
    dataset_name=dataset_name,
    distribution_type=distribution_type,
    scale=scale,
)


# In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import TwoSlopeNorm, LinearSegmentedColormap
from matplotlib.backends.backend_pdf import PdfPages

def plot_interactions_price_ideal(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Build a two-panel figure of summed-price heatmaps coloured relative to the mean.

    Both panels use a green-black-red colour scale centred on the average
    summed price per cell (black), running from 0 (green) to twice the average
    (red). The left panel works on bucket pairs; the right panel works on raw
    value pairs, whose axes are labelled with 1-based positions of the values
    in order of first appearance.

    Args:
        df: DataFrame containing 'price', col1, col2 and their bucket columns.
        col1, col2: Names of the two columns to compare.
        secondary1, secondary2: Optional secondary column names, used only in
            the panel titles (falsy values are omitted).
        dataset_name, distribution_type, scale: Text inserted in the page title.
        bucket_suffix: Suffix appended to a column name to get its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The created matplotlib figure.

    Raises:
        ValueError: If either bucket column is missing from df.
    """
    row_bucket_col = col1 + bucket_suffix
    col_bucket_col = col2 + bucket_suffix
    if not (row_bucket_col in df.columns and col_bucket_col in df.columns):
        raise ValueError("One or both bucket columns are not in the DataFrame")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel data: total price for every pair of bucket indices.
    bucket_grid = np.zeros((num_buckets, num_buckets), dtype=float)
    pair_totals = df.groupby([row_bucket_col, col_bucket_col])['price'].sum().reset_index(name='total_price')
    for raw_row, raw_col, price_sum in pair_totals.itertuples(index=False, name=None):
        r, c = int(raw_row), int(raw_col)
        # Bucket indices at or beyond the grid size are ignored.
        if r < num_buckets and c < num_buckets:
            bucket_grid[r, c] += price_sum

    # Mean summed price per grid cell; it is the centre of the colour scale.
    average_price = bucket_grid.sum() / (num_buckets ** 2)

    # Green (below average) -> black (average) -> red (above average).
    cmap = LinearSegmentedColormap.from_list("custom_cmap", ["green", "black", "red"])
    bucket_norm = TwoSlopeNorm(vmin=0, vcenter=average_price, vmax=2 * average_price)

    sns.heatmap(bucket_grid, cmap=cmap, norm=bucket_norm, annot=False, ax=ax1)
    first_label = col1 + " + " + secondary1 if secondary1 else col1
    second_label = col2 + " + " + secondary2 if secondary2 else col2
    ax1.set_title(f'Bucket Price Heatmap between {first_label} and {second_label}', pad=12)
    ax1.set_xlabel(col_bucket_col)
    ax1.set_ylabel(row_bucket_col)

    # Right panel data: a dense table over all distinct value pairs, starting
    # at 0.0 and filled with the summed price of each pair that occurs.
    distinct_first = df[col1].unique()
    distinct_second = df[col2].unique()
    value_table = pd.DataFrame(index=distinct_first, columns=distinct_second, data=0.0)
    value_totals = df.groupby([col1, col2])['price'].sum().reset_index(name='total_price')
    for _, record in value_totals.iterrows():
        value_table.loc[record[col1], record[col2]] = record['total_price']

    # Mean summed price per value pair.
    total_price_values = value_table.values.sum()
    average_price_per_value_pair = total_price_values / (len(distinct_first) * len(distinct_second))

    value_norm = TwoSlopeNorm(vmin=0, vcenter=average_price_per_value_pair, vmax=2 * average_price_per_value_pair)

    # Relabel rows/columns with 1-based positions instead of the raw values.
    row_positions = {val: pos for pos, val in enumerate(distinct_first, start=1)}
    col_positions = {val: pos for pos, val in enumerate(distinct_second, start=1)}
    value_table_indexed = value_table.rename(index=row_positions, columns=col_positions)

    sns.heatmap(value_table_indexed, cmap=cmap, norm=value_norm, annot=False, ax=ax2)
    ax2.set_title(f'Value Price Heatmap between {first_label} and {second_label}', pad=12)
    ax2.set_xlabel(f'{col2} (Indices)')
    ax2.set_ylabel(f'{col1} (Indices)')

    # Page-level title.
    fig.suptitle(f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale', fontsize=16)

    plt.tight_layout(rect=[0, 0, 1, 0.93])
    plt.subplots_adjust(top=0.9, wspace=0.011)

    # Footer with the bucket-level average.
    fig.text(0.5, 0.01, f'Average Price per Bucket: {average_price:.2f}', ha='center', fontsize=12)
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots_price_ideal(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='flight_price_ideal.pdf'):
    """Write one page of mean-centred summed-price heatmaps per column pair to a PDF.

    Args:
        df: DataFrame passed through to plot_interactions_price_ideal.
        columns: Column names; every pair (i < j) gets its own page.
        secondary_columns: Secondary column name (or None) for each entry of
            columns, indexed in parallel with it.
        dataset_name, distribution_type, scale: Text for the page titles.
        num_buckets: Bucket grid size forwarded to plot_interactions_price_ideal.
        output_path: Destination PDF file; defaults to the file name this
            function has always written.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                fig = plot_interactions_price_ideal(
                    df, columns[i], columns[j], secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale, num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving the page fails.
                    plt.close(fig)

# Usage example: secondary_columns holds, per entry of `columns`, either the
# name of its secondary column (shown in the plot titles) or None.
columns = ['from_airport_code', 'from_country', 'dest_airport_code', 'dest_country', 'stops']
# Variant with secondary columns:
# secondary_columns = ['departure_time', None, 'arrival_time', None, 'duration']
secondary_columns = [None, None, None, None, None]
dataset_name = "Flight"
distribution_type = "price"
scale = "normalized"
generate_interaction_plots_price_ideal(
    df,
    columns=columns,
    secondary_columns=secondary_columns,
    dataset_name=dataset_name,
    distribution_type=distribution_type,
    scale=scale,
)

