In [3]:
import pandas as pd
import csv
import numpy as np

# Columns retained for the bucketing analysis. Their order determines the
# positional column indices of the CSV written at the end of this cell.
columns_to_keep = [
    'passenger_count',
    'trip_distance',
    'PULocationID',
    'DOLocationID',
    'payment_type',
    'fare_amount',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
]

# Load the raw trip file and show every column it provides.
taxi_df = pd.read_csv("taxi/yellow_tripdata_2019-01.csv")
print(taxi_df.columns)

# Project onto the retained columns and persist them without the row index.
df_selected = taxi_df.loc[:, columns_to_keep]
df_selected.to_csv('taxi_jan_with_sec.csv', index=False)

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge'],
      dtype='object')


In [7]:
import pandas as pd
import numpy as np
import math

def init_tuples_and_data_info(filename, primary_cols, secondary_cols, exclude_col_index):
    """Load a CSV, preprocess it, and fold secondary columns into their primaries.

    The file is read with pandas and passed through ``type_preproc``. Then, for
    each ``(primary, secondary)`` pair of positional column indices, the two
    integer values are concatenated digit-wise into the primary column: the
    primary zero-padded to at least 2 digits followed by the secondary
    (clamped to be non-negative) zero-padded to at least 3 digits, e.g.
    primary 3 and secondary 13 become 3013.

    Args:
        filename: Path of the CSV file to read.
        primary_cols: Positional indices of the primary columns.
        secondary_cols: Positional indices of the matching secondary columns;
            ``-1`` means the primary at the same position has no secondary.
        exclude_col_index: A pair whose primary index equals this value is not
            combined (its secondary column is still dropped afterwards).

    Returns:
        The preprocessed DataFrame with every referenced secondary column
        removed.

    Raises:
        ValueError: If any referenced column index is past the last column.
    """
    df = pd.read_csv(filename)

    # Apply preprocessing that includes type conversions and mappings
    df = type_preproc(df)
    column_names = df.columns.tolist()

    # Validate every referenced position up front; guard the empty case so
    # max() is never called on an empty sequence.
    referenced = list(primary_cols) + [col for col in secondary_cols if col != -1]
    if referenced and max(referenced) >= len(column_names):
        raise ValueError("One or more column indices are out of range. Please check the input indices.")

    for primary, secondary in zip(primary_cols, secondary_cols):
        if secondary == -1 or primary == exclude_col_index:
            continue

        primary_name = column_names[primary]
        secondary_name = column_names[secondary]

        # Both sides must be integers; negative secondary values are clamped to 0.
        df[primary_name] = df[primary_name].astype(int)
        df[secondary_name] = df[secondary_name].astype(int).clip(lower=0)

        # Vectorised equivalent of int(f"{primary:02d}{secondary:03d}") per row:
        # pad as strings, concatenate, and parse back to integers.
        combined = (
            df[primary_name].astype(str).str.zfill(2)
            + df[secondary_name].astype(str).str.zfill(3)
        )
        df[primary_name] = combined.astype(int)

    # Drop secondary columns post-combination (names are unchanged by the loop).
    droplist = [column_names[secondary] for secondary in secondary_cols if secondary != -1]
    return df.drop(columns=droplist)

def type_preproc(df):
    """Convert the taxi columns to the types used by the bucketing code.

    The pickup/dropoff timestamp columns, when present, are parsed as
    datetimes and replaced by ``map_time_interval`` applied to their hour of
    day. The four identifier/count columns are cast to ``int64``.

    Args:
        df: DataFrame containing ``passenger_count``, ``PULocationID``,
            ``DOLocationID`` and ``payment_type``, and optionally
            ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime``.
            It is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    for time_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
        # Check for the column BEFORE touching it, so a frame without the
        # timestamp columns is handled instead of raising KeyError.
        if time_col in df.columns:
            hour_of_day = pd.to_datetime(df[time_col]).dt.hour
            df[time_col] = hour_of_day.apply(map_time_interval)

    int_cols = ['passenger_count', 'PULocationID', 'DOLocationID', 'payment_type']
    df[int_cols] = df[int_cols].astype('int64')
    return df

def map_time_interval(hour, interval_hours=1):
    """Map an hour value to the index of the fixed-width interval containing it.

    Args:
        hour: Hour value to map (e.g. 0-23).
        interval_hours: Width of one interval in hours. The default of 1 keeps
            the hour unchanged, so existing single-argument callers see the
            same result as before.

    Returns:
        ``hour // interval_hours``.

    Raises:
        ValueError: If ``interval_hours`` is not positive.
    """
    if interval_hours <= 0:
        raise ValueError("interval_hours must be a positive number.")
    return hour // interval_hours

def apply_bucket_mapping(df, bucket_details, column_names):
    """Attach a ``<column>_bucket`` column for every requested column.

    ``bucket_details[column]`` is a list indexed by bucket number; each entry
    maps a value to how many rows holding that value belong in that bucket.
    Rows are consumed in index order: for every (bucket, value, quota) the
    first ``quota`` still-unassigned rows with that value receive the bucket
    number. A warning is printed for any value whose number of assigned rows
    differs from the total quota requested for it.

    Args:
        df: DataFrame to annotate; it is modified in place.
        bucket_details: Mapping of column name to its per-bucket quota list.
        column_names: Columns to annotate.

    Returns:
        The same DataFrame object with the bucket columns added.
    """
    for column in column_names:
        target = f"{column}_bucket"
        df[target] = None

        source_values = df[column]
        assigned_per_value = {}

        for bucket_no, quotas in enumerate(bucket_details[column]):
            for member_value, quota in quotas.items():
                # Candidates: rows with this value that have no bucket yet.
                is_candidate = (source_values == member_value) & df[target].isnull()
                chosen = df.index[is_candidate.to_numpy()][:quota]
                if chosen.empty:
                    continue
                df.loc[chosen, target] = bucket_no
                assigned_per_value[member_value] = assigned_per_value.get(member_value, 0) + len(chosen)

        # Cross-check the assigned totals against the requested quotas.
        for member_value, assigned in assigned_per_value.items():
            expected = sum(quotas.get(member_value, 0) for quotas in bucket_details[column])
            if assigned != expected:
                print(f"Warning: Mismatch in counts for value {member_value} in column {column}: expected {expected}, got {assigned}")

    return df

def calculate_bucket_allocations(grouped_data, num_buckets):
    """Greedily pack value frequencies into equally sized buckets.

    Values are taken in the row order of ``grouped_data``. Each bucket holds
    ``ceil(total / num_buckets)`` tuples; a value whose frequency does not fit
    in the space left in the current bucket spills into the following ones.

    Args:
        grouped_data: DataFrame with a ``value`` column and an integer
            ``freq`` column (rows per value).
        num_buckets: Number of buckets to fill.

    Returns:
        A tuple ``(bucket_allocations, bucket_details)`` where
        ``bucket_allocations`` maps each value to ``(first_bucket,
        last_bucket)`` — the inclusive range of buckets that actually hold
        tuples of that value — and ``bucket_details`` is a list with one dict
        per bucket mapping value to the number of its tuples in that bucket.
    """
    total_tuples = grouped_data['freq'].sum()
    ideal_tuples_per_bucket = math.ceil(total_tuples / num_buckets)

    bucket_details = [{} for _ in range(num_buckets)]
    bucket_allocations = {}
    current_bucket = 0
    remaining_space_in_bucket = ideal_tuples_per_bucket

    # Iterate column-wise rather than with iterrows(): iterrows builds one
    # Series per row and would upcast ``freq`` to float whenever ``value`` is
    # a float column, turning the stored counts into floats.
    for value, freq in zip(grouped_data['value'], grouped_data['freq']):
        freq = int(freq)
        start_bucket = current_bucket
        end_bucket = start_bucket

        while freq > 0:
            space_used = min(freq, remaining_space_in_bucket)
            freq -= space_used
            remaining_space_in_bucket -= space_used

            details = bucket_details[current_bucket]
            details[value] = details.get(value, 0) + space_used
            # Remember the last bucket that really received tuples. Deriving
            # it after the loop from ``current_bucket`` is off by one when the
            # value exactly fills a bucket, because the cursor has then
            # already moved on to the next (untouched) bucket.
            end_bucket = current_bucket

            if remaining_space_in_bucket == 0:
                current_bucket += 1
                remaining_space_in_bucket = ideal_tuples_per_bucket if current_bucket < num_buckets else 0

        bucket_allocations[value] = (start_bucket, end_bucket)

    return bucket_allocations, bucket_details

def process_columns(df, num_buckets, column_names):
    """Bucket every listed column by value frequency and annotate the frame.

    For each column a (value, freq) table is built from the group sizes and
    handed to ``calculate_bucket_allocations``; the resulting per-bucket
    quotas are then applied to the rows by ``apply_bucket_mapping``.

    Args:
        df: DataFrame holding the columns to bucket.
        num_buckets: Number of buckets per column.
        column_names: Columns to process.

    Returns:
        A tuple of the DataFrame returned by ``apply_bucket_mapping`` and a
        dict mapping each column name to its bucket allocations.
    """
    all_allocations, all_bucket_details = {}, {}

    for column in column_names:
        print(f"Processing column: {column}")
        frequency_table = (
            df.groupby(column)
            .size()
            .reset_index(name='freq')
            .rename(columns={column: 'value'})
        )
        allocations, details = calculate_bucket_allocations(frequency_table, num_buckets)
        all_allocations[column] = allocations
        all_bucket_details[column] = details

    mapped = apply_bucket_mapping(df, all_bucket_details, column_names)
    return mapped, all_allocations

def allocate_buckets_continuous(df, continuous_columns, num_buckets):
    """Bucket continuous columns into equal-count buckets in ascending order.

    Every bucket holds ``ceil(len(df) / num_buckets)`` rows. Distinct values
    are taken in ascending order and packed greedily, so a value with many
    rows may span several consecutive buckets. The per-bucket quotas are then
    applied to the rows by ``apply_bucket_mapping``.

    Args:
        df: DataFrame holding the columns to bucket.
        continuous_columns: Columns to process.
        num_buckets: Number of buckets per column.

    Returns:
        A tuple of the DataFrame returned by ``apply_bucket_mapping`` and a
        dict mapping each column name to its list of per-bucket
        ``{value: row_count}`` dicts.
    """
    total_tuples = len(df)
    ideal_tuples_per_bucket = math.ceil(total_tuples / num_buckets)

    all_bucket_details = {}

    for column in continuous_columns:
        # Packing the sorted rows one at a time gives the same quotas as
        # packing each distinct value's row count in ascending value order,
        # so work from the per-value counts instead of a Python loop over
        # every row. dropna=False keeps missing values occupying bucket space
        # at the end of the order, as they do when the rows are sorted.
        value_counts = df[column].value_counts(sort=False, dropna=False).sort_index()

        bucket_details = [{} for _ in range(num_buckets)]
        current_bucket = 0
        remaining_space_in_bucket = ideal_tuples_per_bucket

        for value, freq in value_counts.items():
            freq = int(freq)
            while freq > 0:
                space_used = min(freq, remaining_space_in_bucket)
                details = bucket_details[current_bucket]
                details[value] = details.get(value, 0) + space_used

                freq -= space_used
                remaining_space_in_bucket -= space_used
                if remaining_space_in_bucket == 0:
                    current_bucket += 1
                    remaining_space_in_bucket = ideal_tuples_per_bucket if current_bucket < num_buckets else 0

        all_bucket_details[column] = bucket_details

    df = apply_bucket_mapping(df, all_bucket_details, continuous_columns)
    return df, all_bucket_details

# Example usage: build bucket assignments for the reduced taxi CSV.
# The indices below are positions in 'taxi_jan_with_sec.csv'. With the column
# order that the first cell writes to that file they are:
# 0 passenger_count, 1 trip_distance, 2 PULocationID, 3 DOLocationID,
# 4 payment_type, 5 fare_amount, 6 tpep_pickup_datetime, 7 tpep_dropoff_datetime.
primary_cols = [0, 2, 3, 4]
# secondary_cols[i] is folded into primary_cols[i]; -1 means "no secondary".
secondary_cols = [6, -1, -1, 7]
exclude_col_index = 5  # Exclude column 5 from processing

df = init_tuples_and_data_info('taxi_jan_with_sec.csv', primary_cols, secondary_cols, exclude_col_index)
print(df.head())

continuous_columns = ['trip_distance']
discrete_columns = ['passenger_count', 'PULocationID', 'DOLocationID', 'payment_type']
num_buckets = 128

# Both helpers pass `df` to apply_bucket_mapping, which adds the "<col>_bucket"
# columns to the frame it is given and returns that same frame. `df` therefore
# carries every bucket column after these two calls; the plotting cells below
# rely on that.
df_mapped_discrete, bucket_allocations_discrete = process_columns(df, num_buckets, discrete_columns)
df_mapped_continuous, bucket_allocations_continuous = allocate_buckets_continuous(df, continuous_columns, num_buckets)

# Combine discrete and continuous bucket allocations
df_mapped = df_mapped_discrete.copy()
for col in continuous_columns:
    df_mapped[f"{col}_bucket"] = df_mapped_continuous[f"{col}_bucket"]

# Persist the annotated tuples without the row index.
df_mapped.to_csv('tuples_with_buckets_taxi_sec.csv', index=False)
print("Mapped DataFrame with bucket assignments has been saved to 'tuples_with_buckets_taxi_sec.csv'.")



   passenger_count  trip_distance  PULocationID  DOLocationID  payment_type  \
0             1000            1.5           151           239          1000   
1             1000            2.6           239           246          1001   
2             3013            0.0           236           236          1013   
3             5015            0.0           193           193          2015   
4             5015            0.0           193           193          2015   

   fare_amount  
0          7.0  
1         14.0  
2          4.5  
3          3.5  
4         52.0  
Processing column: passenger_count
Processing column: PULocationID
Processing column: DOLocationID
Processing column: payment_type
Mapped DataFrame with bucket assignments has been saved to 'tuples_with_buckets_taxi_sec.csv'.


In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

def sanity_check_row_and_col_buckets(interaction_matrix):
    """Print the per-column and per-row totals of an interaction matrix.

    Args:
        interaction_matrix: 2-D array whose ``sum(axis=0)`` gives the x-axis
            (column) totals and ``sum(axis=1)`` the y-axis (row) totals.
    """
    reports = (
        ('Sum of tuples for each x-axis bucket (Column sums):', interaction_matrix.sum(axis=0)),
        ('Sum of tuples for each y-axis bucket (Row sums):', interaction_matrix.sum(axis=1)),
    )
    for heading, totals in reports:
        print(heading)
        for bucket_no, total in enumerate(totals):
            print(f'Bucket {bucket_no}: {total}')

def plot_interactions(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Draw bucket-level and value-level co-occurrence heatmaps for two columns.

    The left panel counts rows per (bucket of col1, bucket of col2) pair in a
    ``num_buckets`` x ``num_buckets`` grid; the right panel shows the
    cross-tabulation of the raw column values.

    Args:
        df: DataFrame containing ``col1``, ``col2`` and their bucket columns
            (``col + bucket_suffix``).
        col1: Column shown on the y-axis.
        col2: Column shown on the x-axis.
        secondary1: Optional label appended to ``col1`` in the titles.
        secondary2: Optional label appended to ``col2`` in the titles.
        dataset_name: Dataset name used in the page title.
        distribution_type: Distribution description used in the page title.
        scale: Scale description used in the page title.
        bucket_suffix: Suffix that turns a column name into its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The matplotlib Figure holding both heatmaps.
    """
    bucket_col1, bucket_col2 = col1 + bucket_suffix, col2 + bucket_suffix

    # Axis descriptions shared by both panel titles.
    label1 = col1 + " + " + secondary1 if secondary1 else col1
    label2 = col2 + " + " + secondary2 if secondary2 else col2

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: row counts per bucket pair.
    pair_counts = df.groupby([bucket_col1, bucket_col2]).size().reset_index(name='counts')
    interaction_matrix = np.zeros((num_buckets, num_buckets), dtype=int)
    for raw1, raw2, count in zip(pair_counts[bucket_col1], pair_counts[bucket_col2], pair_counts['counts']):
        bucket1, bucket2 = int(raw1), int(raw2)
        if bucket1 >= num_buckets or bucket2 >= num_buckets:
            continue  # outside the plotted grid
        interaction_matrix[bucket1, bucket2] += count

    sns.heatmap(interaction_matrix, cmap='hot', annot=False, ax=ax1)
    ax1.set_title(f'Bucket Heatmap between {label1} and {label2}', pad=12)
    ax1.set_ylabel(bucket_col1)
    ax1.set_xlabel(bucket_col2)

    # Right panel: row counts per raw value pair.
    value_matrix = pd.crosstab(df[col1], df[col2]).values
    sns.heatmap(value_matrix, cmap='hot', annot=False, ax=ax2)
    ax2.set_title(f'Value Heatmap between {label1} and {label2}', pad=12)
    ax2.set_ylabel(col1)
    ax2.set_xlabel(col2)

    # Page title; the layout leaves the top strip free for it.
    fig.suptitle(f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale', fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='taxi_freq_minmax_sec.pdf'):
    """Write one page of frequency heatmaps per unordered column pair to a PDF.

    Args:
        df: DataFrame passed through to ``plot_interactions``.
        columns: Column names; every pair ``(columns[i], columns[j])`` with
            ``i < j`` gets one page.
        secondary_columns: Title labels parallel to ``columns`` (or None).
        dataset_name: Dataset name used in the page titles.
        distribution_type: Distribution description used in the page titles.
        scale: Scale description used in the page titles.
        num_buckets: Side length of the bucket grid.
        output_path: Destination PDF. Defaults to the previously hard-coded
            file name so existing callers are unaffected.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                fig = plot_interactions(
                    df, columns[i], columns[j],
                    secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale,
                    num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving fails, so a failed
                    # run does not leave open figures behind.
                    plt.close(fig)

# Usage example assuming secondary_columns directly contains the names or None.
# `df` must already contain a "<column>_bucket" column for every entry of
# `columns`, because plot_interactions reads col + '_bucket'.
columns = ['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'payment_type']
# Parallel to `columns`; each entry is only used as a label in the plot titles.
secondary_columns = ['pickup_time', None, None, None, 'dropoff_time']
# Alternative with no secondary labels:
# secondary_columns = [None, None, None, None, None]
dataset_name = "Taxi"
distribution_type = "frequency"
scale = "absolute"
generate_interaction_plots(df, columns, secondary_columns, dataset_name, distribution_type, scale)


In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import TwoSlopeNorm, LinearSegmentedColormap
from matplotlib.backends.backend_pdf import PdfPages

def plot_interactions_ideal(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Draw co-occurrence heatmaps coloured relative to the average cell count.

    Both panels use a green-black-red colormap centred on the average count
    per cell (black), with 0 mapped to green and twice the average to red.
    The left panel counts rows per bucket pair, the right panel per raw value
    pair.

    Args:
        df: DataFrame containing ``col1``, ``col2`` and their bucket columns
            (``col + bucket_suffix``).
        col1: Column shown on the y-axis.
        col2: Column shown on the x-axis.
        secondary1: Optional label appended to ``col1`` in the titles.
        secondary2: Optional label appended to ``col2`` in the titles.
        dataset_name: Dataset name used in the page title.
        distribution_type: Distribution description used in the page title.
        scale: Scale description used in the page title.
        bucket_suffix: Suffix that turns a column name into its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The matplotlib Figure holding both heatmaps.
    """
    def centred_norm_kwargs(center):
        # TwoSlopeNorm needs vmin < vcenter < vmax and raises otherwise, which
        # happens with vmin=0 whenever the average is 0 (no data) or not
        # finite. Fall back to the default colour scaling in that case.
        if np.isfinite(center) and center > 0:
            return {'norm': TwoSlopeNorm(vmin=0, vcenter=center, vmax=2 * center)}
        return {}

    # Determine the bucketed column names
    bucket_col1 = col1 + bucket_suffix
    bucket_col2 = col2 + bucket_suffix

    # Axis descriptions shared by both panel titles.
    label1 = col1 + " + " + secondary1 if secondary1 else col1
    label2 = col2 + " + " + secondary2 if secondary2 else col2

    # Prepare figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: row counts per bucket pair.
    interaction_matrix = np.zeros((num_buckets, num_buckets), dtype=int)
    interaction_counts = df.groupby([bucket_col1, bucket_col2]).size().reset_index(name='counts')
    for raw1, raw2, count in zip(interaction_counts[bucket_col1], interaction_counts[bucket_col2], interaction_counts['counts']):
        bucket1, bucket2 = int(raw1), int(raw2)
        if bucket1 < num_buckets and bucket2 < num_buckets:
            interaction_matrix[bucket1, bucket2] += count

    # Average count per grid cell (num_buckets ** 2 cells).
    cell_count = interaction_matrix.size
    average_frequency_per_bucket = interaction_matrix.sum() / cell_count if cell_count else 0.0

    # Custom colormap: green to black to red
    cmap = LinearSegmentedColormap.from_list("custom_cmap", ["green", "black", "red"])

    sns.heatmap(interaction_matrix, cmap=cmap, annot=False, ax=ax1, **centred_norm_kwargs(average_frequency_per_bucket))
    ax1.set_title(f'Bucket Heatmap between {label1} and {label2}', pad=12)
    ax1.set_ylabel(bucket_col1)
    ax1.set_xlabel(bucket_col2)

    # Right panel: row counts per raw value pair.
    value_matrix = pd.crosstab(df[col1], df[col2]).values

    # Average over the cells that are actually plotted. Dividing by the
    # unique() counts instead would also count missing values, which the
    # cross-tabulation leaves out.
    pair_count = value_matrix.size
    average_frequency_per_value_pair = value_matrix.sum() / pair_count if pair_count else 0.0

    sns.heatmap(value_matrix, cmap=cmap, annot=False, ax=ax2, **centred_norm_kwargs(average_frequency_per_value_pair))
    ax2.set_title(f'Value Heatmap between {label1} and {label2}', pad=12)
    ax2.set_ylabel(col1)
    ax2.set_xlabel(col2)

    # Add a main title for the page
    main_title = f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale'
    fig.suptitle(main_title, fontsize=16)
    # Print the average frequency at the bottom of the page
    fig.text(0.5, 0.01, f'Average Frequency per Bucket: {average_frequency_per_bucket:.2f}', ha='center', fontsize=12)
    plt.tight_layout(rect=[0, 0, 1, 0.93])
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots_ideal(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='taxi_freq_ideal_sec.pdf'):
    """Write one page of average-centred frequency heatmaps per column pair.

    Args:
        df: DataFrame passed through to ``plot_interactions_ideal``.
        columns: Column names; every pair ``(columns[i], columns[j])`` with
            ``i < j`` gets one page.
        secondary_columns: Title labels parallel to ``columns`` (or None).
        dataset_name: Dataset name used in the page titles.
        distribution_type: Distribution description used in the page titles.
        scale: Scale description used in the page titles.
        num_buckets: Side length of the bucket grid.
        output_path: Destination PDF. Defaults to the previously hard-coded
            file name so existing callers are unaffected.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                # The labels are passed through unchanged; None already means
                # "no secondary label" to the plotting function.
                fig = plot_interactions_ideal(
                    df, columns[i], columns[j],
                    secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale,
                    num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving fails.
                    plt.close(fig)

# Usage example assuming secondary_columns directly contains the names or None.
# `df` must already contain a "<column>_bucket" column for every entry of
# `columns`, because plot_interactions_ideal reads col + '_bucket'.
columns = ['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'payment_type']
# Alternative with no secondary labels:
# secondary_columns = [None, None, None, None, None]
# Parallel to `columns`; each entry is only used as a label in the plot titles.
secondary_columns = ['pickup_time', None, None, None, 'dropoff_time']
dataset_name = "Taxi"
distribution_type = "frequency"
scale = "normalized"
generate_interaction_plots_ideal(df, columns, secondary_columns, dataset_name, distribution_type, scale)


In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

def sanity_check_row_and_col_buckets(interaction_matrix):
    """Print the per-column and per-row fare totals of an interaction matrix.

    Args:
        interaction_matrix: 2-D array whose ``sum(axis=0)`` gives the x-axis
            (column) totals and ``sum(axis=1)`` the y-axis (row) totals.
    """
    reports = (
        ('Sum of fares for each x-axis bucket (Column sums):', interaction_matrix.sum(axis=0)),
        ('Sum of fares for each y-axis bucket (Row sums):', interaction_matrix.sum(axis=1)),
    )
    for heading, totals in reports:
        print(heading)
        for bucket_no, total in enumerate(totals):
            print(f'Bucket {bucket_no}: {total}')

def plot_interactions_fare(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Draw bucket-level and value-level total-fare heatmaps for two columns.

    The left panel sums ``fare_amount`` per (bucket of col1, bucket of col2)
    pair in a ``num_buckets`` x ``num_buckets`` grid. The right panel sums
    ``fare_amount`` per raw value pair; its axes show 1-based positions of
    the values in order of first appearance instead of the values themselves.

    Args:
        df: DataFrame containing ``col1``, ``col2``, their bucket columns
            (``col + bucket_suffix``) and ``fare_amount``.
        col1: Column shown on the y-axis.
        col2: Column shown on the x-axis.
        secondary1: Optional label appended to ``col1`` in the titles.
        secondary2: Optional label appended to ``col2`` in the titles.
        dataset_name: Dataset name used in the page title.
        distribution_type: Distribution description used in the page title.
        scale: Scale description used in the page title.
        bucket_suffix: Suffix that turns a column name into its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The matplotlib Figure holding both heatmaps.
    """
    # Determine the bucketed column names
    bucket_col1 = col1 + bucket_suffix
    bucket_col2 = col2 + bucket_suffix

    # Axis descriptions shared by both panel titles.
    label1 = col1 + " + " + secondary1 if secondary1 else col1
    label2 = col2 + " + " + secondary2 if secondary2 else col2

    # Prepare figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Plot Bucket Interactions (Summing up fare_amount)
    interaction_matrix = np.zeros((num_buckets, num_buckets), dtype=float)
    bucket_fares = df.groupby([bucket_col1, bucket_col2])['fare_amount'].sum().reset_index(name='total_fare')
    for raw1, raw2, total_fare in zip(bucket_fares[bucket_col1], bucket_fares[bucket_col2], bucket_fares['total_fare']):
        bucket1, bucket2 = int(raw1), int(raw2)
        if bucket1 < num_buckets and bucket2 < num_buckets:
            interaction_matrix[bucket1, bucket2] += total_fare

    # Plot the heatmap
    sns.heatmap(interaction_matrix, cmap='hot', annot=False, ax=ax1)
    ax1.set_title(f'Bucket Heatmap (Total Fare) between {label1} and {label2}', pad=12)
    ax1.set_ylabel(bucket_col1)
    ax1.set_xlabel(bucket_col2)

    # Plot Value Interactions (Summing up fare_amount)
    unique_values_col1 = df[col1].unique()
    unique_values_col2 = df[col2].unique()
    # Pivot the grouped sums in one step instead of assigning each pair with
    # .loc inside an iterrows() loop (slow, and iterrows converts the labels
    # to float). reindex restores the order of first appearance and fills
    # value pairs that never occur with 0.0.
    value_fares = (
        df.groupby([col1, col2])['fare_amount'].sum()
        .unstack(fill_value=0.0)
        .reindex(index=unique_values_col1, columns=unique_values_col2, fill_value=0.0)
    )

    # Replace the raw values on both axes by their 1-based positions.
    interaction_matrix_indexed = value_fares.copy()
    interaction_matrix_indexed.index = range(1, len(unique_values_col1) + 1)
    interaction_matrix_indexed.columns = range(1, len(unique_values_col2) + 1)

    sns.heatmap(interaction_matrix_indexed, cmap='hot', annot=False, ax=ax2)
    ax2.set_title(f'Value Heatmap (Total Fare) between {label1} and {label2}', pad=12)
    ax2.set_ylabel(f'{col1} (Indices)')
    ax2.set_xlabel(f'{col2} (Indices)')

    # Add a main title for the page
    main_title = f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale'
    fig.suptitle(main_title, fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots_fare(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='taxi_price_minmax_sec.pdf'):
    """Write one page of total-fare heatmaps per unordered column pair to a PDF.

    Args:
        df: DataFrame passed through to ``plot_interactions_fare``.
        columns: Column names; every pair ``(columns[i], columns[j])`` with
            ``i < j`` gets one page.
        secondary_columns: Title labels parallel to ``columns`` (or None).
        dataset_name: Dataset name used in the page titles.
        distribution_type: Distribution description used in the page titles.
        scale: Scale description used in the page titles.
        num_buckets: Side length of the bucket grid.
        output_path: Destination PDF. Defaults to the previously hard-coded
            file name so existing callers are unaffected.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                fig = plot_interactions_fare(
                    df, columns[i], columns[j],
                    secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale,
                    num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving fails.
                    plt.close(fig)

# Usage example assuming secondary_columns directly contains the names or None.
# `df` must already contain 'fare_amount' and a "<column>_bucket" column for
# every entry of `columns`, because plot_interactions_fare reads both.
columns = ['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'payment_type']
# Parallel to `columns`; each entry is only used as a label in the plot titles.
secondary_columns = ['pickup_time', None, None, None, 'dropoff_time']
# Alternative with no secondary labels:
# secondary_columns = [None, None, None, None, None]
dataset_name = "Taxi"
distribution_type = "fare"
scale = "Absolute"
generate_interaction_plots_fare(df, columns, secondary_columns, dataset_name, distribution_type, scale)


In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import TwoSlopeNorm, LinearSegmentedColormap
from matplotlib.backends.backend_pdf import PdfPages

def plot_interactions_fare_ideal(df, col1, col2, secondary1, secondary2, dataset_name, distribution_type, scale, bucket_suffix='_bucket', num_buckets=128):
    """Draw total-fare heatmaps coloured relative to the average fare per cell.

    Both panels use a green-black-red colormap centred on the average total
    fare per cell (black), with 0 mapped to green and twice the average to
    red. The left panel sums ``fare_amount`` per bucket pair; the right panel
    sums it per raw value pair, with axes showing 1-based positions of the
    values in order of first appearance.

    Args:
        df: DataFrame containing ``col1``, ``col2``, their bucket columns
            (``col + bucket_suffix``) and ``fare_amount``.
        col1: Column shown on the y-axis.
        col2: Column shown on the x-axis.
        secondary1: Optional label appended to ``col1`` in the titles.
        secondary2: Optional label appended to ``col2`` in the titles.
        dataset_name: Dataset name used in the page title.
        distribution_type: Distribution description used in the page title.
        scale: Scale description used in the page title.
        bucket_suffix: Suffix that turns a column name into its bucket column.
        num_buckets: Side length of the bucket grid.

    Returns:
        The matplotlib Figure holding both heatmaps.
    """
    def centred_norm_kwargs(center):
        # TwoSlopeNorm needs vmin < vcenter < vmax and raises otherwise. With
        # vmin=0 that happens when the average fare is zero, negative or not
        # finite; fall back to the default colour scaling in that case.
        if np.isfinite(center) and center > 0:
            return {'norm': TwoSlopeNorm(vmin=0, vcenter=center, vmax=2 * center)}
        return {}

    # Determine the bucketed column names
    bucket_col1 = col1 + bucket_suffix
    bucket_col2 = col2 + bucket_suffix

    # Axis descriptions shared by both panel titles.
    label1 = col1 + " + " + secondary1 if secondary1 else col1
    label2 = col2 + " + " + secondary2 if secondary2 else col2

    # Prepare figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Plot Bucket Interactions (Summing up fare_amount)
    interaction_matrix = np.zeros((num_buckets, num_buckets), dtype=float)
    bucket_fares = df.groupby([bucket_col1, bucket_col2])['fare_amount'].sum().reset_index(name='total_fare')
    for raw1, raw2, total_fare in zip(bucket_fares[bucket_col1], bucket_fares[bucket_col2], bucket_fares['total_fare']):
        bucket1, bucket2 = int(raw1), int(raw2)
        if bucket1 < num_buckets and bucket2 < num_buckets:
            interaction_matrix[bucket1, bucket2] += total_fare

    # Average total fare per grid cell (num_buckets ** 2 cells).
    cell_count = interaction_matrix.size
    average_fare_buckets = interaction_matrix.sum() / cell_count if cell_count else 0.0

    # Custom colormap: green to black to red
    cmap = LinearSegmentedColormap.from_list("custom_cmap", ["green", "black", "red"])

    sns.heatmap(interaction_matrix, cmap=cmap, annot=False, ax=ax1, **centred_norm_kwargs(average_fare_buckets))
    ax1.set_title(f'Bucket Heatmap (Total Fare) between {label1} and {label2}', pad=12)
    ax1.set_ylabel(bucket_col1)
    ax1.set_xlabel(bucket_col2)

    # Plot Value Interactions (Summing up fare_amount)
    unique_values_col1 = df[col1].unique()
    unique_values_col2 = df[col2].unique()
    # Pivot the grouped sums in one step instead of assigning each pair with
    # .loc inside an iterrows() loop (slow, and iterrows converts the labels
    # to float). reindex restores the order of first appearance and fills
    # value pairs that never occur with 0.0.
    value_fares = (
        df.groupby([col1, col2])['fare_amount'].sum()
        .unstack(fill_value=0.0)
        .reindex(index=unique_values_col1, columns=unique_values_col2, fill_value=0.0)
    )

    # Replace the raw values on both axes by their 1-based positions.
    interaction_matrix_indexed = value_fares.copy()
    interaction_matrix_indexed.index = range(1, len(unique_values_col1) + 1)
    interaction_matrix_indexed.columns = range(1, len(unique_values_col2) + 1)

    # Average total fare per value pair, over every cell of the value grid.
    pair_count = len(unique_values_col1) * len(unique_values_col2)
    average_fare_values = interaction_matrix_indexed.values.sum() / pair_count if pair_count else 0.0

    sns.heatmap(interaction_matrix_indexed, cmap=cmap, annot=False, ax=ax2, **centred_norm_kwargs(average_fare_values))
    ax2.set_title(f'Value Interaction Heatmap (Total Fare) between {label1} and {label2}', pad=12)
    ax2.set_ylabel(f'{col1} (Indices)')
    ax2.set_xlabel(f'{col2} (Indices)')

    # Add a main title for the page
    main_title = f'Plots of {dataset_name} dataset showing {distribution_type} distribution in {scale} scale'
    fig.suptitle(main_title, fontsize=16)
    # Print the average price at the bottom of the page
    fig.text(0.5, 0.01, f'Average Fare per Bucket: {average_fare_buckets:.2f}', ha='center', fontsize=12)
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig

# Creating the PDF with plots for each pair of columns
def generate_interaction_plots_fare_ideal(df, columns, secondary_columns, dataset_name, distribution_type, scale, num_buckets=128, output_path='taxi_price_ideal_sec.pdf'):
    """Write one page of average-centred total-fare heatmaps per column pair.

    Args:
        df: DataFrame passed through to ``plot_interactions_fare_ideal``.
        columns: Column names; every pair ``(columns[i], columns[j])`` with
            ``i < j`` gets one page.
        secondary_columns: Title labels parallel to ``columns`` (or None).
        dataset_name: Dataset name used in the page titles.
        distribution_type: Distribution description used in the page titles.
        scale: Scale description used in the page titles.
        num_buckets: Side length of the bucket grid.
        output_path: Destination PDF. Defaults to the previously hard-coded
            file name so existing callers are unaffected.
    """
    with PdfPages(output_path) as pdf:
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                # The labels are passed through unchanged; None already means
                # "no secondary label" to the plotting function.
                fig = plot_interactions_fare_ideal(
                    df, columns[i], columns[j],
                    secondary_columns[i], secondary_columns[j],
                    dataset_name, distribution_type, scale,
                    num_buckets=num_buckets,
                )
                try:
                    pdf.savefig(fig)
                finally:
                    # Release the figure even if saving fails.
                    plt.close(fig)

# Usage example assuming secondary_columns directly contains the names or None.
# `df` must already contain 'fare_amount' and a "<column>_bucket" column for
# every entry of `columns`, because plot_interactions_fare_ideal reads both.
columns = ['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'payment_type']
# Parallel to `columns`; each entry is only used as a label in the plot titles.
secondary_columns = ['pickup_time', None, None, None, 'dropoff_time']
# Alternative with no secondary labels:
# secondary_columns = [None, None, None, None, None]
dataset_name = "Taxi"
distribution_type = "fare"
scale = "Normalized"

generate_interaction_plots_fare_ideal(df, columns, secondary_columns, dataset_name, distribution_type, scale)
