In [None]:
from google.colab import auth, files
import pandas as pd
from google.cloud import storage
import os
from datetime import datetime
import pytz
import numpy as np

# Authenticate to GCS
# NOTE(review): presumably this Colab sign-in has to run before storage.Client()
# below so the client can pick up the user's credentials - keep this ordering.
auth.authenticate_user()

# Define the GCS bucket
# Name of the bucket that holds every folder referenced later in this script.
bucket_name = 'prd-marketshare'

# Set up GCS client
# `bucket` is the handle handed to get_latest_file_path() and save_to_bucket().
client = storage.Client()
bucket = client.get_bucket(bucket_name)

# Function to get the latest file path from a bucket folder
def get_latest_file_path(bucket, folder_path):
    """Return the ``gs://`` URI of the newest object stored under a prefix.

    Args:
        bucket: Bucket object exposing ``list_blobs(prefix=...)`` and ``name``.
        folder_path: Object-name prefix to search, e.g.
            ``'01_pihs/02_pihs_cleaned/'``.

    Returns:
        ``"gs://<bucket name>/<object name>"`` for the object with the latest
        ``time_created``, or ``None`` when the prefix holds no data objects.
    """
    # GCS has no real directories: a "folder" created in the console is a
    # zero-byte object whose name ends with "/". list_blobs() returns it along
    # with the files, so drop such placeholders; otherwise an empty folder
    # would yield a directory URI that the CSV reader cannot open.
    candidates = [
        blob
        for blob in bucket.list_blobs(prefix=folder_path)
        if not blob.name.endswith('/')
    ]
    if not candidates:
        return None
    newest = max(candidates, key=lambda blob: blob.time_created)
    return f"gs://{bucket.name}/{newest.name}"

# Function to save files to bucket folders
def save_to_bucket(df, bucket, destination_folder, file_name):
    """Upload a DataFrame as a CSV object at ``<destination_folder>/<file_name>``.

    The CSV is serialised in memory and sent straight to the bucket, so no
    copy is left behind on the local disk and concurrent runs cannot clash on
    a shared temporary file name.

    Args:
        df: DataFrame to export; the index is not written.
        bucket: Bucket object exposing ``blob(name)``.
        destination_folder: Target folder inside the bucket. Leading and
            trailing slashes are ignored so the object name never contains
            an empty path segment.
        file_name: Name of the CSV object to create.
    """
    folder = destination_folder.strip('/')
    blob_name = f'{folder}/{file_name}' if folder else file_name

    csv_text = df.to_csv(index=False)
    # Set the content type explicitly: upload_from_string() would otherwise
    # label the object as text/plain.
    bucket.blob(blob_name).upload_from_string(csv_text, content_type='text/csv')

# Get the paths of the latest files from specified folders
vl_file_gcs_path = get_latest_file_path(bucket, '02_value_list/02_vl_cleaned/')
pihs_file_gcs_path = get_latest_file_path(bucket, '01_pihs/02_pihs_cleaned/')

# Stop right away when an input is missing. Merely printing a message would let
# the script run on into a NameError or, in a notebook session, silently reuse
# a stale `vl` / `pihs` left over from an earlier run.
if not vl_file_gcs_path:
    raise FileNotFoundError("No vl file found")
if not pihs_file_gcs_path:
    raise FileNotFoundError("No pihs file found")

# Read the latest files into DataFrames directly from GCS paths
vl = pd.read_csv(vl_file_gcs_path)
# Rules flagged as removed must not be applied. The .copy() detaches the
# filtered frame from the original so the column assignments below operate on
# an independent object (no SettingWithCopyWarning).
vl = vl[vl['change_tracker'] != 'removed'].copy()

pihs = pd.read_csv(pihs_file_gcs_path)


# Define columns for exact and range matching
exact_match_columns = [
    'electrification', 'propulsion_system_subdesign_architecture', 'ap_px_definition'
]

# Each tuple is (lower bound column in vl, upper bound column in vl,
# range type column in vl, column in pihs the range is applied to).
range_match_columns = [
    ('system_voltage_v_lower_bound', 'system_voltage_v_upper_bound', 'system_voltage_v_range_type', 'system_voltage_v')
]

# Explicitly set the specified columns in vl to float
float_columns_vl = [
    'system_voltage_v_lower_bound', 'system_voltage_v_upper_bound'
]

# errors='coerce' turns unparsable entries into NaN instead of raising.
for col in float_columns_vl:
    vl[col] = pd.to_numeric(vl[col], errors='coerce').astype('float')

# Explicitly set the specified columns in pihs to float
float_columns_pihs = [
    'system_voltage_v',
    'ap_system_power_kw',
    'electric_motor_power_kw'
]

for col in float_columns_pihs:
    pihs[col] = pd.to_numeric(pihs[col], errors='coerce').astype('float')

# Define the handle_range_match function
def handle_range_match(vl_row, lower_bound_col, upper_bound_col, range_type_col, pihs_col):
    """Translate the range rule of one value-list row into a query fragment.

    Args:
        vl_row: Mapping or Series holding the rule (indexed by column name).
        lower_bound_col: Key in ``vl_row`` holding the lower bound.
        upper_bound_col: Key in ``vl_row`` holding the upper bound.
        range_type_col: Key in ``vl_row`` holding the range type label.
        pihs_col: Name of the column the fragment compares against.

    Returns:
        A string usable inside ``DataFrame.query``, or ``None`` when the range
        type is null or unknown, or when a bound the range type needs is null.
    """
    lower_bound_value = vl_row[lower_bound_col]
    upper_bound_value = vl_row[upper_bound_col]
    range_type = vl_row[range_type_col]

    if pd.isnull(range_type):
        return None

    has_lower = not pd.isnull(lower_bound_value)
    has_upper = not pd.isnull(upper_bound_value)

    # Range types that need only the lower bound, with their operator.
    lower_only_ops = {'equal': '==', 'greater than': '>', 'greater or equal': '>='}
    # Range types that need only the upper bound, with their operator.
    upper_only_ops = {'less than': '<', 'less or equal': '<='}
    # Closed/open intervals: (operator for lower bound, operator for upper bound).
    interval_ops = {
        'exclusive-exclusive': ('>', '<'),
        'exclusive-inclusive': ('>', '<='),
        'inclusive-exclusive': ('>=', '<'),
        'inclusive-inclusive': ('>=', '<='),
    }

    if range_type in lower_only_ops:
        if not has_lower:
            return None
        return f"({pihs_col} {lower_only_ops[range_type]} {lower_bound_value})"

    if range_type in upper_only_ops:
        if not has_upper:
            return None
        return f"({pihs_col} {upper_only_ops[range_type]} {upper_bound_value})"

    if range_type in interval_ops:
        if not (has_lower and has_upper):
            return None
        lower_op, upper_op = interval_ops[range_type]
        return (
            f"({pihs_col} {lower_op} {lower_bound_value})"
            f" & ({pihs_col} {upper_op} {upper_bound_value})"
        )

    if range_type == 'non-inclusive gap':
        if not (has_lower and has_upper):
            return None
        # The whole OR expression must be parenthesised: DataFrame.query gives
        # `|` lower precedence than `&`, so without the outer parentheses the
        # "> upper" half would escape every condition this fragment is later
        # AND-ed with and match rows regardless of them.
        return f"(({pihs_col} < {lower_bound_value}) | ({pihs_col} > {upper_bound_value}))"

    return None

# Define the construct_query function
def construct_query(row, exact_match_columns, range_match_columns):
    """Build the ``DataFrame.query`` expression for one value-list rule.

    Args:
        row: Mapping or Series holding the rule (indexed by column name).
        exact_match_columns: Columns compared for equality; a column whose
            value in ``row`` is null is left out of the expression.
        range_match_columns: Iterable of ``(lower_bound_col, upper_bound_col,
            range_type_col, pihs_col)`` tuples forwarded to
            ``handle_range_match``.

    Returns:
        All conditions joined with ``' & '``, or ``None`` when the rule yields
        no condition at all.
    """
    # repr() of the stringified value produces a correctly quoted and escaped
    # literal. For ordinary values this is the same 'value' text as plain
    # single-quoting, but a value containing a quote or a backslash no longer
    # corrupts the expression.
    query_parts = [
        f"{col} == {str(row[col])!r}"
        for col in exact_match_columns
        if pd.notnull(row[col])
    ]

    for lower_bound_col, upper_bound_col, range_type_col, pihs_col in range_match_columns:
        range_query = handle_range_match(row, lower_bound_col, upper_bound_col, range_type_col, pihs_col)
        if range_query:
            query_parts.append(range_query)

    return ' & '.join(query_parts) if query_parts else None

# Collect one record per applied rule and build the query log DataFrame once
# after the loop. DataFrame.append was removed in pandas 2.0, and growing a
# DataFrame row by row copies it on every iteration.
query_log_records = []

# Additional columns from vl to append to merged DataFrame
# (every vl column positioned after 'system_voltage_v_range_type').
additional_columns_vl = vl.columns[vl.columns.get_loc('system_voltage_v_range_type') + 1:].tolist()

# Initialize merged DataFrame with all rows from pihs and additional columns from vl
merged_columns = pihs.columns.tolist() + additional_columns_vl + ['pihs_vl_status']
merged_df = pihs.reindex(columns=merged_columns)
merged_df['pihs_vl_status'] = 'unmatched'

# Add a marker column in pihs to track matched status.
# It is added after merged_df was derived, so it does not appear in the output.
pihs['matched'] = False

# Iterate over each rule in vl and apply to pihs, logging queries.
# Rules are applied in vl order and the first rule that matches a pihs row wins.
for index, row in vl.iterrows():
    rule_number = row['row']  # Assuming 'row' is a column in vl
    query = construct_query(row, exact_match_columns, range_match_columns)
    if not query:
        continue

    query_log_records.append({'rule_number': rule_number, 'query': query})
    matched_rows = pihs.query(query).index
    for idx in matched_rows:
        # Skip rows already claimed by an earlier rule.
        if not pihs.at[idx, 'matched']:
            merged_df.loc[idx, additional_columns_vl] = row[additional_columns_vl]
            merged_df.loc[idx, 'pihs_vl_status'] = rule_number
            pihs.at[idx, 'matched'] = True

# The explicit column list keeps the expected header even if no rule produced a query.
query_log = pd.DataFrame(query_log_records, columns=['rule_number', 'query'])

# Filter merged_df by 'unmatched' status
unmerged_df = merged_df[merged_df['pihs_vl_status'] == 'unmatched']

# Generate file name for the processed files (timestamp in Paris local time)
paris_tz = pytz.timezone('Europe/Paris')
current_time = datetime.now(paris_tz).strftime("%Y%m%d_%H%M%S")

# Save query_log to GCS bucket
query_log_file_name = f'pihs_vl_query_log_{current_time}.csv'
query_log_folder = '02_value_list/05_pihs_vl_query_log'
save_to_bucket(query_log, bucket, query_log_folder, query_log_file_name)

# Save merged_df to GCS bucket
merged_df_file_name = f'pihs_vl_merged_{current_time}.csv'
merged_df_folder = '02_value_list/03_pihs_vl_merged'
save_to_bucket(merged_df, bucket, merged_df_folder, merged_df_file_name)

# Save unmerged_df (rows no rule matched) to GCS bucket
unmerged_df_file_name = f'pihs_vl_unmerged_{current_time}.csv'
unmerged_df_folder = '02_value_list/04_pihs_vl_unmerged'
save_to_bucket(unmerged_df, bucket, unmerged_df_folder, unmerged_df_file_name)

# Print statements to confirm the process
print("\nQuery log:")
print(query_log.head())
print("\nSample of merged DataFrame:")
print(merged_df.head())