In [None]:
from google.colab import auth, files
import pandas as pd
from google.cloud import storage
import os
from datetime import datetime
import pytz
import numpy as np

# Authenticate to GCS
# Must run before the storage client below is created.
# NOTE(review): presumably the client picks up the Colab user's credentials
# from this call - confirm.
auth.authenticate_user()

# Define the GCS bucket
# Every input prefix and output folder used further down lives in this bucket.
bucket_name = 'prd-marketshare'

# Set up GCS client
# `bucket` is the handle passed to get_latest_file_path() and save_to_bucket().
client = storage.Client()
bucket = client.get_bucket(bucket_name)

# Function to get the latest file path from a bucket folder
def get_latest_file_path(bucket, folder_path):
    """Return the ``gs://`` URI of the newest object stored under a prefix.

    Args:
        bucket: Bucket-like object exposing ``list_blobs(prefix=...)`` and
            a ``name`` attribute.
        folder_path: Object-name prefix to search.

    Returns:
        ``gs://<bucket name>/<object name>`` for the object with the greatest
        ``time_created``, or ``None`` when no data object matches the prefix.
    """
    # Zero-byte "folder" placeholder objects (names ending in "/") match the
    # prefix too; they are not files and must never be returned as the latest
    # file, otherwise the caller would try to read a directory marker.
    candidates = [
        blob
        for blob in bucket.list_blobs(prefix=folder_path)
        if not blob.name.endswith('/')
    ]
    if not candidates:
        return None
    latest_blob = max(candidates, key=lambda b: b.time_created)
    return f"gs://{bucket.name}/{latest_blob.name}"

# Function to save files to bucket folders
def save_to_bucket(df, bucket, destination_folder, file_name):
    """Write ``df`` as CSV to ``/tmp`` and upload it into a bucket folder.

    The frame is written without its index to ``/tmp/<file_name>`` and then
    uploaded as ``<destination_folder>/<file_name>``. The local copy is left
    in place after the upload.

    Args:
        df: DataFrame to export.
        bucket: Bucket-like object exposing ``blob(name)``.
        destination_folder: Folder (object-name prefix, no trailing slash)
            inside the bucket.
        file_name: Name of the CSV file, used locally and in the bucket.
    """
    staging_path = '/tmp/{}'.format(file_name)
    object_name = '{}/{}'.format(destination_folder, file_name)

    # Stage locally first: the upload API used here takes a file path.
    df.to_csv(staging_path, index=False)
    bucket.blob(object_name).upload_from_filename(staging_path)

    # The staged file can also be pushed to the browser from Colab if needed:
    # files.download(staging_path)

# Get the paths of the latest files from specified folders.
# Both prefixes end with "/" so that only objects inside the folder itself
# match (a bare prefix would also match sibling folders sharing the same
# leading characters).
ps_prefix = '03_power_schedule/02_power_schedule_cleaned/'
pihs_vl_prefix = '02_value_list/03_pihs_vl_merged/'
ps_file_gcs_path = get_latest_file_path(bucket, ps_prefix)
pihs_vl_file_gcs_path = get_latest_file_path(bucket, pihs_vl_prefix)

# Fail fast when an input is missing: everything below needs both frames, so
# continuing would only surface later as a NameError on `ps` / `pihs_vl`.
if not ps_file_gcs_path:
    raise FileNotFoundError(f"No ps file found under gs://{bucket.name}/{ps_prefix}")
if not pihs_vl_file_gcs_path:
    raise FileNotFoundError(f"No pihs_vl file found under gs://{bucket.name}/{pihs_vl_prefix}")

# Read the latest files into DataFrames directly from GCS paths
ps = pd.read_csv(ps_file_gcs_path)
# Rules flagged as removed in the change tracker are not applied
ps = ps[ps['change_tracker'] != 'removed']

pihs_vl = pd.read_csv(pihs_vl_file_gcs_path)


# Columns that construct_query() compares for equality when matching a ps
# rule against pihs_vl rows
exact_match_columns = ['electrification', 'propulsion_system_subdesign_architecture']

# pihs_vl columns that are forced to float before any power calculation
float_columns_pihs_vl = ['system_voltage_v', 'ap_system_power_kw', 'electric_motor_power_kw']

# errors='coerce' turns anything that cannot be parsed as a number into NaN
for col in float_columns_pihs_vl:
    coerced = pd.to_numeric(pihs_vl[col], errors='coerce')
    pihs_vl[col] = coerced.astype('float')


# Function to construct query based on exact matching columns
def construct_query(row, exact_match_columns):
    """Build a ``DataFrame.query`` expression from a row's non-null values.

    Each non-null value in ``exact_match_columns`` contributes one
    ``<column> == '<value>'`` clause (the value is always compared as a
    string); clauses are joined with ``&``.

    Args:
        row: Mapping-like object (e.g. a pandas Series) indexed by column name.
        exact_match_columns: Column names to include in the expression.

    Returns:
        The query string, or ``None`` when every listed value is null.
    """
    # repr() of the stringified value yields a correctly quoted and escaped
    # literal, so values containing quotes or backslashes no longer break the
    # expression. For ordinary values the result is the same '<value>' text
    # as plain single-quoting.
    query_parts = [
        f"{col} == {str(row[col])!r}"
        for col in exact_match_columns
        if pd.notnull(row[col])
    ]
    return ' & '.join(query_parts) if query_parts else None

# Initialize a DataFrame to store the constructed queries
# (one entry per ps rule for which a query could be built; uploaded at the end)
query_log = pd.DataFrame(columns=['rule_number', 'query'])

# Initialize merged DataFrame with all rows from pihs_vl and additional status column.
# Every row starts as 'unmatched'; the matching loop overwrites the status with
# the value of the ps 'row' column for the rule that claimed it.
merged_columns = pihs_vl.columns.tolist() + ['pihs_vl_ps_status']
merged_df = pihs_vl.reindex(columns=merged_columns)
merged_df['pihs_vl_ps_status'] = 'unmatched'

# Add a marker column in pihs_vl to track if a row has been matched.
# It is added after merged_df was built from pihs_vl's column list, so the
# marker is not part of merged_columns.
pihs_vl['matched'] = False

# Function to calculate power based on given coefficients and values
def calculate_power(row, a, b):
    """Return ``a * electric_motor_power_kw + b * ap_system_power_kw`` for a row.

    Args:
        row: Mapping-like object providing ``'electric_motor_power_kw'`` and
            ``'ap_system_power_kw'``.
        a: Coefficient applied to the electric motor power.
        b: Coefficient applied to the system power.
    """
    return a * row['electric_motor_power_kw'] + b * row['ap_system_power_kw']

# Components for which a power value is computed. For each one the ps rule
# provides two coefficients in the columns '<component>_a' and '<component>_b'.
power_components = [
    'none_emotor_1', 'none_emotor_2', 'none_emotor_3', 'none_emotor_4', 'none_emotor_5',
    'none_inverter_1', 'none_inverter_2', 'none_inverter_3', 'none_inverter_4', 'none_inverter_5',
    'diff_reducer_1', 'diff_reducer_2', 'diff_reducer_3', 'diff_reducer_4',
    'nodiff_reducer_1', 'nodiff_reducer_2', 'nodiff_reducer_3', 'nodiff_reducer_4',
    'none_eaxle_1', 'none_eaxle_2', 'none_eaxle_3', 'none_eaxle_4',
    'none_obc_1', 'none_dcdc_1', 'none_starter_1', 'none_restarter_1', 'none_alternator_1',
]

# Log entries are collected in a plain list and turned into a DataFrame once
# after the loop: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
query_log_rows = []

# Iterate over each row in ps and apply logic to pihs_vl.
# Rules are applied in ps order and the first rule that matches a pihs_vl row
# wins; rows already marked as matched are skipped by later rules.
for index, row in ps.iterrows():
    query = construct_query(row, exact_match_columns)
    if not query:
        continue
    query_log_rows.append({'rule_number': row['row'], 'query': query})

    # Find rows in pihs_vl that match the query
    for idx in pihs_vl.query(query).index:
        if pihs_vl.at[idx, 'matched']:
            continue

        # The pihs_vl row does not change while its components are computed,
        # so it is looked up once per row.
        vl_row = pihs_vl.loc[idx]
        for component in power_components:
            merged_df.at[idx, f'power_{component}'] = calculate_power(
                vl_row, row[f'{component}_a'], row[f'{component}_b']
            )

        # Record which rule claimed the row and mark it as matched in pihs_vl
        merged_df.loc[idx, 'pihs_vl_ps_status'] = row['row']
        pihs_vl.at[idx, 'matched'] = True

# Fold the collected entries into query_log, keeping anything already in it
if query_log_rows:
    new_queries = pd.DataFrame(query_log_rows, columns=['rule_number', 'query'])
    if query_log.empty:
        query_log = new_queries
    else:
        query_log = pd.concat([query_log, new_queries], ignore_index=True)



# Reduced view: the status column followed by every computed power_* column.
# NOTE(review): final_merged_df is built here but is not what gets uploaded
# below - save_to_bucket() receives merged_df. Confirm which one is intended.
power_columns = [col for col in merged_df.columns if col.startswith('power_')]
final_columns = ['pihs_vl_ps_status'] + power_columns
final_merged_df = merged_df[final_columns]

# Rows that no ps rule claimed
unmatched_mask = merged_df['pihs_vl_ps_status'] == 'unmatched'
unmerged_df = merged_df[unmatched_mask]

# Timestamp (Europe/Paris wall-clock time) shared by all output file names
paris_tz = pytz.timezone('Europe/Paris')
current_time = datetime.now(paris_tz).strftime("%Y%m%d_%H%M%S")

# Destination folders inside the bucket
query_log_folder = '03_power_schedule/05_pihs_vl_ps_query_log'
final_merged_df_folder = '03_power_schedule/03_pihs_vl_ps_merged'
unmerged_df_folder = '03_power_schedule/04_pihs_vl_ps_unmerged'

# Output file names
query_log_file_name = f'pihs_vl_ps_query_log_{current_time}.csv'
final_merged_df_file_name = f'pihs_vl_ps_merged_{current_time}.csv'
unmerged_df_file_name = f'pihs_vl_ps_unmerged_{current_time}.csv'

# Upload, in order: the query log, the full merged frame, the unmatched rows
save_to_bucket(query_log, bucket, query_log_folder, query_log_file_name)
save_to_bucket(merged_df, bucket, final_merged_df_folder, final_merged_df_file_name)
save_to_bucket(unmerged_df, bucket, unmerged_df_folder, unmerged_df_file_name)

# Short preview of the results as a visual confirmation of the run
print("\nQuery log:")
print(query_log.head())
print("\nSample of merged DataFrame:")
print(merged_df.head())

# Identify the column with mixed types.
# NOTE(review): position 128 is hard-coded - presumably taken from a pandas
# mixed-dtype warning raised while reading pihs_vl; confirm it is still the
# column of interest.
mixed_type_column_position = 128

# Guard the positional lookup so that a narrower pihs_vl file does not end the
# run with an IndexError after all outputs have already been uploaded.
if len(pihs_vl.columns) > mixed_type_column_position:
    mixed_type_column = pihs_vl.columns[mixed_type_column_position]
    print(f"Column with mixed types: {mixed_type_column}")

    # Inspect the values in this column
    print(pihs_vl[mixed_type_column].value_counts())
else:
    print(
        f"pihs_vl has only {len(pihs_vl.columns)} columns; "
        f"no column at position {mixed_type_column_position} to inspect"
    )