# In [None]:
from google.colab import auth, files
import pandas as pd
from google.cloud import storage
import os
from datetime import datetime
import pytz
import numpy as np
from google.cloud import bigquery

# Authenticate the Colab user for Google Cloud access; this runs before the
# storage client below is created.
auth.authenticate_user()

# Name of the GCS bucket used by the rest of this script
bucket_name = 'prd-marketshare'

# Set up the GCS client and fetch a handle on the bucket
client = storage.Client()
bucket = client.get_bucket(bucket_name)

def get_latest_file_path(bucket, folder_path):
    """Return the ``gs://`` URI of the most recently created file under a prefix.

    Args:
        bucket: Bucket object exposing ``list_blobs(prefix=...)`` and ``name``.
        folder_path: Object-name prefix to search under.

    Returns:
        ``gs://<bucket name>/<object name>`` for the object with the greatest
        ``time_created``, or ``None`` when no file is stored under the prefix.
    """
    # Listing by prefix also returns "folder" placeholder objects (names that
    # end with "/"). They are not data files, so they must never be picked as
    # the latest file.
    candidates = [
        blob for blob in bucket.list_blobs(prefix=folder_path)
        if not blob.name.endswith('/')
    ]
    if not candidates:
        return None
    newest = max(candidates, key=lambda blob: blob.time_created)
    return f"gs://{bucket.name}/{newest.name}"

def save_to_bucket(df, bucket, destination_folder, file_name):
    """Write ``df`` as a CSV (index omitted) and upload it into a bucket folder.

    The frame is first written to ``/tmp/<file_name>``; that file is then
    uploaded to the bucket as ``<destination_folder>/<file_name>``.
    """
    staging_path = f'/tmp/{file_name}'

    # Stage the CSV on local disk
    df.to_csv(staging_path, index=False)

    # Push the staged file to its destination object in GCS
    target_blob = bucket.blob(f'{destination_folder}/{file_name}')
    target_blob.upload_from_filename(staging_path)

    # The staged copy is left on disk; pass staging_path to files.download()
    # here if a browser download is also wanted.

# Locate the most recent merged source file in the dictionary folder
source_folder = '04_dictionary/08_pihs_vl_ps_sd_obc_merged'
source_gcs_path = get_latest_file_path(bucket, source_folder)

# Stop here when there is nothing to read: everything below needs `df`, and
# continuing would either fail later with an unrelated NameError or silently
# reuse a stale `df` left over from an earlier notebook run.
if not source_gcs_path:
    raise FileNotFoundError(
        f"No source file found under gs://{bucket_name}/{source_folder}"
    )

# Read the latest file into a DataFrame directly from its GCS path
df = pd.read_csv(source_gcs_path)


# Component columns to parse through. Each name is split on "_" by
# decompose_name into <voltage type>_<subtype>_<product type>_<level>, where
# a subtype of "none" means "no subtype". The order of this list is the
# order in which output rows are generated for each source row.
parse_cols = ['hv_bsg_emotor_1', 'hv_bsg_emotor_2', 'hv_bsg_emotor_3', 'hv_bsg_emotor_4', 'hv_bsg_emotor_5',
'hv_cmg_emotor_1', 'hv_cmg_emotor_2', 'hv_cmg_emotor_3', 'hv_cmg_emotor_4', 'hv_cmg_emotor_5',
'hv_gmg_emotor_1', 'hv_gmg_emotor_2', 'hv_gmg_emotor_3', 'hv_gmg_emotor_4', 'hv_gmg_emotor_5',
'hv_edrive_emotor_1', 'hv_edrive_emotor_2', 'hv_edrive_emotor_3', 'hv_edrive_emotor_4', 'hv_edrive_emotor_5',
'hv_hub_emotor_1', 'hv_hub_emotor_2', 'hv_hub_emotor_3', 'hv_hub_emotor_4', 'hv_hub_emotor_5',
'mv_bsg_emotor_1', 'mv_bsg_emotor_2', 'mv_cmg_emotor_1', 'mv_cmg_emotor_2', 'mv_gmg_emotor_1', 'mv_gmg_emotor_2',
'mv_edrive_emotor_1', 'mv_edrive_emotor_2', 'hv_diff_reducer_1', 'hv_diff_reducer_2', 'hv_diff_reducer_3',
'hv_nodiff_reducer_1', 'hv_nodiff_reducer_2', 'hv_nodiff_reducer_3', 'hv_nodiff_reducer_4',
'lv_none_starter_1', 'lv_none_restarter_1', 'lv_none_alternator_1', 'lv_bsg_emotor_1', 'mv_none_eaxle_1',
'mv_none_eaxle_2', 'mv_none_inverter_1', 'mv_none_inverter_2', 'mv_none_dcdc_1', 'mv_none_obc_1',
'hv_none_eaxle_1', 'hv_none_eaxle_2', 'hv_none_eaxle_3', 'hv_none_inverter_1', 'hv_none_inverter_2',
'hv_none_inverter_3', 'hv_none_inverter_4', 'hv_none_obc_1', 'hv_none_dcdc_1'
]

# Columns for power and supplier. match_column pairs one of these with a
# parsed component column by comparing subtype (a "none" subtype here matches
# any component subtype), product type and level; the first match in list
# order wins.
power_cols = ['power_none_emotor_1', 'power_none_emotor_2', 'power_none_emotor_3', 'power_none_emotor_4', 'power_none_emotor_5',
'power_none_inverter_1', 'power_none_inverter_2', 'power_none_inverter_3', 'power_none_inverter_4', 'power_none_inverter_5',
'power_diff_reducer_1', 'power_diff_reducer_2', 'power_diff_reducer_3', 'power_diff_reducer_4',
'power_nodiff_reducer_1', 'power_nodiff_reducer_2', 'power_nodiff_reducer_3', 'power_nodiff_reducer_4',
'power_none_eaxle_1', 'power_none_eaxle_2', 'power_none_eaxle_3', 'power_none_eaxle_4', 'power_none_obc_1',	'power_none_dcdc_1',	'power_none_starter_1',	'power_none_restarter_1',	'power_none_alternator_1'
]

# Supplier columns; matched to component columns the same way as power_cols.
supplier_cols = ['supplier_none_emotor_1', 'supplier_none_emotor_2', 'supplier_none_emotor_3', 'supplier_none_emotor_4',
'supplier_none_inverter_1', 'supplier_none_inverter_2', 'supplier_none_inverter_3', 'supplier_none_inverter_4',
'supplier_none_eaxle_1', 'supplier_none_eaxle_2', 'supplier_none_obc_1',	'supplier_none_dcdc_1'
]

def decompose_name(col_name):
    """Split a component column name into its descriptive fields.

    The name is cut on underscores: piece 0 is the voltage type, piece 1 the
    subtype (the literal ``'none'`` is reported as ``None``), piece 2 the
    product type and the last piece the level.

    Returns:
        A dict with the keys ``product_voltage_type``, ``product_subtype``,
        ``product_type`` and ``product_level``.
    """
    pieces = col_name.split('_')
    voltage = pieces[0]
    subtype = pieces[1]
    if subtype == 'none':
        subtype = None
    product_type, level = pieces[2], pieces[-1]
    return dict(
        product_voltage_type=voltage,
        product_subtype=subtype,
        product_type=product_type,
        product_level=level,
    )

def match_column(parsed_name, target_cols):
    """Return the first target column compatible with a parsed column name.

    A target column (split on underscores) is compatible when its piece 1
    equals the parsed subtype or is the literal ``'none'``, its piece 2 equals
    the parsed product type, and its last piece equals the parsed level.

    Returns:
        The first compatible name in ``target_cols`` order, or ``None`` when
        no target column is compatible.
    """
    def _is_compatible(candidate):
        pieces = candidate.split('_')
        subtype_ok = (
            parsed_name['product_subtype'] == pieces[1] or pieces[1] == 'none'
        )
        if not subtype_ok:
            return False
        if not parsed_name['product_type'] == pieces[2]:
            return False
        return parsed_name['product_level'] == pieces[-1]

    return next(
        (candidate for candidate in target_cols if _is_compatible(candidate)),
        None,
    )


# Resolve, once per component column, its parsed name parts and the power /
# supplier columns it maps to. These depend only on the column names, so they
# are computed here instead of being recomputed for every row.
column_plan = []
for col in parse_cols:
    parsed_name = decompose_name(col)
    column_plan.append((
        col,
        parsed_name,
        match_column(parsed_name, power_cols),
        match_column(parsed_name, supplier_cols),
    ))

new_rows = []

# Emit one output row per (source row, component column) pair whose value is
# present and non-zero.
for _, row in df.iterrows():
    original_values = None  # copy of the source row, built on first use
    for col, parsed_name, power_col, supplier_col in column_plan:
        value = row[col]
        if value != 0 and pd.notna(value):
            if original_values is None:
                original_values = {
                    original_col: row[original_col] for original_col in df.columns
                }
            new_rows.append({
                'product_vl': col,
                'product_voltage_type': parsed_name['product_voltage_type'],
                'product_subtype': parsed_name['product_subtype'],
                'product_type': parsed_name['product_type'],
                'product_level': parsed_name['product_level'],
                'product_power': row[power_col] if power_col else None,
                'product_supplier': row[supplier_col] if supplier_col else None,
                # Original row data follows the product_* fields
                **original_values,
            })

# Without any row the frame below would have no columns and the column drop
# would fail with an opaque KeyError; report the real cause instead.
if not new_rows:
    raise ValueError(
        "No non-zero component values found in the source data; nothing to transform"
    )

new_df = pd.DataFrame(new_rows)

# Snapshot with every column still present, taken before the drop below
transformed_all_columns = new_df.copy()

# Removing unnecessary columns
cols_to_remove = ['product_vl'] + parse_cols + power_cols + supplier_cols
new_df.drop(columns=cols_to_remove, inplace=True)
print(f"Number of rows after transformation: {len(new_df)}")  # Row count after the expansion


# Yearly volume columns (volumes_2020 ... volumes_2035) to unpivot into rows
volume_columns = [f'volumes_{volume_year}' for volume_year in range(2020, 2036)]

# Melt the DataFrame: every non-volume column is kept as an identifier and
# each volume column becomes its own row with a 'year' / 'volume' pair
id_columns = [col for col in new_df.columns if col not in volume_columns]
melted_df = new_df.melt(id_vars=id_columns,
                        value_vars=volume_columns,
                        var_name='year',
                        value_name='volume')
print(f"Number of rows after melting: {len(melted_df)}")  # Print row count after melting

# Remove rows with NaN or zero in 'volume'. The explicit copy makes the
# filtered frame independent, so the 'year' assignment below does not write
# into a slice of the unfiltered frame (SettingWithCopyWarning).
has_volume = melted_df['volume'].notna() & (melted_df['volume'] != 0)
melted_df = melted_df[has_volume].copy()
print(f"Number of rows after removing NaNs and zeros in 'volume': {len(melted_df)}")  # Print row count after filtering

# Convert 'year' to integer by extracting the year part ('volumes_2020' -> 2020)
melted_df['year'] = melted_df['year'].apply(lambda x: int(x.split('_')[1]))

# Define custom order of columns. Every name appears exactly once: a repeated
# name would make reindex() emit that column twice.
custom_order = [
    'pihs_vl_status', 'pihs_vl_ps_status', 'pihs_vl_ps_sd_status', 'pihs_vl_ps_sd_obc_status',
    'powertrain_id',
    'product_voltage_type', 'product_subtype', 'product_type', 'product_level',
    'product_power', 'product_supplier',
    'year', 'volume',
    'lv_hv', 'sales_group', 'vehicle_design_parent', 'solution_owner_design_parent',
    'vehicle_platform', 'global_nameplate', 'vehicle_program', 'electrification',
    'propulsion_system_subdesign', 'propulsion_system_subdesign_architecture',
    'ap_px_definition', 'system_voltage_v', 'ap_system_power_kw', 'ap_system_torque_nm',
    'electric_motor_power_kw', 'electric_motor_torque_nm',
    'vehicle_sub_region', 'drive_type', 'engine_eop', 'engine_fuel_type', 'layout',
    'engine_manufacturer', 'engine_platform',
    'transmission_design', 'transmission_sub_design', 'transmission_forward_speed',
    'transmission_manufacturer', 'transmission_program',
    't_torque_nm', 't_design_2', 't_sub_design_2', 't_forward_speeds_2', 't_program_2',
    'country', 'vehicle_eop_end_of_production',
    'global_sales_segment', 'global_sales_sub_segment', 'gvw_class',
    'vehicle_manufacturer', 'mnemonic_vehicle_id', 'production_brand',
    'vehicle_production_plant', 'production_type', 'vehicle_ihs_region', 'sales_parent',
    'vehicle_sop_start_of_production', 'global_make',
    'battery_type', 'ap_battery_capacity_kwh', 'vehicle_id', 'model_code',
    'creation_date_calendar_year',
    'primary_eaxle_motor_technology', 'secondary_eaxle_motor_technology',
    'primary_eaxle_layout', 'secondary_eaxle_layout',
    'switching_techno_inverter_1', 'switching_techno_inverter_2',
    'switching_techno_inverter_3', 'switching_techno_inverter_4',
    'x_in_1_edrive_nb_of_functions', 'x_in_1_edrive_functions_description',
    'x_in_1_edrive_emot', 'x_in_1_edrive_inv', 'x_in_1_edrive_red', 'x_in_1_edrive_obc',
    'x_in_1_edrive_dcdc', 'x_in_1_edrive_pdu', 'x_in_1_edrive_vcu', 'x_in_1_edrive_bms',
    'x_in_1_edrive_edc', 'x_in_1_edrive_ptc', 'x_in_1_edrive_dcac',
    'active_parts_emotor_1', 'active_parts_emotor_2',
    'active_parts_emotor_3', 'active_parts_emotor_4',
    'active_parts_inverter_1', 'active_parts_inverter_2',
]

# Reorder columns based on custom order.
# Note that .reindex() outputs exactly the listed columns: columns of
# melted_df that are not listed are dropped, and listed columns that are
# missing from melted_df are added and filled with NaN.
melted_df = melted_df.reindex(columns=custom_order)
print(f"Number of rows after reordering columns: {len(melted_df)}")  # Print row count after reordering

# Persist the reshaped frame locally, then read it back to check that the
# row count survives the round trip
melted_df.to_csv('reshaped_data.csv', index=False)
reloaded_df = pd.read_csv('reshaped_data.csv')
print("Number of rows after reloading:", len(reloaded_df))

# Timestamp in the Europe/Paris timezone, shared by all exported file names
paris_tz = pytz.timezone('Europe/Paris')
current_time = datetime.now(tz=paris_tz).strftime("%Y%m%d_%H%M%S")

# Destination folders in the bucket, one per export
transformed_all_columns_folder = '05_transformation/01_transformed_all_columns'
transformed_dropped_columns_folder = '05_transformation/02_transformed_dropped_columns'
transformed_melted_folder = '05_transformation/03_transformed_melted'

# Timestamped file names, one per export
transformed_all_columns_file_name = f'transformed_all_columns_{current_time}.csv'
transformed_dropped_columns_file_name = f'transformed_dropped_columns_{current_time}.csv'
transformed_melted_file_name = f'transformed_melted_{current_time}.csv'

# Upload the three frames to GCS: the full snapshot, the frame with the
# helper columns dropped, and the melted frame
save_to_bucket(
    df=transformed_all_columns,
    bucket=bucket,
    destination_folder=transformed_all_columns_folder,
    file_name=transformed_all_columns_file_name,
)
save_to_bucket(
    df=new_df,
    bucket=bucket,
    destination_folder=transformed_dropped_columns_folder,
    file_name=transformed_dropped_columns_file_name,
)
save_to_bucket(
    df=melted_df,
    bucket=bucket,
    destination_folder=transformed_melted_folder,
    file_name=transformed_melted_file_name,
)

def load_csv_to_bigquery(bucket, transformed_melted_folder, transformed_melted_file_name, project_id, dataset_name):
    """Load one CSV object from GCS into a BigQuery table named after the file.

    Args:
        bucket: Bucket name, interpolated into the ``gs://`` URI.
        transformed_melted_folder: Folder path of the CSV inside the bucket.
        transformed_melted_file_name: CSV file name; the text before its last
            ``.`` becomes the table name.
        project_id: BigQuery project that holds the destination dataset.
        dataset_name: Destination dataset.

    The table ``<project_id>.<dataset_name>.<table name>`` is loaded with
    schema auto-detection, one skipped header row and append semantics. The
    number of loaded rows is printed once the job has finished.
    """
    bq_client = bigquery.Client()

    # Where the data comes from and where it goes
    source_uri = f"gs://{bucket}/{transformed_melted_folder}/{transformed_melted_file_name}"
    table_name = transformed_melted_file_name.rsplit('.', 1)[0]
    destination_table = f"{project_id}.{dataset_name}.{table_name}"

    # Load settings: CSV with a single header row, inferred schema, and rows
    # appended when the table already exists
    load_settings = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        write_disposition='WRITE_APPEND',
    )

    # Start the load job and wait for it to complete
    job = bq_client.load_table_from_uri(source_uri, destination_table, job_config=load_settings)
    job.result()

    print(f"Loaded {job.output_rows} rows into {destination_table}.")

# Only the melted export is loaded into BigQuery; the other two exports stay
# in GCS
load_csv_to_bigquery(
    bucket=bucket_name,
    transformed_melted_folder=transformed_melted_folder,
    transformed_melted_file_name=transformed_melted_file_name,
    project_id='bq-rf5039',
    dataset_name='MELTED',
)
