### Step 1: Load the packages

In [92]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.cloud import bigquery_storage
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings(action="ignore") # Suppress ALL Python warnings for the rest of the session (no category/module filter is given, so this is not limited to pandas)

### Step 2: Download the datasets that contain the ASA-level and vendor-level CVRs

In [93]:
# Queries that pull the full vendor-level and ASA-level CVR tables
vendor_query = """SELECT * FROM `dh-logistics-product-ops.pricing.cvr_per_df_bucket_vendor_level_loved_brands_scaled_code`"""
asa_query = """SELECT * FROM `dh-logistics-product-ops.pricing.cvr_per_df_bucket_asa_level_loved_brands_scaled_code`"""

# `client` runs the queries (and the uploads further down);
# `bqstorage_client` is handed to to_dataframe() for the row download
client = bigquery.Client(project="logistics-data-staging-flat")
bqstorage_client = bigquery_storage.BigQueryReadClient()

# Download the datasets: run each query, wait for it to finish, then load the rows into pandas
vendor_rows = client.query(query=vendor_query).result()
df_vendor = vendor_rows.to_dataframe(bqstorage_client=bqstorage_client)

asa_rows = client.query(query=asa_query).result()
df_asa = asa_rows.to_dataframe(bqstorage_client=bqstorage_client)

### Step 3: Define a function that fits a straight line through the CVR points

In [112]:
def model(df: pd.DataFrame, cvr_col: str) -> float:
    """Fit a least-squares line of ``cvr_col`` against ``df_total`` and return its slope.

    Args:
        df: Frame containing a ``df_total`` column (the single regressor) and
            the column named by ``cvr_col`` (the response).
        cvr_col: Name of the response column to regress on ``df_total``.

    Returns:
        The fitted coefficient on ``df_total`` as a plain Python float.
    """
    # Double brackets keep both inputs two-dimensional (n_rows x 1)
    regressor = df[["df_total"]].to_numpy()
    response = df[[cvr_col]].to_numpy()

    fitted = LinearRegression().fit(X=regressor, y=response)

    # One feature and one target, so coef_ holds exactly one number
    return fitted.coef_.item()

# Vendor level: keep rows whose vendor has more than one tier, then fit one line per vendor
has_multiple_vendor_tiers = df_vendor["num_tiers_vendor"] > 1
x = (
    df_vendor[has_multiple_vendor_tiers]
    .groupby(["entity_id", "country_code", "master_asa_id", "vendor_code"])
    .apply(model, cvr_col="cvr3")
    .to_frame(name="vendor_cvr3_slope")
)

# ASA level: keep rows whose master ASA has more than one tier, then fit one line per master ASA
has_multiple_asa_tiers = df_asa["num_tiers_master_asa"] > 1
y = (
    df_asa[has_multiple_asa_tiers]
    .groupby(["entity_id", "country_code", "master_asa_id"])
    .apply(model, cvr_col="asa_cvr3_per_df")
    .to_frame(name="asa_cvr3_slope")
)

In [113]:
# Preview the first ten fitted slopes: vendor level first, then ASA level
display(x.head(10), y.head(10))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,vendor_cvr3_slope
entity_id,country_code,master_asa_id,vendor_code,Unnamed: 4_level_1
AP_PA,pa,4,50897,-0.4623
AP_PA,pa,4,50898,-0.2246
AP_PA,pa,4,50901,-0.0367
AP_PA,pa,4,50908,-0.0602
AP_PA,pa,4,50996,0.1337
AP_PA,pa,4,54338,0.0314
AP_PA,pa,38,104612,0.0207
AP_PA,pa,38,111365,-0.1353
AP_PA,pa,38,111578,-0.1027
AP_PA,pa,38,111598,-0.0522


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,asa_cvr3_slope
entity_id,country_code,master_asa_id,Unnamed: 3_level_1
AP_PA,pa,4,-0.0062
AP_PA,pa,38,-0.0253
AP_PA,pa,40,-0.0314
AP_PA,pa,41,-0.0706
AP_PA,pa,45,0.0085
AP_PA,pa,47,-0.0707
AP_PA,pa,48,-0.1388
AP_PA,pa,49,-0.0188
AP_PA,pa,51,-0.0012
AP_PA,pa,52,0.0465


In [114]:
# Join the fitted slopes back onto the original data frames.
#
# The frames are downloaded from the very tables this notebook overwrites in Step 4,
# so on a re-run they already carry a slope column from the previous run. Merging on
# top of that would produce "<slope>_x" / "<slope>_y" duplicates instead of a single
# refreshed column, so any stale slope column is dropped first. errors="ignore" keeps
# the first run (where the column does not exist yet) unchanged.
vendor_keys = ["entity_id", "country_code", "master_asa_id", "vendor_code"]
asa_keys = ["entity_id", "country_code", "master_asa_id"]

# how="left" keeps every original row; groups filtered out before fitting get a NaN slope
df_vendor_merged = pd.merge(
    left=df_vendor.drop(columns=["vendor_cvr3_slope"], errors="ignore"),
    right=x,
    on=vendor_keys,
    how="left",
)
df_asa_merged = pd.merge(
    left=df_asa.drop(columns=["asa_cvr3_slope"], errors="ignore"),
    right=y,
    on=asa_keys,
    how="left",
)

### Step 4: Upload the dataset to GBQ

In [117]:
job_config = bigquery.LoadJobConfig()

# Set the job_config to overwrite the data in the table
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# NOTE: reset_index(drop=True) discards the row index instead of turning it into a column.
# A plain reset_index() materialises the meaningless 0..n-1 index as an "index" column,
# which is then uploaded. Because the same table is downloaded again on the next run,
# that run would add "level_0" next to "index", and the run after that would fail with
# "ValueError: cannot insert level_0, already exists".

# Upload the df_vendor frame to BQ
job1 = client.load_table_from_dataframe(
    dataframe=df_vendor_merged.reset_index(drop=True),
    destination="dh-logistics-product-ops.pricing.cvr_per_df_bucket_vendor_level_loved_brands_scaled_code",
    job_config=job_config
).result()

# Upload the df_asa frame to BQ
job2 = client.load_table_from_dataframe(
    dataframe=df_asa_merged.reset_index(drop=True),
    destination="dh-logistics-product-ops.pricing.cvr_per_df_bucket_asa_level_loved_brands_scaled_code",
    job_config=job_config
).result()