# Matched

## Dependencies

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import sys
from pathlib import Path

# Make the repository root importable so the project-local `config` package
# resolves. The root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to run from a first-level
# sub-folder such as 'notebooks/'.
repo_root = Path().resolve().parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from config.config import get_environment

from config.config import data_import_json, data_import_pandas, data_export_pandas

## ENV

In [None]:
# Read the environment settings (env.json under ../environments).
ENV = get_environment(env_path="../environments", env_name="env.json")

# Run identifiers passed to every import/export call below.
# For an ad-hoc run the date could be overridden instead, e.g. with
# datetime.now().date() + timedelta(days=0).
content_date, version = ENV['CONTENT_DATE'], ENV['VERSION']

# Key of the combined dataset; presumably it holds rows of every website.
website = 'all'

# Website names: the source and the two targets it is compared against.
source = ENV['SOURCE']['NAME']
target1, target2 = (ENV['TARGET'][slot]['NAME'] for slot in ("1", "2"))

## Input

In [None]:
# Load the normalized dataset for the combined `website` key.
df_input = data_import_pandas(
    folder_name=f'normalize/{website}',
    additional_info='normalize',
    website=website,
    content_date=content_date,
    version=version,
)


def _rows_for(site):
    """Return an independent, re-indexed copy of the `df_input` rows whose 'website' equals `site`."""
    site_mask = df_input['website'] == site
    return df_input.loc[site_mask].copy().reset_index(drop=True)


# One frame per website: the source and its two targets.
df_source, df_target1, df_target2 = (
    _rows_for(site) for site in (source, target1, target2)
)

In [None]:
def _load_matched_pairs(target):
    """Load the matched pairs of `source` vs `target` and split the pair id.

    The '<source>_<target>_id' column is split on '---'; the first part is
    stored as '<source>_item_id' and the second as '<target>_item_id'.
    """
    pair = f'{source}_{target}'
    df_pairs = data_import_pandas(
        website=pair,
        content_date=content_date,
        version=version,
        folder_name=f'matching/{website}',
        additional_info='matching_matched',
    )

    # Split once and reuse the parts for both id columns.
    id_parts = df_pairs[f'{pair}_id'].str.split('---')
    df_pairs[f'{source}_item_id'] = id_parts.str[0]
    df_pairs[f'{target}_item_id'] = id_parts.str[1]
    return df_pairs


df_match_matched1 = _load_matched_pairs(target1)
df_match_matched2 = _load_matched_pairs(target2)

## Overview Matched Items

In [None]:
def _distinct_count(ids):
    """Number of distinct values in the Series `ids`."""
    return len(ids.unique())


def _percentage(part, whole):
    """Return `part` as a percentage of `whole`, rounded to 2 decimals.

    Returns 0.0 when `whole` is 0 so that a website without any input rows
    does not abort the notebook with a ZeroDivisionError.
    """
    return round(part * 100 / whole, 2) if whole else 0.0


def _pair_overview(target, df_target, df_pairs):
    """Build one overview row for the pair `source` - `target`.

    Parameters
    ----------
    target : str
        Name of the target website.
    df_target : pd.DataFrame
        Input rows of the target website.
    df_pairs : pd.DataFrame
        Matched pairs carrying '<source>_item_id' and '<target>_item_id'.

    Returns
    -------
    dict
        Input counts, distinct matched counts and matched percentages for
        both sides of the pair.
    """
    n_source_matched = _distinct_count(df_pairs[f'{source}_item_id'])
    n_target_matched = _distinct_count(df_pairs[f'{target}_item_id'])
    return {
        'pair': f'{source} - {target}',
        '#_source_input': len(df_source),
        '#_source_matched': n_source_matched,
        '%_source_matched': _percentage(n_source_matched, len(df_source)),
        '#_target_input': len(df_target),
        '#_target_matched': n_target_matched,
        '%_target_matched': _percentage(n_target_matched, len(df_target)),
    }


# One overview row per source/target pair.
data_dict1 = _pair_overview(target1, df_target1, df_match_matched1)
data_dict2 = _pair_overview(target2, df_target2, df_match_matched2)

count_list = [data_dict1, data_dict2]
df_count = pd.DataFrame(count_list)

# Export the overview table as an xlsx file under matched/<website>.
data_export_pandas(
    df_output=df_count,
    website=website,
    content_date=content_date,
    version=version,
    folder_name=f'matched/{website}',
    additional_info='matched_count',
    file_extension='xlsx'
)

## Merge Across Websites

In [None]:
def _add_site_prefix(frame, site):
    """Return `frame` with every column renamed to '<site>_<column>'.

    The frames split from the input still carry the raw 'website' column
    (they were filtered on it). Once prefixed, that column is called
    '<site>_website', so a frame without a plain 'website' column has already
    been processed and is returned untouched. This keeps the cell safe to
    re-run: the previous in-place rename prefixed the columns a second time
    on every re-execution.
    """
    if 'website' not in frame.columns:
        return frame
    return frame.add_prefix(f'{site}_')


# Prefix every column with its website name so the three frames can sit side
# by side in one table without column-name clashes.
df_source = _add_site_prefix(df_source, source)
df_target1 = _add_site_prefix(df_target1, target1)
df_target2 = _add_site_prefix(df_target2, target2)

In [None]:
def _target_id_lookup(df_pairs, target):
    """Build a {source item id: target item id} dict from the matched pairs.

    When a source id appears in several pairs, the last pair wins (dict
    semantics of `Series.to_dict`).
    """
    target_ids = df_pairs.set_index(f'{source}_item_id')[f'{target}_item_id']
    return target_ids.to_dict()


# Start from the source rows and attach the matched id of each target;
# source items without a match get NaN.
df_matched = df_source.copy()
_source_ids = df_matched[f'{source}_item_id']
df_matched[f'{target1}_item_id'] = _source_ids.map(_target_id_lookup(df_match_matched1, target1))
df_matched[f'{target2}_item_id'] = _source_ids.map(_target_id_lookup(df_match_matched2, target2))

In [None]:
# Attach the first target's attributes to each source row via the matched id.
_target1_fields = [
    'item_id', 'brand', 'item_name', 'item_url', 'in_stock',
    'review_total', 'review_rating', 'price', 'price_after_disc', 'price_disc',
]
_target1_cols = [f'{target1}_{field}' for field in _target1_fields]

# Left join: every source row is kept, matched or not.
df_matched = df_matched.merge(
    df_target1[_target1_cols],
    how='left',
    on=f'{target1}_item_id',
)

In [None]:
# Attach the second target's attributes to each source row via the matched id.
_target2_fields = [
    'item_id', 'brand', 'item_name', 'item_url', 'in_stock',
    'review_total', 'review_rating', 'price', 'price_after_disc', 'price_disc',
]
_target2_cols = [f'{target2}_{field}' for field in _target2_fields]

# Left join: every source row is kept, matched or not.
df_matched = df_matched.merge(
    df_target2[_target2_cols],
    how='left',
    on=f'{target2}_item_id',
)

In [None]:
# Count per source row how many targets it was matched to:
# 2 = both targets, 1 = exactly one target, 0 = none.
_target_id_present = df_matched[[f'{target1}_item_id', f'{target2}_item_id']].notna()
_matched_both = _target_id_present.all(axis=1)
_matched_any = _target_id_present.any(axis=1)

# np.select takes the first condition that holds, so "both" must precede "any".
df_matched['match'] = np.select(
    condlist=[_matched_both, _matched_any],
    choicelist=[2, 1],
    default=0,
)

In [None]:
def _per_site(fields):
    """Expand each field into its source/target1/target2 column names, field by field."""
    return [
        f'{prefix}_{field}'
        for field in fields
        for prefix in [source, target1, target2]
    ]


# Column layout of the final table: source-only ("static") columns alternate
# with per-website ("dynamic") columns.
static_cols1 = [f'{source}_scrape_date', 'match']
dynamic_cols2 = _per_site(['item_id'])
static_cols3 = [f'{source}_category', f'{source}_is_package']
dynamic_cols4 = _per_site(['brand', 'item_name'])
static_cols5 = [f'{source}_currency']
dynamic_cols6 = _per_site(
    ['price', 'price_after_disc', 'price_disc', 'in_stock', 'review_total', 'review_rating', 'item_url']
)

_ordered_cols = [
    *static_cols1,
    *dynamic_cols2,
    *static_cols3,
    *dynamic_cols4,
    *static_cols5,
    *dynamic_cols6,
]
df_matched = df_matched[_ordered_cols].copy()

# The source-only columns lose their website prefix in the final table.
df_matched = df_matched.rename(columns={
    f'{source}_scrape_date': 'scrape_date',
    f'{source}_category': 'category',
    f'{source}_is_package': 'is_package',
    f'{source}_currency': 'currency',
})

In [None]:
# Export the merged source/target comparison table under matched/<website>
# with the 'matched_all' label.
# NOTE(review): incl_excel=True presumably writes an additional Excel copy —
# confirm against data_export_pandas in config.config.
data_export_pandas(
    df_output=df_matched,
    website=website,
    content_date=content_date,
    version=version,
    folder_name=f'matched/{website}',
    additional_info='matched_all',
    incl_excel=True
)