# OaSIS Knowledge base
https://open.canada.ca/data/dataset/eeb3e442-9f19-4d12-8b38-c488fe4f6e5e

In [10]:
import os
import glob
import pandas as pd
from functools import reduce

folder_path = "/Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/OaSIS_small"
knowledge_path = folder_path = "/Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/KnowlegdeBase"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

def load_and_standardize(filepath):
    """
    Reads a CSV, renames either 'OaSIS_profile_code' or 'OaSIS Code - Final'
    to a standard 'oasis_code', and returns (DataFrame, filename_for_debug).
    """
    df = pd.read_csv(filepath)
    possible_cols = {"OaSIS profile code", "OaSIS Code - Final"}
    found_cols = set(df.columns).intersection(possible_cols)
    
    if not found_cols:
        print(f"[SKIP] No recognized code column in {filepath}")
        return None, os.path.basename(filepath)
    if len(found_cols) > 1:
        print(f"[WARN] Multiple code columns in {filepath}, picking one: {found_cols}")
    
    old_col = list(found_cols)[0]
    df.rename(columns={old_col: "oasis_code"}, inplace=True)
    return df, os.path.basename(filepath)

def safe_merge(left_df, right_df, right_name):
    """
    Tries a many-to-many outer merge on 'oasis_code'.
    If it fails, prints a message with the failing file and returns 'left_df' unchanged.
    
    The 'many_to_many' validation ensures a full Cartesian product for duplicates:
    - If left has 5 duplicates of code=1010
    - And right has 2 duplicates of code=1010
    => final DataFrame has 5 x 2 = 10 rows for code=1010
    """
    try:
        # 'validate' can enforce that we want many-to-many merges to be allowed
        merged = pd.merge(
            left_df,
            right_df,
            on="oasis_code",
            how="outer",
            validate="many_to_many"  # ensures duplicates produce a cross product
        )
        return merged
    except Exception as e:
        print(f"[MERGE ERROR] Could not merge with '{right_name}': {e}")
        return left_df

# 1) Load & unify code columns for each CSV
df_list = []
name_list = []
for f in csv_files:
    df, fname = load_and_standardize(f)
    if df is not None:  # only append if recognized code column
        df_list.append(df)
        name_list.append(fname)

# 2) Sequentially merge everything
if not df_list:
    print("No DataFrames to merge. Possibly no matching columns in any CSV.")
else:
    combined_df = df_list[0]
    combined_name = name_list[0]
    
    for i in range(1, len(df_list)):
        next_df = df_list[i]
        next_name = name_list[i]
        combined_df = safe_merge(combined_df, next_df, next_name)
    
    print("\nFinal shape:", combined_df.shape)
    print("Preview of final merged data:")
    print(combined_df.head(20))  # see a bit more to observe duplicates
    
    # 3) Save to CSV
    out_csv = os.path.join(knowledge_path, "KnowledgeBase.csv")
    combined_df.to_csv(out_csv, index=False)
    print(f"Saved merged dataset to: {out_csv}")


  df = pd.read_csv(filepath)


[SKIP] No recognized code column in /Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/OaSIS_small/OaSIS_combined_many_to_many.csv

Final shape: (144673, 100)
Preview of final merged data:
    oasis_code OaSIS Label - Final_x   Active Learning   Adaptability  \
0         10.0           Legislators                 4              5   
1         10.0           Legislators                 4              5   
2         10.0           Legislators                 4              5   
3         10.0           Legislators                 4              5   
4         10.0           Legislators                 4              5   
5         10.0           Legislators                 4              5   
6         10.0           Legislators                 4              5   
7         10.0           Legislators                 4              5   
8         10.0           Legislators                 4              5   
9         10.0           Legislators

In [8]:
group = combined_df['OaSIS Label - Final'].unique()
print(combined_df['OaSIS Label - Final'].nunique())

for i in group: 
    length = combined_df[combined_df['OaSIS Label - Final']==i].shape[0]
    print(f'{i}: {length}')

899
Legislators: 36
Senior government managers and officials: 78
Senior managers - financial, communications and other business services: 68
Senior managers - health, education, social and community services and membership organizations: 86
Senior managers - trade, broadcasting and other services: 56
Senior managers - construction, transportation, production and utilities: 60
Financial managers: 56
Human resources managers: 54
Purchasing managers: 35
Other administrative services managers: 66
Insurance managers: 5
Real estate service managers: 3
Mortgage broker managers: 1
Securities manager: 5
Banking and other investment managers: 7
Credit managers: 6
Advertising managers: 5
Marketing managers: 5
Public relation managers: 8
E-business managers: 3
Other business services managers: 39
Telecommunication carriers managers: 39
Financial auditors: 4
Accountants: 15
Financial analysts: 4
Investment analysts: 5
Financial advisors: 19
Securities agents and investment dealers: 7
Brokers: 6
Fin

In [14]:
from typing import Dict

def combine_row_text(row: Dict[str, str]) -> str:
    """Combine relevant columns into a single text string for integrated embedding."""
    text_parts = []

    if row.get("Label"):
        text_parts.append(f"Occupation: {row['Label']}")
    if row.get("Lead statement"):
        text_parts.append(f"Description: {row['Lead statement']}")
    if row.get("Main duties"):
        text_parts.append(f"Main duties: {row['Main duties']}")

    for skill in ["Creativity", "Leadership", "Digital Literacy", "Critical Thinking", "Problem Solving"]:
        if row.get(skill):
            text_parts.append(f"{skill}: {row[skill]}")

    return ". ".join(text_parts).strip()

In [21]:
import os
import csv
from dotenv import load_dotenv
from typing import Dict

# Load .env just in case, though not needed here
load_dotenv()

CSV_PATH = "/Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/KnowledgeBase/KnowledgeBase.csv"

def combine_row_text(row: Dict[str, str]) -> str:
    """Combine relevant columns into a single text string."""
    text_parts = []

    if row.get("Label"):
        text_parts.append(f"Occupation: {row['Label']}")
    if row.get("Lead statement"):
        text_parts.append(f"Description: {row['Lead statement']}")
    if row.get("Main duties"):
        text_parts.append(f"Main duties: {row['Main duties']}")

    for skill in ["Creativity", "Leadership", "Digital Literacy", "Critical Thinking", "Problem Solving"]:
        if row.get(skill):
            text_parts.append(f"{skill}: {row[skill]}")

    return ". ".join(text_parts).strip()

# Read the CSV and preview the first few combined texts
try:
    with open(CSV_PATH, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)

        for i, row in enumerate(reader):
            if not row or not row.get("oasis_code"):
                continue  # skip invalid
            text = combine_row_text(row)
            print(f"--- Row {i} ---")
            print(text)
            print()

            if i >= 4:  # just preview the first 5 rows
                break

except Exception as e:
    print(f"Error reading CSV: {e}")


--- Row 0 ---
Occupation: Legislators. Description: Legislators participate in the activities of a federal, provincial, territorial or local government legislative body or executive council, band council or school board as elected or appointed members.. Main duties: Enact, amend or repeal laws and regulations. Leadership: 5. Critical Thinking: 5. Problem Solving: 5

--- Row 1 ---
Occupation: Legislators. Description: Legislators participate in the activities of a federal, provincial, territorial or local government legislative body or executive council, band council or school board as elected or appointed members.. Main duties: Participate in developing or amending government policies, programs or procedures. Leadership: 5. Critical Thinking: 5. Problem Solving: 5

--- Row 2 ---
Occupation: Legislators. Description: Legislators participate in the activities of a federal, provincial, territorial or local government legislative body or executive council, band council or school board as e

In [20]:
text

''