# OaSIS Knowledge base
https://open.canada.ca/data/dataset/eeb3e442-9f19-4d12-8b38-c488fe4f6e5e

In [27]:
import os
import glob
import pandas as pd
from functools import reduce

folder_path = "/Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/OaSIS-selection"
knowledge_path = "/Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/KnowlegdeBase"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

def load_and_standardize(filepath):
    """
    Reads a CSV, renames 'OaSIS profile code' or 'OaSIS Code - Final' to 'oasis_code',
    then groups all rows by oasis_code using appropriate aggregation.
    """
    df = pd.read_csv(filepath)
    possible_cols = {"OaSIS profile code", "OaSIS Code - Final"}
    found_cols = set(df.columns).intersection(possible_cols)

    if not found_cols:
        print(f"[SKIP] No recognized code column in {filepath}")
        return None, os.path.basename(filepath)

    old_col = list(found_cols)[0]
    df.rename(columns={old_col: "oasis_code"}, inplace=True)

    # Remove duplicate columns (e.g., if Label is repeated across datasets)
    df = df.loc[:, ~df.columns.duplicated()]

    # Group by oasis_code
    def aggregate_column(series):
        if pd.api.types.is_numeric_dtype(series):
            return series.mean()
        else:
            return " | ".join(series.dropna().astype(str).unique())

    grouped = df.groupby("oasis_code").agg(aggregate_column).reset_index()
    return grouped, os.path.basename(filepath)

def safe_merge(left_df, right_df, right_name):
    try:
        merged = pd.merge(left_df, right_df, on="oasis_code", how="outer")
        return merged
    except Exception as e:
        print(f"[MERGE ERROR] Could not merge with '{right_name}': {e}")
        return left_df

# Load and unify
df_list = []
name_list = []

for f in csv_files:
    df, name = load_and_standardize(f)
    if df is not None:
        df_list.append(df)
        name_list.append(name)

# Sequentially merge
if not df_list:
    print("No DataFrames to merge.")
else:
    combined_df = df_list[0]
    for i in range(1, len(df_list)):
        combined_df = safe_merge(combined_df, df_list[i], name_list[i])

    print("\n✅ Final merged shape:", combined_df.shape)
    print(combined_df.head())

    # Save final CSV
    os.makedirs(knowledge_path, exist_ok=True)
    output_csv = os.path.join(knowledge_path, "KnowledgeBase.csv")
    combined_df.to_csv(output_csv, index=False)
    print(f"📁 Saved unified dataset to: {output_csv}")



✅ Final merged shape: (900, 55)
   oasis_code                              OaSIS Label - Final_x  \
0        10.0                                        Legislators   
1        11.0           Senior government managers and officials   
2        12.0  Senior managers - financial, communications an...   
3        13.0  Senior managers - health, education, social an...   
4        14.0  Senior managers - trade, broadcasting and othe...   

    Active Learning   Adaptability  Analytical Thinking  Attention to Detail  \
0               4.0            5.0                  5.0                  5.0   
1               4.0            4.0                  5.0                  5.0   
2               4.0            4.0                  5.0                  5.0   
3               5.0            4.0                  5.0                  5.0   
4               4.0            4.0                  5.0                  5.0   

    Creativity  Concern for Others  Collaboration   Independence  ...  \
0   

In [29]:
group = combined_df['OaSIS Label - Final'].unique()
# print(combined_df['OaSIS Label - Final'].nunique())

# Show me the Legislators, main duties by expanding the text so that I can see all of it
combined_df[combined_df['OaSIS Label - Final']=='Legislators']['Main duties'].str.split(',').explode().unique()

KeyError: 'OaSIS Label - Final'

In [30]:
# Show me the columns (all of them) there is to
combined_df.columns

Index(['oasis_code', 'OaSIS Label - Final_x', ' Active Learning',
       ' Adaptability', 'Analytical Thinking', 'Attention to Detail',
       ' Creativity', 'Concern for Others', 'Collaboration', ' Independence',
       'Innovativeness', 'Leadership', 'Social Orientation',
       'Service Orientation', 'Stress Tolerance', 'Concordance number',
       'Job title type', 'Job title text', 'Main duties',
       'OaSIS Label - Final_y', 'Reading Comprehension', 'Writing  ',
       'Numeracy ', ' Digital Literacy',
       'Oral Communication: Active Listening   ',
       'Oral Communication: Oral Comprehension   ',
       'Oral Communication: Oral Expression ', 'Critical Thinking',
       'Decision Making', 'Evaluation', 'Learning and Teaching Strategies',
       'Problem Solving', 'Systems Analysis', 'Digital Production',
       'Preventative Maintenance', 'Equipment and Tool Selection',
       'Operation and Control',
       'Operation Monitoring of Machinery and Equipment',
       'Quali

In [None]:
# import os
# import csv
# from dotenv import load_dotenv
# from typing import Dict

# # Load .env just in case, though not needed here
# load_dotenv()

# CSV_PATH = "/Users/philippebeliveau/Desktop/Notebook/Orientor_project/Orientor_project/data_n_notebook/data/KnowledgeBase/KnowledgeBase.csv"

# def combine_row_text(row: Dict[str, str]) -> str:
#     """Combine relevant columns into a single text string."""
#     text_parts = []

#     if row.get("Label"):
#         text_parts.append(f"Occupation: {row['Label']}")
#     if row.get("Lead statement"):
#         text_parts.append(f"Description: {row['Lead statement']}")
#     if row.get("Main duties"):
#         text_parts.append(f"Main duties: {row['Main duties']}")

#     for skill in ["Creativity", "Leadership", "Digital Literacy", "Critical Thinking", "Problem Solving"]:
#         if row.get(skill):
#             text_parts.append(f"{skill}: {row[skill]}")

#     return ". ".join(text_parts).strip()

# # Read the CSV and preview the first few combined texts
# try:
#     with open(CSV_PATH, "r", encoding="utf-8") as f:
#         reader = csv.DictReader(f)

#         for i, row in enumerate(reader):
#             if not row or not row.get("oasis_code"):
#                 continue  # skip invalid
#             text = combine_row_text(row)
#             print(f"--- Row {i} ---")
#             print(text)
#             print()

#             if i >= 4:  # just preview the first 5 rows
#                 break

# except Exception as e:
#     print(f"Error reading CSV: {e}")
