In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the raw programme workbook into a DataFrame.
# NOTE(review): absolute, user-specific path; the notebook only runs on a machine with this file.
df = pd.read_excel("C:/Users/112363/Downloads/programme_name.xlsx")

In [None]:
# Rename the 'ProgrammeName' column to 'Programme'; the later cells refer to it by that name.
df = df.rename(columns={'ProgrammeName': 'Programme'})

In [None]:
df.head(10)

In [None]:
# Replace missing feature values: empty text for the programme string,
# an explicit 'Unknown' category for the two categorical columns.
df = df.assign(
    Programme=df['Programme'].fillna(''),
    Level=df['Level'].fillna('Unknown'),
    Vertical=df['Vertical'].fillna('Unknown'),
)

In [None]:
# Feature matrix: the programme text plus the two categorical descriptors.
X = df[['Programme', 'Level', 'Vertical']]
# Target column.
# NOTE(review): an earlier cell renames 'ProgrammeName' to 'Programme'. Confirm the workbook
# also contains a separate 'Programme Name' column, otherwise this lookup raises KeyError.
y = df['Programme Name']

In [None]:
# Fit the encoder on the target labels, then map every label to its integer class id.
label_enc = LabelEncoder().fit(y)
y_encoded = label_enc.transform(y)

In [None]:
# TF-IDF over the free-text programme column.
text_step = ('tfidf', TfidfVectorizer(), 'Programme')
# One-hot encoding of the categorical columns; categories unseen at fit time are ignored.
category_step = ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Level', 'Vertical'])

preprocessor = ColumnTransformer(transformers=[text_step, category_step])


In [None]:
# Chain feature extraction and the classifier so both are fitted and applied together.
classifier = LogisticRegression(max_iter=1000)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
accuracy = pipeline.score(X_test, y_test)

In [None]:
print(f"Model Accuracy: {accuracy:.2f}")

In [None]:
pip install joblib

In [None]:
import joblib

# Both artefacts are written to the same model folder.
MODEL_DIR = "c:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/model_programme_name"

# Persist the fitted pipeline (preprocessing + classifier).
joblib.dump(pipeline, MODEL_DIR + "/programme_name_pipeline.pkl")

# Persist the label encoder as well so predicted class ids can be decoded later.
joblib.dump(label_enc, MODEL_DIR + "/label_encoder.pkl")

In [1]:
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import pandas as pd

import os
import warnings

# Connection settings are read from the environment so that no secret is stored
# in the notebook. The non-secret settings fall back to the previous literals.
username = os.environ.get("CMS_SAS_DB_USER", "insights")
host = os.environ.get("CMS_SAS_DB_HOST", "10.99.64.56")
port = os.environ.get("CMS_SAS_DB_PORT", "5432")
database = os.environ.get("CMS_SAS_DB_NAME", "cms_sas")

# The password has no fallback: export CMS_SAS_DB_PASSWORD before starting the kernel.
if "CMS_SAS_DB_PASSWORD" not in os.environ:
    warnings.warn(
        "CMS_SAS_DB_PASSWORD is not set; the database connection will be attempted "
        "with an empty password."
    )
# quote_plus percent-encodes special characters so they cannot break the connection URL.
password = quote_plus(os.environ.get("CMS_SAS_DB_PASSWORD", ""))

In [2]:
# Assemble the SQLAlchemy URL for the psycopg2 driver, then create the engine from it.
connection_url = f"postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_url)

In [3]:
engine

Engine(postgresql+psycopg2://insights:***@10.99.64.56:5432/cms_sas)

In [4]:
# Distinct combinations of the programme descriptor columns (plus intake year and
# entry qualification) from sf_lead_opp_activity; these are the rows to be mapped.
query = """
select distinct programme1,
	    level1,
		vertical1,
        intake_year,
        entry_qualification
from sf_lead_opp_activity"""

In [5]:
# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
programme_name = pd.read_sql(query, engine)

In [6]:
programme_name.head(5)

Unnamed: 0,programme1,level1,vertical1,intake_year,entry_qualification
0,0,Degree - SOP - TU,Pharmacy - TU,2024,
1,0,Degree - TBS - TU,Business - TU,2024,
2,0,Diploma - SOCIT - TC,Computing - TC,2024,
3,0,Foundation - SOE - TC,Engineering - TC,2024,
4,0,Foundation - TDS - TC,Design - TC,2024,


In [7]:
# Snapshot the extract to CSV (without the index); the rule-mapping cells read this file back.
programme_name.to_csv("C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/programme_name.csv", index=False) 

### Define Rules

In [25]:
import re
import pandas as pd

# --- Load files ---
# The mapping workbook is parsed once: passing a list of sheet names makes
# read_excel return a dict of DataFrames keyed by sheet name.
MAPPING_WORKBOOK = r"C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/programme_code_mapping.xlsx"
mapping_sheets = pd.read_excel(
    MAPPING_WORKBOOK,
    sheet_name=["Tier1", "Tier2", "Tier3", "Tier4", "ODL"],
)

first_tier  = mapping_sheets["Tier1"]
second_tier = mapping_sheets["Tier2"]
third_tier  = mapping_sheets["Tier3"]
fourth_tier = mapping_sheets["Tier4"]
odl_tier    = mapping_sheets["ODL"]

# Rows to be mapped (the CSV snapshot written by the extract cell).
data_df = pd.read_csv(r"C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/programme_name.csv")

# --- 0. Normalize column names (strip surrounding whitespace; case is preserved) ---
def normalize_cols(df):
    """Return a copy of ``df`` whose string column labels are stripped of surrounding whitespace.

    Case is left untouched because later lookups use mixed-case names such as
    ``"Programme Name"``. Non-string labels (for example numeric headers read from
    Excel) are kept as they are instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with cleaned column labels.
    """
    df = df.copy()
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    return df

# Clean the column labels of every rule sheet and of the data frame in one pass.
first_tier, second_tier, third_tier, fourth_tier, odl_tier, data_df = (
    normalize_cols(frame)
    for frame in (first_tier, second_tier, third_tier, fourth_tier, odl_tier, data_df)
)

# Make sure the data frame exposes an integer-like `intake_year` column.
if "intake_year" not in data_df.columns:
    # Accept the first common spelling variant that is present.
    alt = next(
        (name for name in ("Intake Year", "intakeyear", "IntakeYear") if name in data_df.columns),
        None,
    )
    if alt is not None:
        data_df = data_df.rename(columns={alt: "intake_year"})
if "intake_year" in data_df.columns:
    # Unparseable values become missing; the nullable Int64 dtype keeps the rest as integers.
    numeric_years = pd.to_numeric(data_df["intake_year"], errors="coerce")
    data_df["intake_year"] = numeric_years.astype("Int64")

# --- 1. Year range parser ---
def get_year_range(year_str):
    """Parse an intake-year cell into an inclusive ``(start_year, end_year)`` tuple.

    Accepted forms are a single year (``"2024"``, ``2024``, ``2024.0``) and a range
    written with a hyphen or an en/em dash (``"2020-2022"``, ``"2020 – 2022"``).
    Float-like text such as ``"2024.0"`` is accepted because Excel columns that
    contain blanks are read as floats.

    Parameters
    ----------
    year_str : str | int | float
        Raw cell value.

    Returns
    -------
    tuple[int, int]
        Start and end year; both are equal for a single year.

    Raises
    ------
    ValueError
        If the value is missing or is not a year or a two-part year range.
    """
    def _to_year(text):
        # Plain integers first; fall back to integral floats such as "2024.0".
        try:
            return int(text)
        except ValueError:
            number = float(text)  # raises ValueError for non-numeric text
            if not number.is_integer():  # also rejects nan / inf
                raise ValueError(f"Not a valid year: {text!r}")
            return int(number)

    text = str(year_str).strip()
    # Treat en and em dashes like a plain hyphen.
    for dash in ("\u2013", "\u2014"):
        text = text.replace(dash, "-")

    parts = [part.strip() for part in text.split("-")]
    if len(parts) > 2 or not all(parts):
        raise ValueError(f"Cannot parse intake year range: {year_str!r}")

    years = [_to_year(part) for part in parts]
    return years[0], years[-1]

def prepare_rule_tier(df_rules):
    """Return a copy of a rule sheet with ``start_year`` / ``end_year`` columns added.

    The columns are derived from ``"Intake Year"`` with ``get_year_range``. Sheets
    that have no ``"Intake Year"`` column, or that already carry ``start_year``,
    are returned as an unchanged copy.

    The ranges are collected in a plain list rather than one ``pd.Series`` per row,
    which is cheaper and also lets a sheet with zero rows pass through.

    Parameters
    ----------
    df_rules : pandas.DataFrame
        Rule or mapping sheet. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_rules``, with the two year columns when they could be derived.
    """
    df = df_rules.copy()
    if "Intake Year" in df.columns and "start_year" not in df.columns:
        ranges = [get_year_range(value) for value in df["Intake Year"]]
        df["start_year"] = [start for start, _ in ranges]
        df["end_year"] = [end for _, end in ranges]
    return df

# Derive the year-window columns for every rule sheet.
first_tier, second_tier, third_tier, fourth_tier, odl_tier = (
    prepare_rule_tier(sheet)
    for sheet in (first_tier, second_tier, third_tier, fourth_tier, odl_tier)
)

# --- 2. Wildmatch mimic (Qlik: '*' any length, '?' single char). Escape other regex chars. ---
def wild_to_regex(p):
    """Translate a Qlik wildcard pattern into an anchored regular-expression string.

    Every character is escaped first, then the escaped ``*`` and ``?`` are turned
    back into ``.*`` (any run of characters) and ``.`` (exactly one character).
    """
    esc = re.escape(str(p))
    esc = esc.replace(r"\*", ".*").replace(r"\?", ".")
    return "^" + esc + "$"

def wildmatch(value, *patterns):
    """Return True if ``value`` matches ANY pattern (full-string, case-insensitive).

    Missing values (``None``, ``NaN``, ``pd.NA``) are treated as the empty string.
    Without that, a missing cell would be stringified to ``"nan"`` / ``"<NA>"`` and
    could match unrelated patterns such as ``"*n*"``.

    Parameters
    ----------
    value : object
        Value to test; it is converted with ``str`` unless it is missing.
    *patterns : str
        Qlik-style wildcard patterns.

    Returns
    -------
    bool
    """
    missing = value is None
    if not missing:
        try:
            missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar input (list/array): not a missing scalar.
            missing = False
    v = "" if missing else str(value)

    for p in patterns:
        if re.fullmatch(wild_to_regex(p), v, flags=re.IGNORECASE):
            return True
    return False

# --- 3. Convert Qlik rule to Python expression ---
def qlik_to_python(rule_str, row):
    """Rewrite a Qlik-style rule into a Python expression that can be evaluated against ``row``.

    Two rewrites are applied, and only OUTSIDE quoted string literals:

    * the logical keywords ``AND`` / ``OR`` / ``NOT`` (any case, whole words only)
      become ``and`` / ``or`` / ``not``;
    * bare identifiers that are column names of ``row`` become ``row['<col>']``.

    Quoted literals are copied through untouched, so a pattern such as
    ``'*FLOOR AND DECOR*'`` or ``'*level1*'`` is no longer altered. Everything else
    (``wildmatch``, parentheses, operators) is kept as written.

    Parameters
    ----------
    rule_str : object
        The rule text; it is converted with ``str``.
    row : pandas.Series
        Data row whose index supplies the known column names.

    Returns
    -------
    str
        The rewritten expression.
    """
    expr = str(rule_str).strip()

    # Longest names first so a longer column wins over a shorter one at the same position.
    # Non-string labels are skipped: a numeric label would also match number literals.
    columns = sorted(
        (c for c in row.index if isinstance(c, str) and c),
        key=len,
        reverse=True,
    )

    # Keywords are matched case-insensitively; column names stay case-sensitive.
    keyword_part = r"\b(?P<kw>(?i:AND|OR|NOT))\b"
    if columns:
        column_part = "(?P<col>" + "|".join(rf"\b{re.escape(c)}\b" for c in columns) + ")"
        token_re = re.compile(column_part + "|" + keyword_part)
    else:
        token_re = re.compile(keyword_part)

    def _rewrite(match):
        keyword = match.group("kw")
        if keyword:
            return keyword.lower()
        # repr() produces a correctly quoted key even if the name contains a quote.
        return f"row[{match.group('col')!r}]"

    # Split into alternating code / quoted-literal pieces; with one capturing group,
    # re.split puts the literals at the odd positions.
    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    pieces = re.split(f"({single_quoted}|{double_quoted})", expr)

    return "".join(
        piece if position % 2 else token_re.sub(_rewrite, piece)
        for position, piece in enumerate(pieces)
    )

# --- 4. Evaluate a single rule expression on a row ---
def evaluate_rule(rule_str, row):
    """Evaluate one rule against one data row and return the result as a bool.

    The rule is rewritten with ``qlik_to_python`` and evaluated with ``wildmatch``
    and ``row`` (as a dict) in scope. A rule that cannot be evaluated still counts
    as "no match", but a ``RuntimeWarning`` is emitted instead of hiding the error,
    so a broken rule no longer silently sends rows to "Unknown".

    Parameters
    ----------
    rule_str : object
        Rule text from the rules sheet.
    row : pandas.Series
        Data row the rule is evaluated against.

    Returns
    -------
    bool
        Truthiness of the rule, or False when evaluation fails.
    """
    import warnings

    expr = qlik_to_python(rule_str, row)
    try:
        # NOTE(security): eval executes the rule text as Python. Only use rule
        # workbooks from a trusted source.
        return bool(eval(expr, {"wildmatch": wildmatch}, {"row": row.to_dict()}))
    except Exception as e:
        warnings.warn(
            f"Rule could not be evaluated and was treated as no match: "
            f"{rule_str!r} -> {expr!r} ({type(e).__name__}: {e})",
            RuntimeWarning,
            stacklevel=2,
        )
        return False

# --- 5. Evaluate a 'rule tier' (with Programme_Code_Rule) exactly like Tier1 ---
def evaluate_rule_tier(df_rules, row, tier_name="Tier"):
    """Return the ``Programme Name`` of the first rule in ``df_rules`` that matches ``row``.

    Rules are tried in sheet order. When a rule carries ``start_year`` and the row
    has a non-missing ``intake_year``, the rule is skipped unless the intake year
    lies inside ``[start_year, end_year]``. ``None`` is returned when the sheet
    lacks ``Programme_Code_Rule`` or ``Programme Name``, or when nothing matches.
    ``tier_name`` is accepted but not used.
    """
    rule_col = "Programme_Code_Rule"
    name_col = "Programme Name"
    if rule_col not in df_rules.columns or name_col not in df_rules.columns:
        return None  # not a rule tier

    for _, rule in df_rules.iterrows():
        has_year_window = "start_year" in rule
        if has_year_window and pd.notna(row.get("intake_year", pd.NA)):
            intake = int(row["intake_year"])
            inside_window = int(rule["start_year"]) <= intake <= int(rule["end_year"])
            if not inside_window:
                continue

        if evaluate_rule(rule[rule_col], row):
            return rule[name_col]

    return None

# --- 6. Evaluate a 'mapping tier' (no Programme_Code_Rule) as fallback ---
def evaluate_mapping_tier(df_map, row):
    """Map ``row`` to a ``Programme Name`` by case-insensitive exact match on key columns.

    Key sets are tried from richest to poorest; the first set that yields a match
    wins and the first matching sheet row supplies the name. A key set is used only
    if all its columns exist in both ``df_map`` and ``row``.

    Missing values never match: a key set is skipped when the row has a missing
    value in any of its keys, and sheet cells that are missing are excluded.
    (Comparing stringified values alone would make ``NaN`` equal to ``NaN`` via
    ``"nan"``.) The mask is built on ``df_map``'s own index, so sheets with a
    non-default index work too.

    Parameters
    ----------
    df_map : pandas.DataFrame
        Mapping sheet; must contain ``"Programme Name"`` to produce a result.
    row : pandas.Series
        Data row to map.

    Returns
    -------
    object or None
        The matched programme name, or ``None`` when nothing matches.
    """
    # Adjust the key priority list to the sheet structure if needed.
    key_priority = [
        ["programme1", "level1", "vertical1"],
        ["programme1", "level1"],
        ["programme1", "vertical1"],
        ["programme1"],
    ]

    if "Programme Name" not in df_map.columns:
        return None

    for keys in key_priority:
        if not all(k in df_map.columns and k in row.index for k in keys):
            continue

        wanted = [row[k] for k in keys]
        if any(pd.isna(value) for value in wanted):
            continue  # a missing key cannot identify a programme

        mask = pd.Series(True, index=df_map.index)
        for k, value in zip(keys, wanted):
            column = df_map[k]
            mask &= column.notna() & column.astype(str).str.lower().eq(str(value).lower())

        match = df_map.loc[mask]
        if not match.empty:
            return match.iloc[0]["Programme Name"]

    return None

# --- 7. Orchestrate across ALL tiers (each tier behaves like Tier1 if it has rules) ---
# (label, sheet) pairs in evaluation order: the first tier that yields a match wins.
TIERS = list(zip(
    ("Tier1", "Tier2", "Tier3", "Tier4", "ODL"),
    (first_tier, second_tier, third_tier, fourth_tier, odl_tier),
))

def find_programme_name(row):
    """Return the programme name from the first tier that matches ``row``, else ``"Unknown"``.

    A tier with a ``Programme_Code_Rule`` column is evaluated with the rule logic;
    any other tier is evaluated with the key-mapping fallback. The label of the
    matching tier (or ``"Unknown"``) is also stored on ``row`` under
    ``"__matched_tier__"``.
    NOTE(review): when called through ``DataFrame.apply`` that annotation most
    likely lands on a temporary row object and does not reach ``data_df`` — confirm.
    """
    for tier_name, tier_df in TIERS:
        is_rule_tier = "Programme_Code_Rule" in tier_df.columns
        result = (
            evaluate_rule_tier(tier_df, row, tier_name=tier_name)
            if is_rule_tier
            else evaluate_mapping_tier(tier_df, row)
        )
        if not result:
            continue

        row["__matched_tier__"] = tier_name  # optional: annotate
        return result

    row["__matched_tier__"] = "Unknown"
    return "Unknown"

# --- Apply to your dataset ---
# Ensure required columns exist in your data (at least programme1 & intake_year for typical rules)
# Fail early when the columns the rules typically depend on are absent.
required = ["programme1", "intake_year"]
available = set(data_df.columns)
missing = [name for name in required if name not in available]
if len(missing) > 0:
    raise ValueError(f"Missing required column(s) in data_df: {missing}")

# Resolve a programme name for every row.
programme_names = data_df.apply(find_programme_name, axis=1)
data_df["Programme_Name"] = programme_names



In [31]:
import re
import pandas as pd

# --- Load files ---
# Parse the mapping workbook once: with a list of sheet names, read_excel
# returns a dict of DataFrames keyed by sheet name.
MAPPING_WORKBOOK = r"C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/programme_code_mapping.xlsx"
mapping_sheets = pd.read_excel(
    MAPPING_WORKBOOK,
    sheet_name=["Tier1", "Tier2", "Tier3", "Tier4", "ODL"],
)

first_tier  = mapping_sheets["Tier1"]
second_tier = mapping_sheets["Tier2"]
third_tier  = mapping_sheets["Tier3"]
fourth_tier = mapping_sheets["Tier4"]
odl_tier    = mapping_sheets["ODL"]

# Rows to be mapped (the CSV snapshot written by the extract cell).
data_df = pd.read_csv(r"C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/programme_name.csv")

# --- 0. Normalize column names ---
def normalize_cols(df):
    """Return a copy of ``df`` whose string column labels are stripped of surrounding whitespace.

    Case is preserved. Non-string labels (for example numeric headers read from
    Excel) are kept unchanged instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with cleaned column labels.
    """
    df = df.copy()
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    return df

# Clean the column labels of the five rule sheets and of the data frame.
first_tier, second_tier, third_tier, fourth_tier, odl_tier, data_df = map(
    normalize_cols,
    (first_tier, second_tier, third_tier, fourth_tier, odl_tier, data_df),
)

# Ensure data has intake_year
if "intake_year" not in data_df.columns:
    # Accept the first common spelling variant that is present.
    alt = next(
        (name for name in ("Intake Year", "intakeyear", "IntakeYear") if name in data_df.columns),
        None,
    )
    if alt is not None:
        data_df = data_df.rename(columns={alt: "intake_year"})
if "intake_year" in data_df.columns:
    # Unparseable values become missing; the nullable Int64 dtype keeps the rest as integers.
    numeric_years = pd.to_numeric(data_df["intake_year"], errors="coerce")
    data_df["intake_year"] = numeric_years.astype("Int64")

# --- 1. Year range parser ---
def get_year_range(year_str):
    """Parse an intake-year cell into an inclusive ``(start_year, end_year)`` tuple.

    Accepted forms are a single year (``"2024"``, ``2024``, ``2024.0``) and a range
    written with a hyphen or an en/em dash (``"2020-2022"``, ``"2020 – 2022"``).
    Float-like text such as ``"2024.0"`` is accepted because Excel columns that
    contain blanks are read as floats.

    Parameters
    ----------
    year_str : str | int | float
        Raw cell value.

    Returns
    -------
    tuple[int, int]
        Start and end year; both are equal for a single year.

    Raises
    ------
    ValueError
        If the value is missing or is not a year or a two-part year range.
    """
    def _to_year(text):
        # Plain integers first; fall back to integral floats such as "2024.0".
        try:
            return int(text)
        except ValueError:
            number = float(text)  # raises ValueError for non-numeric text
            if not number.is_integer():  # also rejects nan / inf
                raise ValueError(f"Not a valid year: {text!r}")
            return int(number)

    text = str(year_str).strip()
    # Treat en and em dashes like a plain hyphen.
    for dash in ("\u2013", "\u2014"):
        text = text.replace(dash, "-")

    parts = [part.strip() for part in text.split("-")]
    if len(parts) > 2 or not all(parts):
        raise ValueError(f"Cannot parse intake year range: {year_str!r}")

    years = [_to_year(part) for part in parts]
    return years[0], years[-1]

def prepare_rule_tier(df_rules):
    """Return a copy of a rule sheet with ``start_year`` / ``end_year`` columns added.

    The columns are derived from ``"Intake Year"`` with ``get_year_range``. Sheets
    that have no ``"Intake Year"`` column, or that already carry ``start_year``,
    are returned as an unchanged copy.

    The ranges are collected in a plain list rather than one ``pd.Series`` per row,
    which is cheaper and also lets a sheet with zero rows pass through.
    """
    df = df_rules.copy()
    if "Intake Year" in df.columns and "start_year" not in df.columns:
        ranges = [get_year_range(value) for value in df["Intake Year"]]
        df["start_year"] = [start for start, _ in ranges]
        df["end_year"] = [end for _, end in ranges]
    return df

# Derive the year-window columns for each of the five rule sheets.
first_tier, second_tier, third_tier, fourth_tier, odl_tier = map(
    prepare_rule_tier,
    (first_tier, second_tier, third_tier, fourth_tier, odl_tier),
)

# --- 2. Wildmatch mimic ---
def wild_to_regex(p):
    """Translate a Qlik wildcard pattern into an anchored regular-expression string.

    Every character is escaped first, then the escaped ``*`` and ``?`` are turned
    back into ``.*`` (any run of characters) and ``.`` (exactly one character).
    """
    esc = re.escape(str(p))
    esc = esc.replace(r"\*", ".*").replace(r"\?", ".")
    return "^" + esc + "$"

def wildmatch(value, *patterns):
    """Return True if ``value`` matches ANY pattern (full-string, case-insensitive).

    Missing values (``None``, ``NaN``, ``pd.NA``) are treated as the empty string.
    Without that, a missing cell would be stringified to ``"nan"`` / ``"<NA>"`` and
    could match unrelated patterns such as ``"*n*"``.
    """
    missing = value is None
    if not missing:
        try:
            missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar input (list/array): not a missing scalar.
            missing = False
    v = "" if missing else str(value)

    for p in patterns:
        if re.fullmatch(wild_to_regex(p), v, flags=re.IGNORECASE):
            return True
    return False

# --- 3. Convert Qlik rule to Python expression ---
def qlik_to_python(rule_str, row):
    """Rewrite a Qlik-style rule into a Python expression that can be evaluated against ``row``.

    Two rewrites are applied, and only OUTSIDE quoted string literals:

    * the logical keywords ``AND`` / ``OR`` / ``NOT`` (any case, whole words only)
      become ``and`` / ``or`` / ``not``;
    * bare identifiers that are column names of ``row`` become ``row['<col>']``.

    Quoted literals are copied through untouched, so a pattern such as
    ``'*FLOOR AND DECOR*'`` or ``'*level1*'`` is no longer altered.

    Parameters
    ----------
    rule_str : object
        The rule text; it is converted with ``str``.
    row : pandas.Series
        Data row whose index supplies the known column names.

    Returns
    -------
    str
        The rewritten expression.
    """
    expr = str(rule_str).strip()

    # Longest names first so a longer column wins over a shorter one at the same position.
    # Non-string labels are skipped: a numeric label would also match number literals.
    columns = sorted(
        (c for c in row.index if isinstance(c, str) and c),
        key=len,
        reverse=True,
    )

    # Keywords are matched case-insensitively; column names stay case-sensitive.
    keyword_part = r"\b(?P<kw>(?i:AND|OR|NOT))\b"
    if columns:
        column_part = "(?P<col>" + "|".join(rf"\b{re.escape(c)}\b" for c in columns) + ")"
        token_re = re.compile(column_part + "|" + keyword_part)
    else:
        token_re = re.compile(keyword_part)

    def _rewrite(match):
        keyword = match.group("kw")
        if keyword:
            return keyword.lower()
        # repr() produces a correctly quoted key even if the name contains a quote.
        return f"row[{match.group('col')!r}]"

    # Split into alternating code / quoted-literal pieces; with one capturing group,
    # re.split puts the literals at the odd positions.
    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    pieces = re.split(f"({single_quoted}|{double_quoted})", expr)

    return "".join(
        piece if position % 2 else token_re.sub(_rewrite, piece)
        for position, piece in enumerate(pieces)
    )

# --- 4. Evaluate a single rule ---
def evaluate_rule(rule_str, row):
    """Evaluate one rule against one data row and return the result as a bool.

    The rule is rewritten with ``qlik_to_python`` and evaluated with ``wildmatch``
    and ``row`` (as a dict) in scope. A rule that cannot be evaluated still counts
    as "no match", but a ``RuntimeWarning`` is emitted instead of hiding the error,
    so a broken rule no longer silently sends rows to "Unknown".
    """
    import warnings

    expr = qlik_to_python(rule_str, row)
    try:
        # NOTE(security): eval executes the rule text as Python. Only use rule
        # workbooks from a trusted source.
        return bool(eval(expr, {"wildmatch": wildmatch}, {"row": row.to_dict()}))
    except Exception as e:
        warnings.warn(
            f"Rule could not be evaluated and was treated as no match: "
            f"{rule_str!r} -> {expr!r} ({type(e).__name__}: {e})",
            RuntimeWarning,
            stacklevel=2,
        )
        return False

# --- 5. Evaluate a 'rule tier' ---
def evaluate_rule_tier(df_rules, row):
    """Return the ``Programme Name`` of the first rule in ``df_rules`` that matches ``row``.

    Rules are tried in sheet order. When a rule carries ``start_year`` and the row
    has a non-missing ``intake_year``, the rule is skipped unless the intake year
    lies inside ``[start_year, end_year]``. ``None`` is returned when a required
    column is absent from the sheet or when no rule matches.
    """
    needed = ("Programme_Code_Rule", "Programme Name")
    if any(column not in df_rules.columns for column in needed):
        return None

    for _, rule in df_rules.iterrows():
        if "start_year" in rule and pd.notna(row.get("intake_year", pd.NA)):
            intake = int(row["intake_year"])
            if intake < int(rule["start_year"]) or intake > int(rule["end_year"]):
                continue

        matched = evaluate_rule(rule["Programme_Code_Rule"], row)
        if matched:
            return rule["Programme Name"]

    return None

# --- 6. Evaluate a 'mapping tier' ---
def evaluate_mapping_tier(df_map, row):
    """Map ``row`` to a ``Programme Name`` by case-insensitive exact match on key columns.

    Key sets are tried from richest to poorest; the first set that yields a match
    wins and the first matching sheet row supplies the name. A key set is used only
    if all its columns exist in both ``df_map`` and ``row``.

    Missing values never match: a key set is skipped when the row has a missing
    value in any of its keys, and sheet cells that are missing are excluded.
    (Comparing stringified values alone would make ``NaN`` equal to ``NaN`` via
    ``"nan"``.) The mask is built on ``df_map``'s own index, so sheets with a
    non-default index work too.

    Returns the matched programme name, or ``None`` when nothing matches or the
    sheet has no ``"Programme Name"`` column.
    """
    key_priority = [
        ["programme1", "level1", "vertical1"],
        ["programme1", "level1"],
        ["programme1", "vertical1"],
        ["programme1"],
    ]
    if "Programme Name" not in df_map.columns:
        return None

    for keys in key_priority:
        if not all(k in df_map.columns and k in row.index for k in keys):
            continue

        wanted = [row[k] for k in keys]
        if any(pd.isna(value) for value in wanted):
            continue  # a missing key cannot identify a programme

        mask = pd.Series(True, index=df_map.index)
        for k, value in zip(keys, wanted):
            column = df_map[k]
            mask &= column.notna() & column.astype(str).str.lower().eq(str(value).lower())

        match = df_map.loc[mask]
        if not match.empty:
            return match.iloc[0]["Programme Name"]

    return None

# --- 7. Tiers list ---
# Sheets in evaluation order: the first tier that yields a match wins.
TIERS = [first_tier, second_tier, third_tier, fourth_tier]
TIERS.append(odl_tier)

# --- 8. Find programme details ---
def find_programme_details(row):
    """Return ``(programme_name, programme_code)`` for ``row`` from the first matching tier.

    A tier with a ``Programme_Code_Rule`` column is evaluated with the rule logic;
    any other tier is evaluated with the key-mapping fallback. The code is looked
    up in the same tier by programme name; if several distinct codes exist, the
    alphabetically first (as text) is used.

    The code is ``None`` when the tier has no ``"Programme Code"`` column or when
    every code for the matched name is missing (indexing an empty result there
    would raise ``IndexError``).

    Returns ``("Unknown", None)`` when no tier matches.
    """
    for tier_df in TIERS:
        if "Programme_Code_Rule" in tier_df.columns:
            result_name = evaluate_rule_tier(tier_df, row)
        else:
            result_name = evaluate_mapping_tier(tier_df, row)

        if not result_name:
            continue

        if "Programme Code" not in tier_df.columns:
            return result_name, None

        codes = (
            tier_df.loc[tier_df["Programme Name"] == result_name, "Programme Code"]
            .dropna()
            .unique()
        )
        if len(codes) == 0:
            return result_name, None

        # NOTE(review): numeric codes are stringified as-is, so a float column
        # yields text such as "95604.0" — confirm whether that format is wanted.
        # With a single code, sorting simply returns that code.
        return result_name, sorted(map(str, codes))[0]

    return "Unknown", None

# --- 9. Apply to dataset ---
# Fail early when the columns the rules typically depend on are absent.
required = ["programme1", "intake_year"]
available = set(data_df.columns)
missing = [name for name in required if name not in available]
if len(missing) > 0:
    raise ValueError(f"Missing required column(s): {missing}")


def _details_as_series(row):
    """Wrap the (name, code) tuple in a Series so that apply expands it into two columns."""
    return pd.Series(find_programme_details(row))


data_df[["Programme_Name", "Programme_Code"]] = data_df.apply(_details_as_series, axis=1)

# --- Optional: save ---
# data_df.to_csv(r"C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/programme_name_with_codes.csv", index=False)


In [32]:
data_df.head(10)

Unnamed: 0,programme1,level1,vertical1,intake_year,entry_qualification,Programme_Name,Programme_Code
0,0,Degree - SOP - TU,Pharmacy - TU,2024,,Bachelor of Pharmacy (Honours),95604.0
1,0,Degree - TBS - TU,Business - TU,2024,,Bachelor of Business (Honours),93601.0
2,0,Diploma - SOCIT - TC,Computing - TC,2024,,Diploma in Information Technology,83002.0
3,0,Foundation - SOE - TC,Engineering - TC,2024,,Foundation in Engineering,82007.0
4,0,Foundation - TDS - TC,Design - TC,2024,,Foundation in Design,82006.0
5,0,Others - TC,,2024,,Unknown,
6,0,Postgraduate PhD - SLAS - TU,Liberal Arts and Sciences - TU,2024,,Doctor of Philosophy in Science,14805.0
7,0,Pre-U - CAL - TC,Pre-U - TC,2024,,General Certificate of Education - Advanced Le...,82009.0
8,0,Undergraduate,Education - TU,2024,Diploma,Bachelor of Education (Honours),91606.0
9,0,Undergraduate,Education - TU,2024,,Unknown,


In [26]:
data_df['Programme_Name'].value_counts()

Programme_Name
Unknown                                                                                      3617
General Certificate of Education - Advanced Level (GCE A Level)                              1617
Bachelor of Business (Honours)                                                               1322
Bachelor of Medicine, Bachelor of Surgery - MBBS                                             1165
Bachelor of Computer Science (Honours)                                                       1146
                                                                                             ... 
MicroCred (BPA)                                                                                 4
MicroCred (MIHM)                                                                                4
Bachelor of International Tourism Management (Honours) (Travel and Recreation Management)       2
MicroCred (MTL)                                                                                 2
Micro

In [30]:
# Export the mapped rows to Excel without the index.
data_df.to_excel("C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/programme_name_mapped_2.xlsx", index=False)

In [23]:
# Export the rules table to Excel without the index.
# NOTE(review): `rules_df` is not defined anywhere in the visible notebook; this cell
# presumably relies on leftover kernel state and would fail after Restart & Run All — confirm.
rules_df.to_excel("C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/rules.xlsx", index=False)

### Get Other Sources


In [14]:
# Load the pmf_main workbook; it is joined onto the mapped rows by programme name further down.
pmf_main = pd.read_excel("C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/pmf_main.xlsx")

In [15]:
data_df.head(1)

Unnamed: 0,programme1,level1,vertical1,intake_year,entry_qualification,Programme_Name
0,0,Degree - SOP - TU,Pharmacy - TU,2024,,Unknown


In [16]:
pmf_main.head(1)

Unnamed: 0,Programme Name,Prog.Acronym,School,Faculty,Vertical,Level,UG/PG/TC,Campus
0,ACCA (IRC),ACCA (IRC),SDPS,TC,Professional Studies,Professional Studies,TC,TC


In [17]:
# Column renames that bring both frames to snake_case and give them a shared
# 'programme_name' column for the merge.
rename_dict_data_df = {'Programme_Name':'programme_name'}
# NOTE(review): 'ug_pg_tcm' looks like it may be a typo for 'ug_pg_tc' — confirm before
# changing it, since the name ends up in the exported file.
rename_dict_pmf_main = {"Programme Name":'programme_name',
                        "Prog.Acronym":'programme_acronym',
                        "School":'school',
                        "Faculty":'faculty',
                        "Vertical":'vertical',
                        "Level":'level',
                        "UG/PG/TC":'ug_pg_tcm',
                        "Campus":'campus'}

In [18]:
# Apply the rename maps; columns that are not listed keep their names.
data_df, pmf_main = (
    data_df.rename(columns=rename_dict_data_df),
    pmf_main.rename(columns=rename_dict_pmf_main),
)

In [19]:
# Left join: every mapped row is kept and the pmf_main attributes are attached by programme name.
# NOTE(review): if pmf_main holds several rows for one programme_name, the join will
# repeat the corresponding data rows — verify that the key is unique in pmf_main.
data_df_1 = pd.merge(data_df, pmf_main, on='programme_name', how='left')

In [20]:
data_df_1

Unnamed: 0,programme1,level1,vertical1,intake_year,entry_qualification,programme_name,programme_acronym,school,faculty,vertical,level,ug_pg_tcm,campus
0,0,Degree - SOP - TU,Pharmacy - TU,2024,,Unknown,,,,,,,
1,0,Degree - TBS - TU,Business - TU,2024,,Unknown,,,,,,,
2,0,Diploma - SOCIT - TC,Computing - TC,2024,,Unknown,,,,,,,
3,0,Foundation - SOE - TC,Engineering - TC,2024,,Unknown,,,,,,,
4,0,Foundation - TDS - TC,Design - TC,2024,,Unknown,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63689,,,,2025,Year 12,Unknown,,,,,,,
63690,,,,2025,Yemen,Unknown,,,,,,,
63691,,,,2025,Zambia,Unknown,,,,,,,
63692,,,,2025,Zimbabwe,Unknown,,,,,,,


In [21]:
# Export the merged result to Excel without the index.
data_df_1.to_excel("C:/Users/112363/OneDrive - Taylor's Education Group/DWH_WIP/PEMO/programme_name_mapped.xlsx", index=False)