## Setup

In [None]:
!pip install datasets scikit-learn pandas numpy spacy xgboost wikidata -q
!python -m spacy download en_core_web_sm -q

In [None]:
import pandas as pd
import numpy as np
import time
import spacy
import re
from collections import Counter
import joblib
from datetime import datetime
import warnings
import os
import pickle
from google.colab import drive
from sklearn.metrics import confusion_matrix
try:
    from wikidata.client import Client
    from wikidata.entity import EntityId
    print("Wikidata library imported successfully.")
except ImportError:
    print("Wikidata library not found. Please ensure 'pip install Wikidata' succeeded.")
    Client = None

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb

from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive and derive every project folder from one shared base path.
drive.mount('/content/drive', force_remount=True)
base_drive_path = '/content/drive/MyDrive/Bert&Ernie_shared_folder/'
data_path = os.path.join(base_drive_path, 'data')
models_path = os.path.join(base_drive_path, 'models')
results_path = os.path.join(base_drive_path, 'results')
plot_save_dir = os.path.join(results_path, 'plots_no_transformer')

# Create the folders up front: later cells write the Wikidata cache, the
# trained models and the predictions here, and those writes only print an
# error (losing the artefact) when the target directory does not exist.
for _project_dir in (data_path, models_path, results_path, plot_save_dir):
    os.makedirs(_project_dir, exist_ok=True)

In [None]:
# Silence UserWarnings raised from sklearn modules and FutureWarnings raised
# from xgboost modules; other warnings are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
warnings.filterwarnings("ignore", category=FutureWarning, module='xgboost')

# Load the spaCy pipeline used for NER features. If loading raises OSError
# (the model package is not installed), download it once and load it again.
try:
    nlp = spacy.load("en_core_web_sm") # This model is NOT transformer based
    print("spaCy model loaded successfully.")
except OSError:
    print('Downloading spaCy model...')
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model downloaded and loaded.")

In [None]:
!huggingface-cli login --token #INSERT TOKEN HERE

## Dataset

In [None]:
# Download the dataset from the Hugging Face Hub and convert its train and
# validation splits to pandas DataFrames for feature engineering.
print("Loading dataset...")
dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')
train_data = dataset["train"]
val_data = dataset["validation"]
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)
print(f"Train data shape: {train_df.shape}")
print(f"Validation data shape: {val_df.shape}")

In [None]:
def combine_text(row):
    """Join an item's descriptive fields into one space-separated string.

    Args:
        row: Mapping-like record supporting ``.get`` (a DataFrame row or a
            plain dict).

    Returns:
        The values of ``name``, ``description``, ``type``, ``category`` and
        ``subcategory`` (in that order) joined by single spaces. Fields that
        are absent, ``None``, NaN or empty are skipped.
    """
    text_columns = ['name', 'description', 'type', 'category', 'subcategory']
    parts = []
    for col in text_columns:
        value = row.get(col, '')
        # Skip missing values explicitly: str(None) / str(nan) would otherwise
        # leak the literal tokens "None" / "nan" into the text features
        # (pd.read_csv yields NaN for empty cells).
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value)
        if text:
            parts.append(text)
    return " ".join(parts)

In [None]:
# Build the single text field that the TF-IDF and NER/lexicon features use.
print("Combining text fields...")
train_df['combined_text'] = train_df.apply(combine_text, axis=1)
val_df['combined_text'] = val_df.apply(combine_text, axis=1)

# String label -> integer class id, plus the reverse mapping for decoding
# predictions. target_names follows the id order (0, 1, 2).
label_map = {'cultural agnostic': 0, 'cultural exclusive': 1, 'cultural representative': 2}
inverse_label_map = {v: k for k, v in label_map.items()}
target_names = list(label_map.keys())

# NOTE(review): a label string missing from label_map maps to NaN here.
train_df['label_id'] = train_df['label'].map(label_map)
val_df['label_id'] = val_df['label'].map(label_map)

y_train = train_df['label_id'].values
y_val = val_df['label_id'].values

print("Data preprocessing complete.")
# Client is None when the wikidata import failed; in that case every
# Wikidata lookup falls back to default feature values.
wikidata_client = Client() if Client else None
# In-memory QID -> feature-dict cache (replaced by the on-disk cache later
# in the notebook if one exists).
wikidata_cache = {}

### Wikidata query

In [None]:
def get_qid_from_url(url):
    """Extract the last path segment of a Wikidata entity URL.

    Returns ``None`` unless ``url`` is a string containing
    ``wikidata.org/entity/Q``; otherwise returns everything after the final
    ``/`` (e.g. ``'Q42'``).
    """
    is_entity_url = isinstance(url, str) and 'wikidata.org/entity/Q' in url
    if not is_entity_url:
        return None
    _, _, last_segment = url.rpartition('/')
    return last_segment

def parse_wikidata_date(date_str):
    """Extract the signed year from a Wikidata time string.

    Args:
        date_str: Time string such as ``'+1952-03-11T00:00:00Z'`` or
            ``'-0500-01-01T00:00:00Z'``.

    Returns:
        The year as an int, or ``None`` when the input is not a non-empty
        string, has no leading year, or the year is 0 or not later than
        -4000.
    """
    if not date_str or not isinstance(date_str, str):
        return None
    # Capture the whole year, not just its first four digits: years may have
    # more than four digits, and truncating e.g. "-13798000000" to "-1379"
    # would slip a bogus year past the range filter below.
    match = re.match(r'\+?(-?\d{4,})', date_str)
    if match is None:
        return None
    try:
        year = int(match.group(1))
    except ValueError:
        # Pathologically long digit runs can exceed int()'s conversion limit.
        return None
    if year > -4000 and year != 0:
        return year
    return None

def _extract_inception_year(claims):
    """Return the year of the first P571 (else P577) claim, or None.

    P577 is consulted only when P571 is absent from ``claims``; only the
    first claim of the chosen property is inspected.
    """
    if 'P571' in claims:
        date_prop = 'P571'
    elif 'P577' in claims:
        date_prop = 'P577'
    else:
        return None

    try:
        claim_list = claims.get(date_prop, [])
        if not claim_list:
            return None
        datavalue = claim_list[0].get('mainsnak', {}).get('datavalue', {})
        if not datavalue or datavalue.get('type') != 'time':
            return None
        time_string = datavalue.get('value', {}).get('time')
        return parse_wikidata_date(time_string) if time_string else None
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unexpected claim structure: treat the date as unknown.
        return None


def get_wikidata_features(qid):
    """Fetch sitelink count, origin flag and age for one Wikidata entity.

    Args:
        qid: Entity id string (e.g. ``'Q42'``); falsy values are accepted.

    Returns:
        A dict with keys ``sitelink_count`` (int), ``has_country_of_origin``
        (0/1, set when the entity has at least one P495 claim) and
        ``item_age`` (current year minus the P571/P577 year, clamped at 0;
        NaN when no usable date exists). Default values (0, 0, NaN) are
        returned when ``qid`` is falsy, no client is available, or the
        lookup fails. Successful lookups are stored in ``wikidata_cache``;
        failed ones are not, so they are retried on the next call.
    """
    default_features = {'sitelink_count': 0, 'has_country_of_origin': 0, 'item_age': np.nan}
    if not qid or not wikidata_client:
        return default_features

    if qid in wikidata_cache:
        return wikidata_cache[qid]

    current_year = datetime.now().year

    try:
        entity_data = wikidata_client.get(EntityId(qid), load=True).data

        sitelinks = 0
        if 'sitelinks' in entity_data and isinstance(entity_data['sitelinks'], dict):
            sitelinks = len(entity_data['sitelinks'])

        claims = entity_data.get('claims', {})
        has_origin = 1 if 'P495' in claims and len(claims['P495']) > 0 else 0

        item_age = np.nan
        inception_year = _extract_inception_year(claims)
        if inception_year is not None:
            # Dates in the future are clamped to an age of 0.
            item_age = max(current_year - inception_year, 0)

    except Exception as e:
        # Best-effort lookup: keep going with defaults, but report the
        # failure instead of hiding it.
        print(f"Warning: Wikidata lookup failed for {qid}: {e}")
        return default_features

    result = {'sitelink_count': sitelinks, 'has_country_of_origin': has_origin, 'item_age': item_age}
    wikidata_cache[qid] = result
    # Small pause after each real request to avoid hammering the API.
    time.sleep(0.05)
    return result

### Feature Extraction

In [None]:
# Lower-case words/phrases naming nationalities, places, peoples and religions.
# Every entry must be unique: the feature extractor adds one to the
# nationality total per matching list entry, so a repeated keyword would be
# counted twice.
nationality_keywords = [
    'italian', 'italy', 'roman', 'rome', 'florence', 'venice', 'sicilian', 'neapolitan', 'lazio',
    'french', 'france', 'paris', 'provence', 'gaulish', 'norman',
    'german', 'germany', 'bavarian', 'prussian', 'berlin',
    'spanish', 'spain', 'catalan', 'andalusian', 'castilian', 'madrid', 'barcelona',
    'british', 'britain', 'uk', 'english', 'scottish', 'welsh', 'irish', 'london', 'celtic',
    'greek', 'greece', 'hellenic', 'athenian', 'spartan', 'crete', 'byzantine',
    'russian', 'russia', 'slavic', 'moscow', 'soviet',
    'portuguese', 'portugal', 'lisbon',
    'dutch', 'netherlands', 'amsterdam',
    'belgian', 'belgium',
    'swiss', 'switzerland',
    'austrian', 'austria', 'viennese',
    'swedish', 'sweden', 'norwegian', 'norway', 'finnish', 'finland', 'scandinavian', 'viking',
    'polish', 'poland', 'hungarian', 'hungary', 'czech',
    'european',
    'chinese', 'china', 'mandarin', 'cantonese', 'beijing', 'shanghai', 'han', 'tang', 'ming', 'qing',
    'japanese', 'japan', 'tokyo', 'kyoto', 'edo', 'samurai', 'shinto', 'zen',
    'korean', 'korea', 'seoul',
    'indian', 'india', 'hindi', 'sanskrit', 'mughal', 'vedic', 'hindu', 'buddhist', 'delhi', 'mumbai', 'bengali',
    'thai', 'thailand', 'vietnamese', 'vietnam',
    'indonesian', 'indonesia', 'malaysian', 'malaysia',
    'filipino', 'philippines',
    'turkish', 'turkey', 'ottoman', 'istanbul',
    'persian', 'iran', 'iranian', 'farsi',
    'arab', 'arabic', 'arabian',
    'israeli', 'israel', 'hebrew', 'jewish',
    'asian',
    'american', 'usa', 'us', 'new york', 'hollywood',
    'canadian', 'canada',
    'mexican', 'mexico', 'aztec', 'mayan', 'nahuatl',
    'cuban', 'cuba',
    'brazilian', 'brazil', 'rio',
    'argentinian', 'argentine', 'buenos aires',
    'peruvian', 'peru', 'inca', 'quechua',
    'colombian', 'colombia',
    'native american', 'indigenous american',
    'egyptian', 'egypt', 'cairo', 'pharaoh', 'ancient egyptian',
    'moroccan', 'morocco',
    'ethiopian', 'ethiopia',
    'nigerian', 'nigeria', 'yoruba', 'igbo',
    'kenyan', 'kenya',
    'south african', 'south africa', 'zulu',
    'african',
    'australian', 'australia', 'aboriginal australian',
    'new zealander', 'new zealand', 'maori',
    'polynesian', 'hawaiian', 'samoan', 'tongan',
    # 'jewish', 'buddhist' and 'hindu' already appear above and are not repeated here.
    'islamic', 'muslim', 'christian', 'catholic', 'protestant', 'orthodox', 'judaism',
    'gypsy', 'roma',
    'latin',
]

# Lexicon for the 'cultural' keyword group: tradition, heritage, folklore,
# locality, crafts and arts vocabulary. Entries are lower-case because they
# are matched against lower-cased text.
cultural_keywords = [
    'traditional', 'tradition', 'custom', 'customary', 'ritual', 'rite', 'ceremony', 'ceremonial',
    'heritage', 'historical', 'ancient', 'classical', 'medieval', 'renaissance',
    'folk', 'folklore', 'myth', 'legend', 'mythology', 'sacred', 'holy', 'religious', 'spiritual', 'belief',
    'indigenous', 'native', 'vernacular', 'dialect',
    'regional', 'local', 'provincial', 'ethnic', 'ethnicity',
    'artisan', 'handcrafted', 'handmade', 'guild',
    'cuisine', 'recipe', 'gastronomy',
    'art', 'music', 'dance', 'theatre', 'literature', 'architecture', 'philosophy',
    'style', 'genre', 'form', 'technique',
    'unique', 'distinctive', 'specific to', 'characteristic', 'endemic',
    'social', 'societal', 'community', 'tribe', 'clan', 'caste', 'dynasty', 'kingdom', 'empire',
    'symbol', 'symbolic', 'iconic'
]

# Lexicon for the 'global' keyword group: universality, modernity, science
# and technology vocabulary.
global_keywords = [
    'global', 'worldwide', 'world', 'international', 'universal', 'ubiquitous', 'widespread', 'transnational',
    'common', 'standard', 'standardized', 'basic', 'fundamental', 'essential', 'general', 'generic',
    'modern', 'contemporary', 'current', 'recent', 'new',
    'popular', 'famous', 'well-known',
    'scientific', 'science', 'technological', 'technology', 'digital', 'electronic', 'mechanical', 'engineering',
    'mathematical', 'mathematics', 'physics', 'chemistry', 'biology', 'medical', 'computational',
    'concept', 'idea', 'theory', 'principle', 'method', 'system', 'structure', 'process', 'framework', 'model',
    'tool', 'instrument', 'device', 'machine', 'vehicle',
    'data', 'information', 'analysis', 'measurement',
    'human', 'person', 'people'
]

# Lexicon for the 'representative' keyword group: adaptation, influence and
# diffusion vocabulary.
representative_keywords = [
    'popularized', 'adapted', 'adaptation', 'variation', 'variant', 'fusion', 'hybrid',
    'influenced', 'inspired', 'derived',
    'spread', 'exported', 'imported', 'adopted', 'introduced',
    'version', 'interpretation', 'style of'
]


def extract_ner_lexicon_features(text):
    """Build the NER + lexicon feature mapping for one text.

    The text is run through the spaCy pipeline ``nlp`` and matched against
    the four keyword lists. The returned ``Counter`` holds:

    * ``ner_<LABEL>_count`` for each tracked entity label that occurs, plus
      ``ner_total_entities`` and ``ner_total_geo_nat_entities``
      (GPE/NORP/LOC only);
    * ``lex_<group>_keyword_<kw>`` = 1 for every keyword found as a whole
      word in the lower-cased text, plus ``lex_total_<group>``;
    * character/token lengths, per-token ratios, and nationality/cultural
      versus global differences and ratios.
    """
    doc = nlp(text)
    features = Counter()

    # ---- Named entities -------------------------------------------------
    tracked_labels = ['GPE', 'LOC', 'NORP', 'ORG', 'FAC', 'EVENT', 'LANGUAGE', 'PERSON', 'WORK_OF_ART', 'PRODUCT']
    geo_nat_labels = ['GPE', 'NORP', 'LOC']
    tracked_ent_labels = [ent.label_ for ent in doc.ents if ent.label_ in tracked_labels]
    for label in tracked_ent_labels:
        features[f'ner_{label}_count'] += 1
    ner_entity_count = len(tracked_ent_labels)
    ner_nationality_group_count = sum(1 for label in tracked_ent_labels if label in geo_nat_labels)

    features['ner_total_entities'] = ner_entity_count
    features['ner_total_geo_nat_entities'] = ner_nationality_group_count

    # ---- Keyword lexicons ------------------------------------------------
    lowered = text.lower()
    lexicons = (
        ('cultural', cultural_keywords),
        ('nationality', nationality_keywords),
        ('global', global_keywords),
        ('representative', representative_keywords),
    )
    hits = {}
    for group, vocabulary in lexicons:
        # One hit per list entry that occurs as a whole word/phrase.
        matched = [kw for kw in vocabulary if re.search(r'\b' + re.escape(kw) + r'\b', lowered)]
        for kw in matched:
            features[f'lex_{group}_keyword_{kw}'] = 1
        hits[group] = len(matched)

    group_order = ('cultural', 'nationality', 'global', 'representative')
    for group in group_order:
        features[f'lex_total_{group}'] = hits[group]

    # ---- Length and ratio features --------------------------------------
    n_tokens = len(doc)
    features['text_length_chars'] = len(text)
    features['text_length_tokens'] = n_tokens

    # Guard against division by zero for empty documents.
    denominator = n_tokens if n_tokens > 0 else 1
    for group in group_order:
        features[f'ratio_{group}_keywords'] = hits[group] / denominator
    features['ratio_ner_entities'] = ner_entity_count / denominator
    features['ratio_ner_geo_nat_entities'] = ner_nationality_group_count / denominator

    # ---- Local-vs-global contrasts --------------------------------------
    global_hits = hits['global']
    features['diff_nationality_global'] = hits['nationality'] - global_hits
    features['diff_cultural_global'] = hits['cultural'] - global_hits
    # The 1e-6 term only prevents division by zero when no global keyword matched.
    features['ratio_nationality_vs_global'] = hits['nationality'] / (global_hits + 1e-6)
    features['ratio_cultural_vs_global'] = hits['cultural'] / (global_hits + 1e-6)

    return features

In [None]:
# On-disk location of the pickled QID -> Wikidata-feature cache.
CACHE_FILE = os.path.join(data_path, 'wikidata_feature_cache.pkl')

# Load the cache if it exists; otherwise start empty. If loading fails for
# any reason, CACHE_FILE is set to None so the later save step is skipped.
# NOTE(review): pickle.load can execute arbitrary code; only use cache files
# from a trusted source.
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, 'rb') as f:
            wikidata_cache = pickle.load(f)
        print(f"Loaded Wikidata feature cache from {CACHE_FILE}, {len(wikidata_cache)} entries.")
    else:
        wikidata_cache = {}
        print(f"No existing cache found at {CACHE_FILE}. Starting fresh.")
except Exception as e:
    print(f"Error loading cache from {CACHE_FILE}: {e}. Using in-memory cache only for this session.")
    CACHE_FILE = None
    wikidata_cache = {}

In [None]:
print("\nProcessing Wikidata features (fetching missing, using cache)...")
# Module-level counters updated by apply_wikidata_features via `global`.
wikidata_fetch_errors = 0
wikidata_cache_hits = 0
wikidata_newly_fetched = 0

# Derive the QID from the 'item' URL column unless a 'qid' column is
# already present (e.g. when this cell is re-run).
if 'qid' not in train_df.columns:
     train_df['qid'] = train_df['item'].apply(get_qid_from_url)
if 'qid' not in val_df.columns:
     val_df['qid'] = val_df['item'].apply(get_qid_from_url)

def apply_wikidata_features(qid_series):
    """Return one Wikidata feature dict per entry of ``qid_series``.

    QIDs already in ``wikidata_cache`` are served from it; the remaining
    distinct QIDs are fetched once each via ``get_wikidata_features`` (when a
    client is available). Entries with a falsy QID, or whose lookup did not
    produce a cache entry, get default features (0, 0, NaN).

    Side effects: updates the module-level counters ``wikidata_cache_hits``,
    ``wikidata_newly_fetched`` (fetch attempts) and ``wikidata_fetch_errors``.

    Args:
        qid_series: Iterable of QID strings / falsy placeholders.

    Returns:
        List of feature dicts, aligned with ``qid_series``.
    """
    global wikidata_cache_hits, wikidata_newly_fetched, wikidata_fetch_errors

    # Pass 1: count cache hits and collect the QIDs that still need a lookup.
    qids_to_fetch = []
    for qid in qid_series:
        if not qid:
            continue
        if qid in wikidata_cache:
            wikidata_cache_hits += 1
        else:
            qids_to_fetch.append(qid)

    # De-duplicate (keeping first-seen order) *before* reporting, so the
    # "unique" count in the message is actually the number of distinct QIDs.
    unique_qids_to_fetch = list(dict.fromkeys(qids_to_fetch))
    print(f"Identified {len(unique_qids_to_fetch)} unique QIDs needing fetch...")

    if unique_qids_to_fetch:
        if wikidata_client:
            print(f"Fetching data for {len(unique_qids_to_fetch)} unique QIDs...")
            start_fetch_time = time.time()
            for qid_to_fetch in unique_qids_to_fetch:
                get_wikidata_features(qid_to_fetch)
                # get_wikidata_features caches only successful lookups, so a
                # missing cache entry is the reliable failure signal (all-default
                # values can also belong to a genuinely empty entity).
                if qid_to_fetch not in wikidata_cache:
                    wikidata_fetch_errors += 1
                wikidata_newly_fetched += 1
            print(f"Fetching took {time.time() - start_fetch_time:.2f}s")
        else:
            print("Warning: Wikidata client not available. Wikidata features will be default/missing.")

    # Pass 2: assemble the per-row features from the (now updated) cache.
    final_features_list = []
    for qid in qid_series:
        if qid and qid in wikidata_cache:
            final_features_list.append(wikidata_cache[qid])
        else:
            final_features_list.append({'sitelink_count': 0, 'has_country_of_origin': 0, 'item_age': np.nan})

    return final_features_list


# Resolve Wikidata features for both splits; the validation call reuses
# whatever the training call added to the cache.
train_wiki_list = apply_wikidata_features(train_df['qid'])
val_wiki_list = apply_wikidata_features(val_df['qid'])

print(f"Wikidata processing summary: Cache Hits={wikidata_cache_hits}, Newly Fetched={wikidata_newly_fetched}, Fetch Errors={wikidata_fetch_errors}")

# Persist the updated cache (skipped when CACHE_FILE was reset to None after
# a failed load). A failed save is reported but does not stop the run.
if CACHE_FILE:
    try:
        with open(CACHE_FILE, 'wb') as f:
            pickle.dump(wikidata_cache, f)
        print(f"Wikidata cache saved/updated to {CACHE_FILE}, now {len(wikidata_cache)} entries.")
    except Exception as e:
        print(f"Error saving cache to {CACHE_FILE}: {e}")


# One row per item, columns prefixed with 'wd_'.
train_wiki_df = pd.json_normalize(train_wiki_list).add_prefix('wd_')
val_wiki_df = pd.json_normalize(val_wiki_list).add_prefix('wd_')

# Drop stale 'wd_' columns (from a previous run of this cell) before
# concatenating column-wise.
# NOTE(review): axis=1 concat aligns on the index; this assumes train_df/val_df
# still carry their default positional index — confirm.
train_df = pd.concat([train_df.drop(columns=[c for c in train_wiki_df.columns if c in train_df.columns], errors='ignore'), train_wiki_df], axis=1)
val_df = pd.concat([val_df.drop(columns=[c for c in val_wiki_df.columns if c in val_df.columns], errors='ignore'), val_wiki_df], axis=1)

print("Wikidata features merged into DataFrames.")
print("Sample Train DF columns:", train_df.columns.tolist())

# Safety net: make sure every expected Wikidata column exists, filling
# count/flag columns with 0 and the age column with NaN.
required_wd_cols = ['wd_sitelink_count', 'wd_has_country_of_origin', 'wd_item_age']
for df in [train_df, val_df]:
    for col in required_wd_cols:
        if col not in df.columns:
             print(f"Warning: Column '{col}' missing after load/fetch. Adding placeholder.")
             df[col] = 0 if 'count' in col or 'has' in col else np.nan

In [None]:
print("\nPreparing Feature Set 1: TF-IDF Baseline...")
# Unigram + bigram TF-IDF, capped at 5000 features, ignoring terms that
# occur in fewer than 3 documents. Fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=3)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['combined_text'])
X_val_tfidf = tfidf_vectorizer.transform(val_df['combined_text'])
print(f"TF-IDF Matrix Shape (Train): {X_train_tfidf.shape}")

In [None]:
print("\nPreparing Feature Set 2: NER + Lexicon...")
start_time = time.time()
# One feature mapping per row; each call runs the spaCy pipeline on the text.
train_features_list_nerlex = train_df['combined_text'].apply(extract_ner_lexicon_features).tolist()
val_features_list_nerlex = val_df['combined_text'].apply(extract_ner_lexicon_features).tolist()
print(f"NER/Lexicon feature extraction took {time.time() - start_time:.2f}s")

# Turn the per-row mappings into a sparse matrix. The feature vocabulary is
# learned from the training split; the validation split is only transformed.
vectorizer_nerlex = DictVectorizer(sparse=True)
X_train_nerlex = vectorizer_nerlex.fit_transform(train_features_list_nerlex)
X_val_nerlex = vectorizer_nerlex.transform(val_features_list_nerlex)
print(f"NER+Lexicon Matrix Shape (Train): {X_train_nerlex.shape}")

In [None]:
print("\nPreparing Feature Set 3: Hybrid Features...")

wikidata_numerical_cols = ['wd_sitelink_count', 'wd_item_age']
wikidata_binary_cols = ['wd_has_country_of_origin']

# Fill missing numeric Wikidata values with the training-split median.
# The imputed values are written back into the DataFrames.
imputer_numerical = SimpleImputer(strategy='median')
train_df[wikidata_numerical_cols] = imputer_numerical.fit_transform(train_df[wikidata_numerical_cols])
val_df[wikidata_numerical_cols] = imputer_numerical.transform(val_df[wikidata_numerical_cols])

# Missing binary flags default to 0.
imputer_binary = SimpleImputer(strategy='constant', fill_value=0)
train_df[wikidata_binary_cols] = imputer_binary.fit_transform(train_df[wikidata_binary_cols])
val_df[wikidata_binary_cols] = imputer_binary.transform(val_df[wikidata_binary_cols])

# Standardize the numeric Wikidata columns using training-split statistics.
scaler_wd = StandardScaler()
X_train_wd_numeric_scaled = scaler_wd.fit_transform(train_df[wikidata_numerical_cols])
X_val_wd_numeric_scaled = scaler_wd.transform(val_df[wikidata_numerical_cols])

X_train_wd_binary = train_df[wikidata_binary_cols].values
X_val_wd_binary = val_df[wikidata_binary_cols].values

# Column-wise concatenation: TF-IDF | NER+Lexicon | scaled Wikidata numerics
# | Wikidata binary flag. The same column order is used for both splits.
# NOTE(review): the NER+Lexicon block is stacked without any scaling.
X_train_hybrid = hstack([
    X_train_tfidf,
    X_train_nerlex,
    X_train_wd_numeric_scaled,
    X_train_wd_binary
], format='csr')

X_val_hybrid = hstack([
    X_val_tfidf,
    X_val_nerlex,
    X_val_wd_numeric_scaled,
    X_val_wd_binary
], format='csr')

print(f"Hybrid (TFIDF+NERLEX+WD) Matrix Shape (Train): {X_train_hybrid.shape}")


# Scorer used by the hyperparameter searches (macro-averaged F1).
f1_macro_scorer = make_scorer(f1_score, average='macro')

## Model

In [None]:
def train_evaluate_model(model, params, X_train_feat, y_train_labels, X_val_feat, y_val_labels, use_random_search=False, n_iter=20):
    """Tune (or plainly fit) ``model`` and report its validation performance.

    Args:
        model: Unfitted scikit-learn compatible estimator.
        params: Search space for the hyperparameter search; when falsy the
            estimator is fitted with its current parameters instead.
        X_train_feat, y_train_labels: Training features and labels.
        X_val_feat, y_val_labels: Validation features and labels.
        use_random_search: Use ``RandomizedSearchCV`` instead of
            ``GridSearchCV``.
        n_iter: Number of sampled settings for the randomized search.

    Returns:
        ``(fitted_model, validation_f1_macro)``. If prediction/evaluation
        fails, the fitted model is still returned with a score of ``0.0``.
    """
    # Local import: sklearn is already a dependency of this notebook.
    from sklearn.base import clone

    model_name = model.__class__.__name__
    print(f"\n--- Training and Tuning {model_name} ---")

    # The XGBoost estimator is configured with early stopping, which needs an
    # evaluation set on every fit call (search folds, fallback and plain fit).
    # NOTE(review): the validation split doubles as the early-stopping set, so
    # the validation scores reported below are optimistic for XGBoost.
    fit_params = {}
    if isinstance(model, xgb.XGBClassifier):
        fit_params['eval_set'] = [(X_val_feat, y_val_labels)]

    if params:
        print("Starting hyperparameter search...")
        start_time = time.time()
        if use_random_search:
            search = RandomizedSearchCV(model, params, n_iter=n_iter, cv=5, scoring=f1_macro_scorer,verbose=1, n_jobs=1, random_state=42)
        else:
            search = GridSearchCV(model, params, cv=5, scoring=f1_macro_scorer,verbose=1, n_jobs=1)

        try:
            search.fit(X_train_feat, y_train_labels, **fit_params)
            print(f"Tuning completed in {time.time() - start_time:.2f} seconds")
            print(f"Best parameters: {search.best_params_}")
            print(f"Best cross-validation F1-macro score: {search.best_score_:.4f}")
            best_model = search.best_estimator_
        except Exception as e:
            print(f"ERROR during {model_name} tuning: {e}")
            print("Falling back to default parameters.")
            # Fit a clone so the caller's template estimator stays unfitted and
            # models returned for different feature sets never share one object.
            best_model = clone(model)
            best_model.fit(X_train_feat, y_train_labels, **fit_params)
    else:
        print("Training with default parameters...")
        start_time = time.time()
        best_model = clone(model)
        best_model.fit(X_train_feat, y_train_labels, **fit_params)
        print(f"Training completed in {time.time() - start_time:.2f} seconds")

    print(f"\nValidation Performance for {model_name}:")
    try:
        y_pred = best_model.predict(X_val_feat)
        report = classification_report(y_val_labels, y_pred, target_names=target_names, zero_division=0)
        print(report)
        f1_val = f1_score(y_val_labels, y_pred, average='macro')
        acc_val = accuracy_score(y_val_labels, y_pred)
        print(f"Validation F1-macro: {f1_val:.4f}")
        print(f"Validation Accuracy: {acc_val:.4f}")
        return best_model, f1_val
    except Exception as e:
        print(f"ERROR during prediction/evaluation for {model_name}: {e}")
        # Return the fitted estimator, not the unfitted template.
        return best_model, 0.0

In [None]:
# Feature-set name -> (training matrix, validation matrix).
feature_sets = {
    "TF-IDF": (X_train_tfidf, X_val_tfidf),
    "NER+Lexicon": (X_train_nerlex, X_val_nerlex),
    "Hybrid (TFIDF+NERLEX+WD)": (X_train_hybrid, X_val_hybrid)
}

# Filled by the experiment loop: results[feature_set][model] -> {'model', 'f1_macro'}.
results = {}

# Model name -> (estimator, search space, use_random_search[, n_iter]).
# Grid search is used when the third element is False; the optional fourth
# element sets the number of randomized-search iterations.
models_to_try = {
    "LogisticRegression": (
        LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced', solver='liblinear'),
        {'C': [0.1, 1.0, 5.0, 10.0], 'penalty': ['l1', 'l2']}, False
    ),
    "LinearSVC": (
        LinearSVC(max_iter=2500, random_state=42, class_weight='balanced', dual=False),
        {'C': [0.01, 0.1, 1.0, 5.0, 10.0], 'penalty': ['l1', 'l2']}, False
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1),
        {'n_estimators': randint(100, 300), 'max_depth': [10, 20, 30, 40, None], 'min_samples_split': randint(2, 15), 'min_samples_leaf': randint(1, 10), 'bootstrap': [True, False]}, True, 25
    ),
    "XGBoost": (
        # `error_score` is an argument of GridSearchCV/RandomizedSearchCV, not of
        # XGBClassifier, so it is no longer passed to the estimator.
        # NOTE(review): device='cuda' expects a GPU runtime — confirm.
        xgb.XGBClassifier(random_state=42, tree_method='hist', objective='multi:softmax', num_class=len(target_names), eval_metric='mlogloss', early_stopping_rounds=10, use_label_encoder=False, device='cuda'),
        {'n_estimators': randint(150, 500), 'max_depth': randint(4, 12), 'learning_rate': uniform(0.01, 0.2), 'subsample': uniform(0.6, 0.4), 'colsample_bytree': uniform(0.6, 0.4), 'gamma': uniform(0, 0.5), 'min_child_weight': randint(1, 8)}, True, 25
    ),
}

In [None]:
# Train and evaluate every model on every feature set, recording the fitted
# model and its validation macro-F1 under results[feature_set][model_name].
for fs_name, (X_train_fs, X_val_fs) in feature_sets.items():
    print(f"\n{'='*25} Experimenting with Feature Set: {fs_name} {'='*25}")
    results[fs_name] = {}

    # Each entry is (estimator, params, use_random[, n_iter]); the starred
    # target captures the optional iteration count.
    for model_name, (model_instance, params, use_random, *n_iter_val) in models_to_try.items():
        fit_params = {}
        X_train_model = X_train_fs
        X_val_model = X_val_fs

        # Default to 20 iterations when the entry does not specify one.
        n_iter = n_iter_val[0] if n_iter_val else 20
        best_model_tuned, f1_val_score = train_evaluate_model(
            model_instance, params, X_train_model, y_train, X_val_model, y_val,
            use_random_search=use_random, n_iter=n_iter
        )
        results[fs_name][model_name] = {'model': best_model_tuned, 'f1_macro': f1_val_score}


print(f"\n{'='*30} Final Results Summary {'='*30}")
# Rows = feature sets, columns = models, values = validation macro-F1
# (0.0 where a result is missing).
summary_df = pd.DataFrame({
    fs: {model: results[fs].get(model, {}).get('f1_macro', 0.0) for model in models_to_try.keys()}
    for fs in results.keys()
}).T

### Model Summary and Saving

In [None]:
summary_df = summary_df.fillna(0.0)
# Order the model columns by their best score across feature sets, best first.
summary_df = summary_df[summary_df.max().sort_values(ascending=False).index]
print(summary_df.round(4))

# Pick the (feature set, model) pair with the highest validation macro-F1 and
# persist that model. A score of 0 everywhere means nothing usable was trained.
best_overall_f1 = summary_df.max().max()
if best_overall_f1 > 0:
    best_combination = summary_df.stack().idxmax()
    best_fs, best_model_name = best_combination
    print(f"\nBest combination: Feature Set='{best_fs}', Model='{best_model_name}' with F1-macro = {best_overall_f1:.4f}")

    final_model = results[best_fs][best_model_name]['model']
    # NOTE(review): best_fs is used verbatim, so the file name may contain
    # spaces, parentheses and '+'.
    filename = f'best_cultural_classifier_NOtransformer_{best_fs}_{best_model_name}.joblib'
    save_path = os.path.join(models_path, filename)
    try:
        joblib.dump(final_model, save_path)
        print(f"Saved best model to {save_path}")
    except Exception as e:
        print(f"Error saving model: {e}")
else:
    print("\nNo valid results found.")

In [None]:
def save_models_and_print_data(results):
    """Dump every trained model to ``models_path`` and print a one-line report.

    Args:
        results: Nested mapping ``{feature_set: {model_name: {'model': ...,
            'f1_macro': ...}}}``.

    Each model is written to ``<model_name>_<feature_set>.joblib``, where the
    feature-set name has spaces and '+' replaced by '_' and parentheses
    removed. A failure while saving one model is printed and does not stop
    the remaining saves.
    """
    # ' ' and '+' -> '_'; '(' and ')' are deleted.
    filename_safe = str.maketrans({' ': '_', '+': '_', '(': None, ')': None})

    for feature_set, models in results.items():
        for model_name, data in models.items():
            model, f1_macro = data['model'], data['f1_macro']
            fs_clean = feature_set.translate(filename_safe)
            save_path = os.path.join(models_path, f'{model_name}_{fs_clean}.joblib')
            try:
                joblib.dump(model, save_path)
                print(f"Model: {model_name}, Feature Set: {feature_set}, F1-macro = {f1_macro:.4f}, Saved to: {save_path}")
            except Exception as e:
                print(f"Error saving {model_name} for {feature_set}: {e}")

# Persist every (feature set, model) pair collected in `results`.
print("\n--- Saving All Trained Models ---")
save_models_and_print_data(results)

### Post-Training Validation

The following code runs the XGBoost model trained on the hybrid feature set, since it is our model of choice given its performance. The other combinations can be tested by changing the feature-set and model names used to index `results`.

In [None]:
# Model used for the predictions below; taken from the in-memory `results`
# of this session rather than read back from disk.
loaded_model = results["Hybrid (TFIDF+NERLEX+WD)"]["XGBoost"]['model']


In [None]:
print("\n--- Starting Prediction on Unlabeled Dataset ---")

unlabeled_file_path = os.path.join(data_path, 'test_unlabeled.csv')
output_file_path = os.path.join(results_path, 'test_unlabeled_predictions_NOtransformer.csv')

# Names of the fitted objects this cell relies on; they are produced by the
# earlier training/preprocessing cells.
required_objects = [
    'loaded_model', 'tfidf_vectorizer', 'vectorizer_nerlex',
    'imputer_numerical', 'scaler_wd', 'imputer_binary',
    'nlp', 'wikidata_client', 'wikidata_cache', 'inverse_label_map'
]
# Fail fast with a clear message if any prerequisite is undefined.
# This runs at module (cell) level, where locals() is the global namespace.
for obj_name in required_objects:
    if obj_name not in locals():
        print(f"ERROR: Required object '{obj_name}' not found in the current environment.")
        print("Please ensure the previous cells (data loading, preprocessing, model training/loading) have been run successfully.")
        raise NameError(f"Object '{obj_name}' is not defined.")

print(f"Input unlabeled data: {unlabeled_file_path}")
print(f"Output prediction file: {output_file_path}")

# Load the unlabeled CSV; any failure is reported and then re-raised because
# nothing below can run without the data.
try:
    test_df = pd.read_csv(unlabeled_file_path)
    print(f"Successfully loaded unlabeled data. Shape: {test_df.shape}")
    print("Sample of unlabeled data:")
    print(test_df.head())
    print("Columns:", test_df.columns.tolist())
except FileNotFoundError:
    print(f"ERROR: Unlabeled data file not found at {unlabeled_file_path}")
    raise
except Exception as e:
    print(f"ERROR: Failed to load unlabeled data. {e}")
    raise


# Step 1: build 'combined_text' with the same function used for training.
# Text columns absent from the CSV are added as empty strings first.
print("\n1. Combining text fields...")
if 'combined_text' not in test_df.columns:
    expected_text_cols = ['name', 'description', 'type', 'category', 'subcategory']
    for col in expected_text_cols:
        if col not in test_df.columns:
            print(f"Warning: Column '{col}' not found in unlabeled data. Filling with empty strings.")
            test_df[col] = ''
    test_df['combined_text'] = test_df.apply(combine_text, axis=1)
else:
    print("'combined_text' column already exists.")

# Step 2: derive QIDs from the 'item' URL column, which is mandatory.
print("\n2. Extracting Wikidata QIDs...")
if 'item' not in test_df.columns:
     print("ERROR: 'item' column (containing Wikidata URLs) is missing from the unlabeled data.")
     raise KeyError("'item' column not found.")
if 'qid' not in test_df.columns:
    test_df['qid'] = test_df['item'].apply(get_qid_from_url)
else:
    print("'qid' column already exists.")


# Step 3: counters for the test-set Wikidata lookup, kept separate from the
# train/validation counters.
print("\n3. Fetching/Applying Wikidata features...")
wikidata_cache_hits_test = 0
wikidata_newly_fetched_test = 0
wikidata_fetch_errors_test = 0

def apply_wikidata_features_test(qid_series):
    """Build one Wikidata feature dict per entry of ``qid_series``.

    Every returned dict carries the keys ``sitelink_count``,
    ``has_country_of_origin`` and ``item_age``. For each QID the lookup order
    is: the shared ``wikidata_cache``, then the result ``get_wikidata_features``
    returned during this call, then a placeholder (``0``, ``0``, ``NaN``).
    Missing QIDs (``None``, ``NaN``, empty string) always get the placeholder
    and are never fetched.

    Side effects:
        * Increments the module-level counters ``wikidata_cache_hits_test``,
          ``wikidata_newly_fetched_test`` and ``wikidata_fetch_errors_test``.
        * Adds freshly fetched, non-placeholder results to ``wikidata_cache``
          (only if the key is not already present).
        * Pickles ``wikidata_cache`` to ``CACHE_FILE`` when ``CACHE_FILE`` is
          truthy and the newly-fetched counter is positive; a failed save is
          printed, not raised.

    Args:
        qid_series: Iterable of QIDs (e.g. a pandas Series), one per row.

    Returns:
        list[dict]: Feature dicts in the same order and length as the input.
    """
    global wikidata_cache_hits_test, wikidata_newly_fetched_test, wikidata_fetch_errors_test, wikidata_cache

    def _default_features():
        # A fresh dict per use so rows never share one mutable placeholder.
        return {'sitelink_count': 0, 'has_country_of_origin': 0, 'item_age': np.nan}

    def _is_missing(qid):
        # NaN is truthy, so it needs an explicit check; it shows up when a
        # pre-existing 'qid' column with empty cells is read from CSV.
        return qid is None or (isinstance(qid, float) and qid != qid) or not qid

    def _is_placeholder(features):
        # Same heuristic as before for "the fetch produced nothing useful";
        # pd.isna also tolerates None where np.isnan would raise.
        return (
            features['sitelink_count'] == 0
            and features['has_country_of_origin'] == 0
            and pd.isna(features['item_age'])
        )

    qids = list(qid_series)

    # Pass 1: count cache hits and collect each uncached QID once, in
    # first-seen order so the fetch order is deterministic.
    pending = []
    queued = set()
    for qid in qids:
        if _is_missing(qid):
            continue
        if qid in wikidata_cache:
            wikidata_cache_hits_test += 1
        elif qid not in queued:
            queued.add(qid)
            pending.append(qid)

    # Results obtained during this call. Kept locally so that a fetched value
    # is used even if get_wikidata_features (defined elsewhere) does not write
    # to wikidata_cache itself.
    fetched = {}
    if pending:
        print(f"Identified {len(pending)} unique QIDs needing fetch for the test set...")
        if wikidata_client:
            start_fetch_time = time.time()
            for qid in pending:
                if qid in wikidata_cache:
                    continue
                fetched_data = get_wikidata_features(qid)
                wikidata_newly_fetched_test += 1
                if _is_placeholder(fetched_data):
                    wikidata_fetch_errors_test += 1
                else:
                    # Only real data is cached; placeholder results stay
                    # local so a later run can retry the QID.
                    wikidata_cache.setdefault(qid, fetched_data)
                fetched[qid] = fetched_data

            print(f"Fetching/Cache check took {time.time() - start_fetch_time:.2f}s")
        else:
            print("Warning: Wikidata client not available. Wikidata features will be default/missing.")
            # Defaults are kept out of wikidata_cache so they are not later
            # mistaken for (or persisted as) genuine Wikidata values.
            fetched = {qid: _default_features() for qid in pending}

    # Pass 2: assemble the per-row features in input order.
    final_features_list = []
    for qid in qids:
        if _is_missing(qid):
            final_features_list.append(_default_features())
        elif qid in wikidata_cache:
            final_features_list.append(wikidata_cache[qid])
        elif qid in fetched:
            final_features_list.append(fetched[qid])
        else:
            final_features_list.append(_default_features())

    print(f"Test Wikidata summary: Cache Hits={wikidata_cache_hits_test}, Newly Fetched={wikidata_newly_fetched_test}, Fetch Errors={wikidata_fetch_errors_test}")
    if CACHE_FILE and wikidata_newly_fetched_test > 0:
        try:
            with open(CACHE_FILE, 'wb') as f:
                pickle.dump(wikidata_cache, f)
            print(f"Wikidata cache updated and saved to {CACHE_FILE}, now {len(wikidata_cache)} entries.")
        except Exception as e:
            # Best effort: a failed cache save must not abort prediction.
            print(f"Error saving updated cache: {e}")

    return final_features_list

# One feature dict per row, flattened into wd_-prefixed columns.
test_wiki_list = apply_wikidata_features_test(test_df['qid'])
test_wiki_df = pd.json_normalize(test_wiki_list).add_prefix('wd_')

required_wd_cols = ['wd_sitelink_count', 'wd_has_country_of_origin', 'wd_item_age']
for wd_col in required_wd_cols:
    if wd_col in test_wiki_df.columns:
        continue
    print(f"Warning: Column '{wd_col}' missing after WD fetch for test set. Adding placeholder.")
    # Count/flag columns fall back to 0, everything else to NaN.
    is_count_or_flag = ('count' in wd_col) or ('has' in wd_col)
    test_wiki_df[wd_col] = 0 if is_count_or_flag else np.nan

# Both frames get a fresh 0..n-1 index so the column-wise concat lines up by position.
items_part = test_df.reset_index(drop=True)
wikidata_part = test_wiki_df.reset_index(drop=True)
test_df = pd.concat([items_part, wikidata_part], axis=1)
print("Wikidata features merged into test DataFrame.")

print("\n4. Imputing and Scaling Wikidata features...")
wikidata_numerical_cols = ['wd_sitelink_count', 'wd_item_age']
wikidata_binary_cols = ['wd_has_country_of_origin']

try:
    # Refuse to continue if any expected Wikidata column is absent.
    for kind, expected_cols in (('numerical', wikidata_numerical_cols), ('binary', wikidata_binary_cols)):
        absent_cols = []
        for wd_name in expected_cols:
            if wd_name not in test_df.columns:
                absent_cols.append(wd_name)
        if absent_cols:
            raise ValueError(f"Missing {kind} WD columns for imputation: {absent_cols}")

    # Apply the already-fitted imputers and scaler (transform only).
    numeric_imputed = imputer_numerical.transform(test_df[wikidata_numerical_cols])
    test_df[wikidata_numerical_cols] = numeric_imputed
    binary_imputed = imputer_binary.transform(test_df[wikidata_binary_cols])
    test_df[wikidata_binary_cols] = binary_imputed

    X_test_wd_numeric_scaled = scaler_wd.transform(test_df[wikidata_numerical_cols])
    X_test_wd_binary = test_df[wikidata_binary_cols].values
    print("WD features imputed and scaled using fitted transformers.")
except Exception as transform_error:
    print(f"ERROR during WD feature transformation for test set: {transform_error}")
    raise

print("\n5. Extracting NER+Lexicon features...")
start_time = time.time()
nerlex_feature_series = test_df['combined_text'].apply(extract_ner_lexicon_features)
test_features_list_nerlex = nerlex_feature_series.tolist()
nerlex_elapsed = time.time() - start_time
print(f"NER/Lexicon feature extraction for test set took {nerlex_elapsed:.2f}s")

# Vectorize with the already-fitted NER/lexicon vectorizer (transform only).
try:
    X_test_nerlex = vectorizer_nerlex.transform(test_features_list_nerlex)
except Exception as nerlex_error:
    print(f"ERROR transforming NER/Lexicon features for test set: {nerlex_error}")
    raise
else:
    print(f"Test NER+Lexicon Matrix Shape: {X_test_nerlex.shape}")

print("\n6. Transforming text with TF-IDF vectorizer...")
try:
    X_test_tfidf = tfidf_vectorizer.transform(test_df['combined_text'])
except Exception as tfidf_error:
    print(f"ERROR transforming TF-IDF features for test set: {tfidf_error}")
    raise
else:
    print(f"Test TF-IDF Matrix Shape: {X_test_tfidf.shape}")

print("\n7. Combining all features for the test set...")
# Column order of the stacked matrix: TF-IDF, NER/lexicon, scaled numeric WD, binary WD.
feature_parts = [X_test_tfidf, X_test_nerlex, X_test_wd_numeric_scaled, X_test_wd_binary]
try:
    X_test_hybrid = hstack(feature_parts, format='csr')
except Exception as stack_error:
    print(f"Error combining features with hstack for test set: {stack_error}")
    raise
else:
    print(f"Final Test Hybrid Feature Matrix Shape: {X_test_hybrid.shape}")

print("\n8. Making predictions on the test set...")
try:
    test_predictions_ids = loaded_model.predict(X_test_hybrid)
except Exception as predict_error:
    print(f"ERROR during prediction on test set: {predict_error}")
    raise
else:
    print("Predictions completed.")

print("\n9. Adding predictions to the DataFrame...")
try:
    test_df['predicted_label_id'] = test_predictions_ids
    predicted_ids = test_df['predicted_label_id']
    # Translate numeric class ids to string labels; ids absent from the map become NaN.
    test_df['label'] = predicted_ids.map(inverse_label_map)

    unmapped_mask = test_df['label'].isnull()
    if unmapped_mask.any():
        print("Warning: Some predicted label IDs could not be mapped back to string labels.")
        print("Unique predicted IDs:", predicted_ids.unique())
        print("Inverse label map:", inverse_label_map)

    print("Sample of test data with predictions:")
    preview_cols = ['item', 'name', 'predicted_label_id', 'label']
    print(test_df[preview_cols].head())

except Exception as mapping_error:
    print(f"ERROR adding predictions to DataFrame: {mapping_error}")
    raise

print(f"\n10. Saving predictions to {output_file_path}...")
try:
    # Only the header is needed to recover the input's column order, so the
    # data rows are not parsed a second time.
    original_cols_df = pd.read_csv(unlabeled_file_path, nrows=0)
    # dict.fromkeys keeps first-seen order and drops repeats, so 'label' is
    # written exactly once even when the input file already had that column.
    output_columns = list(dict.fromkeys([*original_cols_df.columns, 'label']))

    output_columns_present = [col for col in output_columns if col in test_df.columns]
    missing_output_cols = [col for col in output_columns if col not in test_df.columns]
    if missing_output_cols:
        print(f"Warning: The following requested output columns are missing and will not be saved: {missing_output_cols}")

    test_df.to_csv(output_file_path, columns=output_columns_present, index=False)
    print("Predictions saved successfully.")
except Exception as e:
    print(f"ERROR saving prediction file: {e}")
    raise

print("\n--- Prediction on Unlabeled Dataset Finished ---")

## Standalone Evaluation

In [None]:
print("\n--- Starting Validation Loop for a specific XGBoost Hybrid Model ---")

model_load_path = os.path.join(models_path, "XGBoost_Hybrid.joblib")

print(f"Attempting to load model from: {model_load_path}")
# Any failure, including the explicit missing-file error, is reported and re-raised.
try:
    if os.path.exists(model_load_path):
        loaded_model = joblib.load(model_load_path)
        print(f"Model loaded successfully. Model Type: {type(loaded_model)}")
    else:
        print(f"ERROR: Model file not found at {model_load_path}. Please ensure the file exists at this exact path in your Google Drive.")
        raise FileNotFoundError(f"Model file not found: {model_load_path}")

except Exception as load_error:
    print(f"ERROR: Failed to load the model. {load_error}")
    raise

In [None]:
print("\n--- Starting Prediction on Unlabeled Dataset ---")

unlabeled_file_path = os.path.join(data_path, 'test_unlabeled.csv')
output_file_path = os.path.join(results_path, 'test_unlabeled_predictions_NOtransformer.csv')

# Names that earlier cells must have defined before prediction can run.
required_objects = [
    'loaded_model',
    'tfidf_vectorizer',
    'vectorizer_nerlex',
    'imputer_numerical',
    'scaler_wd',
    'imputer_binary',
    'nlp',
    'wikidata_client',
    'wikidata_cache',
    'inverse_label_map',
]
available_names = locals()  # at cell level this is the notebook's global namespace
for required_name in required_objects:
    if required_name in available_names:
        continue
    print(f"ERROR: Required object '{required_name}' not found in the current environment.")
    print("Please ensure the previous cells (data loading, preprocessing, model training/loading) have been run successfully.")
    raise NameError(f"Object '{required_name}' is not defined.")

print(f"Using model loaded from: {model_load_path}")
print(f"Input unlabeled data: {unlabeled_file_path}")
print(f"Output prediction file: {output_file_path}")

# Load the unlabeled items; failures are reported and then re-raised.
try:
    test_df = pd.read_csv(unlabeled_file_path)
except FileNotFoundError:
    print(f"ERROR: Unlabeled data file not found at {unlabeled_file_path}")
    raise
except Exception as load_error:
    print(f"ERROR: Failed to load unlabeled data. {load_error}")
    raise
else:
    print(f"Successfully loaded unlabeled data. Shape: {test_df.shape}")
    print("Sample of unlabeled data:")
    print(test_df.head())
    print("Columns:", test_df.columns.tolist())


print("\n1. Combining text fields...")
if 'combined_text' in test_df.columns:
    print("'combined_text' column already exists.")
else:
    expected_text_cols = ['name', 'description', 'type', 'category', 'subcategory']
    # Any absent text column is created empty before combine_text is applied row-wise.
    for text_col in expected_text_cols:
        if text_col in test_df.columns:
            continue
        print(f"Warning: Column '{text_col}' not found in unlabeled data. Filling with empty strings.")
        test_df[text_col] = ''
    test_df['combined_text'] = test_df.apply(combine_text, axis=1)

print("\n2. Extracting Wikidata QIDs...")
has_item_column = 'item' in test_df.columns
if not has_item_column:
    print("ERROR: 'item' column (containing Wikidata URLs) is missing from the unlabeled data.")
    raise KeyError("'item' column not found.")
if 'qid' in test_df.columns:
    print("'qid' column already exists.")
else:
    test_df['qid'] = test_df['item'].apply(get_qid_from_url)


print("\n3. Fetching/Applying Wikidata features...")
# Counters updated by apply_wikidata_features_test via `global`.
wikidata_cache_hits_test = wikidata_newly_fetched_test = wikidata_fetch_errors_test = 0

def apply_wikidata_features_test(qid_series):
    """Build one Wikidata feature dict per entry of ``qid_series``.

    Every returned dict carries the keys ``sitelink_count``,
    ``has_country_of_origin`` and ``item_age``. For each QID the lookup order
    is: the shared ``wikidata_cache``, then the result ``get_wikidata_features``
    returned during this call, then a placeholder (``0``, ``0``, ``NaN``).
    Missing QIDs (``None``, ``NaN``, empty string) always get the placeholder
    and are never fetched.

    Side effects:
        * Increments the module-level counters ``wikidata_cache_hits_test``,
          ``wikidata_newly_fetched_test`` and ``wikidata_fetch_errors_test``.
        * Adds freshly fetched, non-placeholder results to ``wikidata_cache``
          (only if the key is not already present).
        * Pickles ``wikidata_cache`` to ``CACHE_FILE`` when ``CACHE_FILE`` is
          truthy and the newly-fetched counter is positive; a failed save is
          printed, not raised.

    Args:
        qid_series: Iterable of QIDs (e.g. a pandas Series), one per row.

    Returns:
        list[dict]: Feature dicts in the same order and length as the input.
    """
    global wikidata_cache_hits_test, wikidata_newly_fetched_test, wikidata_fetch_errors_test, wikidata_cache

    def _default_features():
        # A fresh dict per use so rows never share one mutable placeholder.
        return {'sitelink_count': 0, 'has_country_of_origin': 0, 'item_age': np.nan}

    def _is_missing(qid):
        # NaN is truthy, so it needs an explicit check; it shows up when a
        # pre-existing 'qid' column with empty cells is read from CSV.
        return qid is None or (isinstance(qid, float) and qid != qid) or not qid

    def _is_placeholder(features):
        # Same heuristic as before for "the fetch produced nothing useful";
        # pd.isna also tolerates None where np.isnan would raise.
        return (
            features['sitelink_count'] == 0
            and features['has_country_of_origin'] == 0
            and pd.isna(features['item_age'])
        )

    qids = list(qid_series)

    # Pass 1: count cache hits and collect each uncached QID once, in
    # first-seen order so the fetch order is deterministic.
    pending = []
    queued = set()
    for qid in qids:
        if _is_missing(qid):
            continue
        if qid in wikidata_cache:
            wikidata_cache_hits_test += 1
        elif qid not in queued:
            queued.add(qid)
            pending.append(qid)

    # Results obtained during this call. Kept locally so that a fetched value
    # is used even if get_wikidata_features (defined elsewhere) does not write
    # to wikidata_cache itself.
    fetched = {}
    if pending:
        print(f"Identified {len(pending)} unique QIDs needing fetch for the test set...")
        if wikidata_client:
            start_fetch_time = time.time()
            for qid in pending:
                if qid in wikidata_cache:
                    continue
                fetched_data = get_wikidata_features(qid)
                wikidata_newly_fetched_test += 1
                if _is_placeholder(fetched_data):
                    wikidata_fetch_errors_test += 1
                else:
                    # Only real data is cached; placeholder results stay
                    # local so a later run can retry the QID.
                    wikidata_cache.setdefault(qid, fetched_data)
                fetched[qid] = fetched_data

            print(f"Fetching/Cache check took {time.time() - start_fetch_time:.2f}s")
        else:
            print("Warning: Wikidata client not available. Wikidata features will be default/missing.")
            # Defaults are kept out of wikidata_cache so they are not later
            # mistaken for (or persisted as) genuine Wikidata values.
            fetched = {qid: _default_features() for qid in pending}

    # Pass 2: assemble the per-row features in input order.
    final_features_list = []
    for qid in qids:
        if _is_missing(qid):
            final_features_list.append(_default_features())
        elif qid in wikidata_cache:
            final_features_list.append(wikidata_cache[qid])
        elif qid in fetched:
            final_features_list.append(fetched[qid])
        else:
            final_features_list.append(_default_features())

    print(f"Test Wikidata summary: Cache Hits={wikidata_cache_hits_test}, Newly Fetched={wikidata_newly_fetched_test}, Fetch Errors={wikidata_fetch_errors_test}")
    if CACHE_FILE and wikidata_newly_fetched_test > 0:
        try:
            with open(CACHE_FILE, 'wb') as f:
                pickle.dump(wikidata_cache, f)
            print(f"Wikidata cache updated and saved to {CACHE_FILE}, now {len(wikidata_cache)} entries.")
        except Exception as e:
            # Best effort: a failed cache save must not abort prediction.
            print(f"Error saving updated cache: {e}")

    return final_features_list

# One feature dict per row, flattened into wd_-prefixed columns.
test_wiki_list = apply_wikidata_features_test(test_df['qid'])
test_wiki_df = pd.json_normalize(test_wiki_list).add_prefix('wd_')

required_wd_cols = ['wd_sitelink_count', 'wd_has_country_of_origin', 'wd_item_age']
for wd_col in required_wd_cols:
    if wd_col in test_wiki_df.columns:
        continue
    print(f"Warning: Column '{wd_col}' missing after WD fetch for test set. Adding placeholder.")
    # Count/flag columns fall back to 0, everything else to NaN.
    is_count_or_flag = ('count' in wd_col) or ('has' in wd_col)
    test_wiki_df[wd_col] = 0 if is_count_or_flag else np.nan

# Both frames get a fresh 0..n-1 index so the column-wise concat lines up by position.
items_part = test_df.reset_index(drop=True)
wikidata_part = test_wiki_df.reset_index(drop=True)
test_df = pd.concat([items_part, wikidata_part], axis=1)
print("Wikidata features merged into test DataFrame.")

print("\n4. Imputing and Scaling Wikidata features...")
wikidata_numerical_cols = ['wd_sitelink_count', 'wd_item_age']
wikidata_binary_cols = ['wd_has_country_of_origin']

try:
    # Refuse to continue if any expected Wikidata column is absent.
    for kind, expected_cols in (('numerical', wikidata_numerical_cols), ('binary', wikidata_binary_cols)):
        absent_cols = []
        for wd_name in expected_cols:
            if wd_name not in test_df.columns:
                absent_cols.append(wd_name)
        if absent_cols:
            raise ValueError(f"Missing {kind} WD columns for imputation: {absent_cols}")

    # Apply the already-fitted imputers and scaler (transform only).
    numeric_imputed = imputer_numerical.transform(test_df[wikidata_numerical_cols])
    test_df[wikidata_numerical_cols] = numeric_imputed
    binary_imputed = imputer_binary.transform(test_df[wikidata_binary_cols])
    test_df[wikidata_binary_cols] = binary_imputed

    X_test_wd_numeric_scaled = scaler_wd.transform(test_df[wikidata_numerical_cols])
    X_test_wd_binary = test_df[wikidata_binary_cols].values
    print("WD features imputed and scaled using fitted transformers.")
except Exception as transform_error:
    print(f"ERROR during WD feature transformation for test set: {transform_error}")
    raise

print("\n5. Extracting NER+Lexicon features...")
start_time = time.time()
nerlex_feature_series = test_df['combined_text'].apply(extract_ner_lexicon_features)
test_features_list_nerlex = nerlex_feature_series.tolist()
nerlex_elapsed = time.time() - start_time
print(f"NER/Lexicon feature extraction for test set took {nerlex_elapsed:.2f}s")

# Vectorize with the already-fitted NER/lexicon vectorizer (transform only).
try:
    X_test_nerlex = vectorizer_nerlex.transform(test_features_list_nerlex)
except Exception as nerlex_error:
    print(f"ERROR transforming NER/Lexicon features for test set: {nerlex_error}")
    raise
else:
    print(f"Test NER+Lexicon Matrix Shape: {X_test_nerlex.shape}")

print("\n6. Transforming text with TF-IDF vectorizer...")
try:
    X_test_tfidf = tfidf_vectorizer.transform(test_df['combined_text'])
except Exception as tfidf_error:
    print(f"ERROR transforming TF-IDF features for test set: {tfidf_error}")
    raise
else:
    print(f"Test TF-IDF Matrix Shape: {X_test_tfidf.shape}")

print("\n7. Combining all features for the test set...")
# Column order of the stacked matrix: TF-IDF, NER/lexicon, scaled numeric WD, binary WD.
feature_parts = [X_test_tfidf, X_test_nerlex, X_test_wd_numeric_scaled, X_test_wd_binary]
try:
    X_test_hybrid = hstack(feature_parts, format='csr')
except Exception as stack_error:
    print(f"Error combining features with hstack for test set: {stack_error}")
    raise
else:
    print(f"Final Test Hybrid Feature Matrix Shape: {X_test_hybrid.shape}")

print("\n8. Making predictions on the test set...")
try:
    test_predictions_ids = loaded_model.predict(X_test_hybrid)
except Exception as predict_error:
    print(f"ERROR during prediction on test set: {predict_error}")
    raise
else:
    print("Predictions completed.")

print("\n9. Adding predictions to the DataFrame...")
try:
    test_df['predicted_label_id'] = test_predictions_ids
    predicted_ids = test_df['predicted_label_id']
    # Translate numeric class ids to string labels; ids absent from the map become NaN.
    test_df['label'] = predicted_ids.map(inverse_label_map)

    unmapped_mask = test_df['label'].isnull()
    if unmapped_mask.any():
        print("Warning: Some predicted label IDs could not be mapped back to string labels.")
        print("Unique predicted IDs:", predicted_ids.unique())
        print("Inverse label map:", inverse_label_map)

    print("Sample of test data with predictions:")
    preview_cols = ['item', 'name', 'predicted_label_id', 'label']
    print(test_df[preview_cols].head())

except Exception as mapping_error:
    print(f"ERROR adding predictions to DataFrame: {mapping_error}")
    raise

print(f"\n10. Saving predictions to {output_file_path}...")
try:
    # Only the header is needed to recover the input's column order, so the
    # data rows are not parsed a second time.
    original_cols_df = pd.read_csv(unlabeled_file_path, nrows=0)
    # dict.fromkeys keeps first-seen order and drops repeats, so 'label' is
    # written exactly once even when the input file already had that column.
    output_columns = list(dict.fromkeys([*original_cols_df.columns, 'label']))

    output_columns_present = [col for col in output_columns if col in test_df.columns]
    missing_output_cols = [col for col in output_columns if col not in test_df.columns]
    if missing_output_cols:
        print(f"Warning: The following requested output columns are missing and will not be saved: {missing_output_cols}")

    test_df.to_csv(output_file_path, columns=output_columns_present, index=False)
    print("Predictions saved successfully.")
except Exception as e:
    print(f"ERROR saving prediction file: {e}")
    raise

print("\n--- Prediction on Unlabeled Dataset Finished ---")

## Additional Blocks

### Plotting

In [None]:
predictions_file_path = output_file_path
# The prediction cell saves the predicted class under the column name 'label'
# (original input columns + 'label'), so that is the column analysed here.
label_column = 'label'
original_columns_to_analyze = ['type', 'category', 'subcategory']


print(f"--- Analyzing Predictions from: {predictions_file_path} ---")
# Failures raise instead of calling exit(): in a notebook exit() ends the
# kernel session rather than just stopping this cell.
try:
    df_predictions = pd.read_csv(predictions_file_path)
    print(f"Successfully loaded predictions file. Shape: {df_predictions.shape}")
except FileNotFoundError:
    print(f"ERROR: Predictions file not found at {predictions_file_path}")
    print("Please ensure the previous prediction script ran successfully and the file exists.")
    raise
except Exception as e:
    print(f"ERROR: Failed to load predictions file. {e}")
    raise

print("\n--- Basic File Information ---")
print(f"Total number of predictions: {len(df_predictions)}")
print(f"Columns found: {df_predictions.columns.tolist()}")

if label_column not in df_predictions.columns:
    print(f"ERROR: The expected prediction column '{label_column}' was not found in the file.")
    print("Please check the column names in the CSV and update the 'label_column' variable if needed.")
    raise KeyError(f"Prediction column '{label_column}' not found in {predictions_file_path}.")

# Report how many rows carry no predicted label at all.
missing_predictions = df_predictions[label_column].isnull().sum()
if missing_predictions > 0:
    print(f"\nWARNING: Found {missing_predictions} rows with missing predicted labels.")
else:
    print("\nNo missing values found in the predicted label column.")

print(f"\n--- Distribution of Predicted Labels ({label_column}) ---")
predicted_labels = df_predictions[label_column]
label_counts = predicted_labels.value_counts()
label_percentages = predicted_labels.value_counts(normalize=True) * 100

# Absolute and relative frequency side by side, one row per predicted label.
distribution_df = pd.DataFrame(
    {'Count': label_counts, 'Percentage': label_percentages.round(2)}
)

print(distribution_df)

print("\n--- Visualizing Label Distribution ---")
plt.figure(figsize=(10, 6))
sns.countplot(data=df_predictions, y=label_column, order=label_counts.index, palette='viridis')
plt.title('Distribution of Predicted Labels (No Transformer Model)')
plt.xlabel('Number of Items')
plt.ylabel('Predicted Label')
# Annotate every bar with its count and share; bars follow label_counts order.
for bar_pos, (count, share) in enumerate(zip(label_counts, label_percentages)):
    plt.text(count, bar_pos, f' {count} ({share:.1f}%)', va='center')
plt.tight_layout()
dist_plot_path = os.path.join(plot_save_dir, 'predicted_label_distribution.png')
plt.savefig(dist_plot_path, bbox_inches='tight')
print(f"Saved label distribution plot to {dist_plot_path}")
plt.close()


print(f"\n--- Sample Predictions per Label ({label_column}) ---")
# Candidate columns in display order; only those present in the file are kept.
display_cols = [
    candidate
    for candidate in ('item', 'name', 'description', label_column)
    if candidate in df_predictions.columns
]

if len(display_cols) <= 1:
    print("Skipping sample display as 'name' or 'description' columns are not found.")
else:
    try:
        # First three rows of every predicted-label group.
        grouped = df_predictions.groupby(label_column)
        sample_predictions = grouped[display_cols].head(3)
        print(sample_predictions.to_string())
    except KeyError as missing_key:
        print(f"Warning: Could not display samples because column {missing_key} is missing.")
    except Exception as sample_error:
        print(f"An error occurred while trying to display samples: {sample_error}")


print(f"\n--- Cross-tabulation with Original Features (if available) ---")
analyzed_cross_tabs = False
for original_col in original_columns_to_analyze:
    if original_col not in df_predictions.columns:
        print(f"Column '{original_col}' not found in the predictions file, skipping crosstab.")
        continue

    # Only columns with 2..50 distinct values are cross-tabulated.
    unique_values = df_predictions[original_col].nunique()
    if unique_values > 50:
        print(f"Skipping crosstab for '{original_col}' (Too many unique values: {unique_values})")
        continue
    if unique_values < 2:
        print(f"Skipping crosstab for '{original_col}' (Not enough unique values: {unique_values})")
        continue

    print(f"\n* Predicted Labels vs. '{original_col}':")
    try:
        # Missing feature values are grouped under an explicit 'Unknown' row.
        row_values = df_predictions[original_col].fillna('Unknown')
        predicted_values = df_predictions[label_column]
        cross_tab = pd.crosstab(row_values, predicted_values)
        cross_tab_percent = pd.crosstab(row_values, predicted_values, normalize='index') * 100

        print("Counts:")
        print(cross_tab)
        print("\nRow Percentages (%):")
        print(cross_tab_percent.round(1))
        analyzed_cross_tabs = True

    except Exception as crosstab_error:
        print(f"Could not generate crosstab for '{original_col}': {crosstab_error}")

if not analyzed_cross_tabs:
    print("No suitable original columns found or specified for cross-tabulation analysis.")


print("\n--- Analysis Complete ---")

In [None]:
# Plot only when an earlier cell left a non-empty summary_df DataFrame behind.
summary_available = (
    'summary_df' in locals()
    and isinstance(summary_df, pd.DataFrame)
    and not summary_df.empty
)
if not summary_available:
    print("Skipping F1 summary plot: summary_df not found, is not a DataFrame, or is empty.")
else:
    print("\n--- Plotting F1-Score Summary ---")
    try:
        ax = summary_df.plot(kind='bar', figsize=(14, 8), rot=45)
        plt.title('Comparison of Model F1-Macro Scores across Feature Sets')
        plt.ylabel('Validation F1-Macro Score')
        plt.xlabel('Feature Set')
        plt.legend(title='Models', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(axis='y', linestyle='--')
        # Start the y-axis slightly below the lowest score, but never below 0.
        lowest_score = summary_df.min().min()
        plt.ylim(bottom=max(0, lowest_score - 0.05))
        plt.tight_layout(rect=[0, 0, 0.85, 1])
        f1_plot_path = os.path.join(plot_save_dir, 'f1_summary_comparison.png')
        plt.savefig(f1_plot_path, bbox_inches='tight')
        print(f"Saved F1 summary plot to {f1_plot_path}")
        plt.close()
    except Exception as plot_error:
        print(f"Error plotting F1 summary chart: {plot_error}")

In [None]:
print("\n--- Checking for XGBoost Training History ---")
if isinstance(loaded_model, xgb.XGBClassifier) and hasattr(loaded_model, 'evals_result'):
    # Bound before the try so the generic handler below can always print it,
    # even when evals_result() itself is the call that failed; otherwise the
    # handler would raise NameError and hide the real error.
    results_eval = None
    try:
        results_eval = loaded_model.evals_result()
        if results_eval and 'validation_0' in results_eval:
            # Plot the first metric recorded for the first evaluation set.
            eval_metric = list(results_eval['validation_0'].keys())[0]
            epochs = len(results_eval['validation_0'][eval_metric])
            x_axis = range(0, epochs)

            plt.figure(figsize=(10, 6))
            plt.plot(x_axis, results_eval['validation_0'][eval_metric], label='Validation')

            plt.legend()
            plt.ylabel(f'{eval_metric.capitalize()}')
            plt.xlabel('Boosting Rounds')
            plt.title(f'XGBoost {eval_metric.capitalize()} History')
            plt.grid(True)
            plt.tight_layout()
            history_plot_path = os.path.join(plot_save_dir, 'xgboost_training_history.png')
            plt.savefig(history_plot_path, bbox_inches='tight')
            print(f"Plotted and saved XGBoost training history to {history_plot_path}.")
            plt.close()
        else:
            print("No evaluation results found in 'validation_0'. Cannot plot history.")

    except AttributeError:
         print("The loaded XGBoost model does not have evaluation results stored ('evals_result()').")
         print("This usually happens if early stopping wasn't used or the model wasn't trained with an eval_set directly.")
    except Exception as e:
        print(f"Error plotting XGBoost history: {e}")
        print("Eval results structure:", results_eval)
else:
    print("Skipping XGBoost history plot: Loaded model is not XGBoost or lacks evaluation results.")


In [None]:
# The plot needs validation labels, predictions and class names from earlier cells.
cell_namespace = locals()  # at cell level this is the notebook's global namespace
inputs_ready = all(
    needed in cell_namespace for needed in ('y_val_true', 'y_val_pred', 'target_names')
)
if not inputs_ready:
    print("Skipping confusion matrix plot: y_val_true, y_val_pred, or target_names not found.")
else:
    print("\n--- Plotting Confusion Matrix ---")
    try:
        cm = confusion_matrix(y_val_true, y_val_pred)
        plt.figure(figsize=(8, 6))
        heatmap_options = dict(annot=True, fmt='d', cmap='Blues',
                               xticklabels=target_names, yticklabels=target_names)
        sns.heatmap(cm, **heatmap_options)
        model_file_name = os.path.basename(model_load_path)
        plt.title(f'Confusion Matrix for Loaded Model ({model_file_name})')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        cm_plot_path = os.path.join(plot_save_dir, 'confusion_matrix_validation.png')
        plt.savefig(cm_plot_path, bbox_inches='tight')
        print(f"Saved confusion matrix plot to {cm_plot_path}")
        plt.close()
    except Exception as plot_error:
        print(f"Error plotting confusion matrix: {plot_error}")
