# Product Classification using TF-IDF, Graph Features, and Ensemble Models with Stacking

This notebook implements product classification by combining approaches from two different notebooks:
1. Graph features from `tfidf_node2vec_classification.ipynb`
2. TF-IDF implementation from `Data_Challenge_TFIDF.ipynb`
3. Models: CalibratedCSVC and XGBoost from `Data_Challenge_TFIDF.ipynb`, Neural Network and Ensemble from `tfidf_node2vec_classification.ipynb`
4. Focus on optimizing log loss like in `Data_Challenge_TFIDF.ipynb`
5. Test predictions implementation from `Data_Challenge_TFIDF.ipynb`
6. Uses stacking instead of weighted averaging for ensemble


## 1. Imports and Setup


In [1]:
# General imports
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
import re
import string
import csv
import spacy
from collections import Counter
import joblib

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler, label_binarize

# Models
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_selection import SelectKBest, chi2

# Graph embeddings
import node2vec
from node2vec import Node2Vec
from gensim.models import Word2Vec

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import StratifiedKFold, train_test_split

# For stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


2025-05-31 17:50:09.556000: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748703009.614841   27540 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748703009.632049   27540 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748703009.765498   27540 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748703009.765518   27540 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748703009.765522   27540 computation_placer.cc:177] computation placer alr

## 2. Helper Functions


In [2]:
# Function to create price features (from tfidf_node2vec_classification.ipynb)
def create_price_features(product_ids, price_df):
    """
    Create price-based features for a list of product IDs

    Args:
        product_ids: List of product IDs
        price_df: DataFrame with product_id and price columns

    Returns:
        DataFrame with price features
    """
    # Create a DataFrame with product IDs as index
    price_features = pd.DataFrame(index=product_ids)

    # Map prices to products
    price_dict = dict(zip(price_df['product_id'], price_df['price']))
    price_features['price'] = price_features.index.map(lambda x: price_dict.get(x, np.nan))

    # Fill missing prices with median
    median_price = price_df['price'].median()
    price_features['price'].fillna(median_price, inplace=True)

    # Create price buckets (as binary features)
    price_features['price_0_10'] = (price_features['price'] <= 10).astype(int)
    price_features['price_10_100'] = ((price_features['price'] > 10) & (price_features['price'] <= 100)).astype(int)
    price_features['price_100_plus'] = (price_features['price'] > 100).astype(int)

    # Log transformation of price
    price_features['price_log'] = np.log1p(price_features['price'])

    # Price rank (percentile)
    price_features['price_rank'] = price_features['price'].rank(pct=True)

    # Z-score of price (how many standard deviations from the mean)
    mean_price = price_df['price'].mean()
    std_price = price_df['price'].std()
    price_features['price_zscore'] = (price_features['price'] - mean_price) / std_price

    return price_features

# Function to extract graph features for a set of nodes (from tfidf_node2vec_classification.ipynb)
def extract_graph_features(G, node_list):
    print("Calculating degree centrality...")
    degree_centrality = nx.degree_centrality(G)

    print("Calculating clustering coefficient...")
    clustering_coefficient = nx.clustering(G)

    print("Calculating PageRank...")
    pagerank = nx.pagerank(G, alpha=0.85, max_iter=100)

    print("Calculating triangle count...")
    triangles = nx.triangles(G)

    # Create a dataframe with the features
    features_df = pd.DataFrame(index=node_list)

    features_df['degree_centrality'] = features_df.index.map(lambda x: degree_centrality.get(str(x), 0))
    features_df['clustering_coefficient'] = features_df.index.map(lambda x: clustering_coefficient.get(str(x), 0))
    features_df['pagerank'] = features_df.index.map(lambda x: pagerank.get(str(x), 0))
    features_df['triangle_count'] = features_df.index.map(lambda x: triangles.get(str(x), 0))

    # Degree (number of connections)
    print("Calculating degree...")
    degree_dict = dict(G.degree())
    features_df['degree'] = features_df.index.map(lambda x: degree_dict.get(str(x), 0))

    return features_df

# Text preprocessing function with lemmatization (from Data_Challenge_TFIDF.ipynb)
def clean_text_with_lemma(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Use spaCy to tokenize and lemmatize
    doc = nlp(text)
    tokens = [
        token.lemma_.lower() for token in doc
        if token.lemma_.lower() not in STOP_WORDS
        and not token.is_punct
        and not token.is_space
        and not token.like_num
    ]

    return ' '.join(tokens)

# Function to calculate multiclass log loss (from both notebooks)
def multiclass_log_loss(y_true, y_pred_proba, eps=1e-15):
    """
    y_true: array-like of shape (N,) - true class labels
    y_pred_proba: array-like of shape (N, C) - predicted class probabilities
    """
    # Number of samples
    N = y_true.shape[0]

    # One-hot encode the true labels (yij)
    y_true_one_hot = label_binarize(y_true, classes=np.arange(y_pred_proba.shape[1]))

    # Clip predicted probabilities to avoid log(0)
    y_pred_proba = np.clip(y_pred_proba, eps, 1 - eps)

    # Compute the log loss
    loss = -np.sum(y_true_one_hot * np.log(y_pred_proba)) / N
    return loss


## 3. Loading Data


In [3]:
# Load spaCy English model for text preprocessing
nlp = spacy.load("en_core_web_lg")
from spacy.lang.en.stop_words import STOP_WORDS

# Load the edge list data
edgelist_file = 'data_files/edgelist.txt'
edges_df = pd.read_csv(edgelist_file, header=None, names=['source', 'target'])

# Load the class labels
labels_file = 'y_train.txt'
labels_df = pd.read_csv(labels_file, header=None, names=['product_id', 'label'])

# Load the train and test splits
train_df = pd.read_csv('split_dataset/train.csv')
test_df = pd.read_csv('split_dataset/test.csv')

# Load price data
price_df = pd.read_csv('data_files/price.txt', header=None, names=['product_id', 'price'])

# Display basic information about the datasets
print(f"Edge list shape: {edges_df.shape}")
print(f"Labels shape: {labels_df.shape}")
print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Price data shape: {price_df.shape}")

# Check the first few rows of each dataset
print("\nEdge list sample:")
print(edges_df.head())

print("\nLabels sample:")
print(labels_df.head())

print("\nTrain set sample:")
print(train_df.head())

print("\nTest set sample:")
print(test_df.head())

print("\nPrice data sample:")
print(price_df.head())


Edge list shape: (1811087, 2)
Labels shape: (182006, 2)
Train set shape: (145604, 3)
Test set shape: (36402, 3)
Price data shape: (198817, 2)

Edge list sample:
   source  target
0  251528  237411
1  100805   74791
2   38634   97747
3  247470   77089
4  267060  250490

Labels sample:
   product_id  label
0       66795      9
1      242781      3
2       91280      2
3       56356      5
4      218494      0

Train set sample:
   product_id                                         text_clean  label
0      114704  hornady unprimed winchester cartridge case hor...      2
1      250731  tachikara tk leopard knee pad tachikara tk leo...     11
2      152967  g asd replacement cutter aluminum amp carbon u...      2
3        4541  mtech usa mt tactical folding knife inch close...      2
4      142062  nhl pittsburgh penguins game day black pro sha...      7

Test set sample:
   product_id                                         text_clean  label
0       56218                             katz h

In [4]:
# Handle missing values in text data
# Find rows with missing 'description' in train_df
missing_train = train_df[train_df['text_clean'].isnull()]

# Find rows with missing 'description' in test_df
missing_test = test_df[test_df['text_clean'].isnull()]

# Print the rows with missing values
print("Rows with missing 'description' in train_df:")
print(missing_train)

print("\nRows with missing 'description' in test_df:")
print(missing_test)

# Remove rows with missing 'description' in train_df
train_df = train_df.dropna(subset=['text_clean'])

# Remove rows with missing 'description' in test_df
test_df = test_df.dropna(subset=['text_clean'])

# Verify if any rows with missing values remain
print("Missing values in cleaned train_df:", train_df.isnull().sum())
print("Missing values in cleaned test_df:", test_df.isnull().sum())

# Reset the index after removing rows with missing values
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Prepare data for modeling
X_train = train_df['text_clean']
y_train = train_df['label']

X_test = test_df['text_clean']
y_test = test_df['label']

# Load product IDs from test.txt for final predictions
print("Loading product IDs from test.txt...")
with open('test.txt', 'r') as f:
    test_products = [line.strip().rstrip(',') for line in f.readlines()]

print(f"Loaded {len(test_products)} product IDs from test.txt")
print("First 10 product IDs:", test_products[:10])

# Load descriptions for all products
print("Getting descriptions")
descriptions = dict()
with open("data_files/description_part_1.txt", "r") as f:
    for line in f:
        if '|=|' in line:
            t = line.split('|=|')
            descriptions[int(t[0])] = t[1][:-1]

with open("data_files/description_part_2.txt", "r") as f:
    for line in f:
        if '|=|' in line:
            t = line.split('|=|')
            descriptions[int(t[0])] = t[1][:-1]

# Get descriptions for test products
test_text = []
for i in test_products:
    try:
        test_text.append(descriptions[int(i)])
    except (KeyError, ValueError):
        test_text.append("")  # Empty string for missing descriptions

# Apply the cleaning function to test data
print("Apply cleaning")
test_text_cleaned = [clean_text_with_lemma(text) for text in test_text]


Rows with missing 'description' in train_df:
       product_id text_clean  label
89285      265165        NaN      7

Rows with missing 'description' in test_df:
       product_id text_clean  label
34767      174103        NaN      5
Missing values in cleaned train_df: product_id    0
text_clean    0
label         0
dtype: int64
Missing values in cleaned test_df: product_id    0
text_clean    0
label         0
dtype: int64
Loading product IDs from test.txt...
Loaded 45502 product IDs from test.txt
First 10 product IDs: ['49957', '135386', '226880', '165114', '256154', '254193', '20830', '46170', '19248', '158023']
Getting descriptions
Apply cleaning


## 4. Creating Graph and Extracting Graph Features


In [5]:
# Create a graph from the edge list
print("Creating graph")
G = nx.from_pandas_edgelist(edges_df, 'source', 'target')

# Print basic information about the graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print(f"Network Density: {nx.density(G):.6f}")

# Check if the graph is connected
is_connected = nx.is_connected(G)
print(f"Is the graph connected? {is_connected}")

if not is_connected:
    # Get the largest connected component
    largest_cc = max(nx.connected_components(G), key=len)
    largest_cc_subgraph = G.subgraph(largest_cc)
    print(f"\nLargest Connected Component:")
    print(f"  Nodes: {largest_cc_subgraph.number_of_nodes()}")
    print(f"  Edges: {largest_cc_subgraph.number_of_edges()}")
    print(f"  Percentage of total nodes: {largest_cc_subgraph.number_of_nodes() / G.number_of_nodes() * 100:.2f}%")

# Get the list of product IDs from train and test sets
train_product_ids = train_df['product_id'].tolist()
test_product_ids = test_df['product_id'].tolist()

# Extract graph features for training and testing sets
print("\nExtracting graph features for training set...")
train_graph_features = extract_graph_features(G, train_product_ids)

print("\nExtracting graph features for testing set...")
test_graph_features = extract_graph_features(G, test_product_ids)

# Fill missing values with 0 if any
train_graph_features = train_graph_features.fillna(0)
test_graph_features = test_graph_features.fillna(0)

# Scale the features
graph_scaler = StandardScaler()
train_graph_features_scaled = graph_scaler.fit_transform(train_graph_features)
test_graph_features_scaled = graph_scaler.transform(test_graph_features)

# Generate Node2Vec embeddings
model_path = "node2vec.model"
print("Loading Node2Vec embeddings...")

try:
    if os.path.exists(model_path):
        print("Loading existing Node2Vec model from disk...")
        n2v_model = Word2Vec.load(model_path)
    else:
        # Convert NetworkX graph to node2vec format
        # First, ensure all nodes are strings for compatibility
        G_node2vec = nx.Graph()
        for edge in G.edges():
            G_node2vec.add_edge(str(edge[0]), str(edge[1]))

        # Initialize node2vec model
        node2vec_model = node2vec.Node2Vec(
            G_node2vec,
            dimensions=128,  # Embedding dimension
            walk_length=10,  # Length of each random walk
            num_walks=10,    # Number of random walks per node
            workers=1       # Number of parallel workers
        )

        # Train the model
        print("Training Node2Vec model...")
        n2v_model = node2vec_model.fit(
            window=10,       # Context size for optimization
            min_count=1,     # Minimum count of node occurrences
            batch_words=4    # Number of words per batch
        )

        n2v_model.save(model_path)

    # Generate embeddings for train and test nodes
    train_node2vec_features = np.zeros((len(train_product_ids), 128))
    test_node2vec_features = np.zeros((len(test_product_ids), 128))

    # Extract embeddings for training nodes
    for i, node_id in enumerate(train_product_ids):
        try:
            train_node2vec_features[i] = n2v_model.wv[str(node_id)]
        except KeyError:
            # If node not in embeddings, use zeros
            pass

    # Extract embeddings for testing nodes
    for i, node_id in enumerate(test_product_ids):
        try:
            test_node2vec_features[i] = n2v_model.wv[str(node_id)]
        except KeyError:
            # If node not in embeddings, use zeros
            pass

    print(f"Node2Vec embeddings shape - Train: {train_node2vec_features.shape}, Test: {test_node2vec_features.shape}")

    # Scale the embeddings
    n2v_scaler = StandardScaler()
    train_node2vec_scaled = n2v_scaler.fit_transform(train_node2vec_features)
    test_node2vec_scaled = n2v_scaler.transform(test_node2vec_features)

except Exception as e:
    print(f"Error generating Node2Vec embeddings: {e}")
    print("Skipping Node2Vec embeddings...")


Creating graph
Number of nodes: 276453
Number of edges: 1811087
Network Density: 0.000047
Is the graph connected? False

Largest Connected Component:
  Nodes: 273012
  Edges: 1808230
  Percentage of total nodes: 98.76%

Extracting graph features for training set...
Calculating degree centrality...
Calculating clustering coefficient...
Calculating PageRank...
Calculating triangle count...
Calculating degree...

Extracting graph features for testing set...
Calculating degree centrality...
Calculating clustering coefficient...
Calculating PageRank...
Calculating triangle count...
Calculating degree...
Loading Node2Vec embeddings...
Loading existing Node2Vec model from disk...
Node2Vec embeddings shape - Train: (145603, 128), Test: (36401, 128)


## 5. Extracting TF-IDF Features from Text (using approach from Data_Challenge_TFIDF.ipynb)


In [6]:
print("Initializing TF-IDF")
# Initialize the TfidfVectorizer with parameters from Data_Challenge_TFIDF.ipynb
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.95, sublinear_tf=True, norm='l2')

# Fit and transform the text data to get the TF-IDF matrix
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train)
X_tfidf_test = tfidf_vectorizer.transform(X_test)
X_tfidf_test_comp = tfidf_vectorizer.transform(test_text_cleaned)  # For final predictions

print(f"TF-IDF features shape - Train: {X_tfidf_train.shape}, Test: {X_tfidf_test.shape}")


Initializing TF-IDF
TF-IDF features shape - Train: (145603, 2060013), Test: (36401, 2060013)


## 6. Creating Price Features


In [7]:
# Create price features for training and testing sets
print("Creating price features for training set...")
train_price_features = create_price_features(train_product_ids, price_df)

print("Creating price features for testing set...")
test_price_features = create_price_features(test_product_ids, price_df)

# Display the first few rows of price features
print("\nTrain price features sample:")
print(train_price_features.head())

print("\nTest price features sample:")
print(test_price_features.head())

# Scale the price features (except binary features)
price_scaler = StandardScaler()
price_columns_to_scale = ['price', 'price_log', 'price_rank', 'price_zscore']
binary_columns = ['price_0_10', 'price_10_100', 'price_100_plus']

# Scale the selected columns
train_price_scaled = train_price_features.copy()
test_price_scaled = test_price_features.copy()

train_price_scaled[price_columns_to_scale] = price_scaler.fit_transform(train_price_features[price_columns_to_scale])
test_price_scaled[price_columns_to_scale] = price_scaler.transform(test_price_features[price_columns_to_scale])

# Convert to numpy arrays for easier handling
train_price_features_array = train_price_scaled.values
test_price_features_array = test_price_scaled.values

print(f"Price features shape - Train: {train_price_features_array.shape}, Test: {test_price_features_array.shape}")


Creating price features for training set...
Creating price features for testing set...

Train price features sample:
        price  price_0_10  price_10_100  price_100_plus  price_log  \
114704  43.20           0             1               0   3.788725   
250731  24.99           0             1               0   3.257712   
152967  22.95           0             1               0   3.175968   
4541     8.49           1             0               0   2.250239   
142062  24.99           0             1               0   3.257712   

        price_rank  price_zscore  
114704    0.760266     -0.130318  
250731    0.495769     -0.326493  
152967    0.339856     -0.348470  
4541      0.095352     -0.504247  
142062    0.495769     -0.326493  

Test price features sample:
         price  price_0_10  price_10_100  price_100_plus  price_log  \
56218    11.99           0             1               0   2.564180   
42346    65.00           0             1               0   4.189655   
215842   2

## 7. Combining Features


In [8]:
# Convert node2vec embeddings to sparse for efficient concatenation
print("Converting node2vec embeddings to sparse format...")
train_node2vec_sparse = csr_matrix(train_node2vec_scaled)
test_node2vec_sparse = csr_matrix(test_node2vec_scaled)

# Convert graph features to sparse
print("Converting graph features to sparse format...")
train_graph_sparse = csr_matrix(train_graph_features_scaled)
test_graph_sparse = csr_matrix(test_graph_features_scaled)

# Convert price features to sparse
print("Converting price features to sparse format...")
train_price_sparse = csr_matrix(train_price_features_array)
test_price_sparse = csr_matrix(test_price_features_array)

# Combine all features: TF-IDF + graph features + node2vec embeddings + price features
print("Combining all features...")
X_combined_train = hstack([X_tfidf_train, train_graph_sparse, train_node2vec_sparse, train_price_sparse], format='csr')
X_combined_test = hstack([X_tfidf_test, test_graph_sparse, test_node2vec_sparse, test_price_sparse], format='csr')

# Output memory usage information
print(f"Final combined features shape - Train: {X_combined_train.shape}, Test: {X_combined_test.shape}")
print("Memory usage (approximate):")
print(f"  - X_combined_train: {X_combined_train.data.nbytes / (1024 ** 2):.2f} MB")
print(f"  - X_combined_test: {X_combined_test.data.nbytes / (1024 ** 2):.2f} MB")


Converting node2vec embeddings to sparse format...
Converting graph features to sparse format...
Converting price features to sparse format...
Combining all features...
Final combined features shape - Train: (145603, 2060153), Test: (36401, 2060153)
Memory usage (approximate):
  - X_combined_train: 230.32 MB
  - X_combined_test: 55.17 MB


## 8. Model Training and Evaluation


In [9]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif

k_feat = 1000000
print(f"Selecting {k_feat} features")
feature_selector = SelectKBest(f_classif, k=k_feat)
X_combined_train_selected = feature_selector.fit_transform(X_combined_train, y_train)
X_combined_test_selected = feature_selector.transform(X_combined_test)
print(X_combined_train_selected.shape)

Selecting 1000000 features
(145603, 1000000)


In [10]:
# We trained the LinearSVC on 2m features so we use another feature selector
k_feat = 2000000
print(f"Selecting {k_feat} features")
feature_selector_svm = SelectKBest(f_classif, k=k_feat)
X_combined_train_selected_svm = feature_selector_svm.fit_transform(X_combined_train, y_train)
X_combined_test_selected_svm = feature_selector_svm.transform(X_combined_test)
print(X_combined_train_selected_svm.shape)

Selecting 2000000 features
(145603, 2000000)


### 8.1 Loading Pre-trained Models


In [14]:
# Load the pre-trained models
print("Loading LinearSVC model...")
svm_model = joblib.load('linear_svc_model.pkl')

y_pred_svm = svm_model.predict(X_combined_test_selected_svm)
y_proba_svm = svm_model.predict_proba(X_combined_test_selected_svm)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_log_loss = multiclass_log_loss(y_test, y_proba_svm)

print(f"LinearSVC Accuracy: {svm_accuracy:.4f}, Log Loss: {svm_log_loss:.4f}")

Loading LinearSVC model...
LinearSVC Accuracy: 0.9439, Log Loss: 0.2128


In [13]:
# Load XGBoost model
print("Loading XGBoost model...")
xgb_model = joblib.load('xgb_model.pkl')

y_pred_xgb = xgb_model.predict(X_combined_test_selected)
y_proba_xgb = xgb_model.predict_proba(X_combined_test_selected)

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_log_loss = multiclass_log_loss(y_test, y_proba_xgb)

print(f"XGBoost Accuracy: {xgb_accuracy:.4f}, Log Loss: {xgb_log_loss:.4f}")

Loading XGBoost model...
XGBoost Accuracy: 0.9283, Log Loss: 0.2413


In [12]:
# Load Neural Network model
print("Loading Neural Network model...")
nn_model = load_model('neural_network_model.keras')

y_pred_nn = nn_model.predict(X_combined_test_selected)
y_pred_nn_classes = np.argmax(y_pred_nn, axis=1)

nn_accuracy = accuracy_score(y_test, y_pred_nn_classes)
nn_log_loss = multiclass_log_loss(y_test, y_pred_nn)

print(f"Neural Network Accuracy: {nn_accuracy:.4f}, Log Loss: {nn_log_loss:.4f}")

Loading Neural Network model...
[1m1138/1138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 139ms/step
Neural Network Accuracy: 0.9323, Log Loss: 0.2617


### 8.2 Implementing Stacking Ensemble


In [15]:
# Split the training data for stacking
print("Splitting training data for stacking...")
X_train_stack, X_val_stack, y_train_stack, y_val_stack = train_test_split(
    X_combined_train_selected, y_train, test_size=0.3, random_state=42, stratify=y_train
)

X_train_stack_svm, X_val_stack_svm, y_train_stack_svm, y_val_stack_svm = train_test_split(
    X_combined_train_selected_svm, y_train, test_size=0.3, random_state=42, stratify=y_train
)

# Generate predictions from base models on validation set
print("Generating base model predictions for stacking...")
val_proba_svm = svm_model.predict_proba(X_val_stack_svm)
val_proba_xgb = xgb_model.predict_proba(X_val_stack)
val_proba_nn = nn_model.predict(X_val_stack)

# Combine predictions for meta-model training
print("Preparing meta-features for stacking...")
meta_features_val = np.hstack([val_proba_svm, val_proba_xgb, val_proba_nn])

# Generate predictions from base models on test set
print("Generating base model predictions on test set...")
test_proba_svm = svm_model.predict_proba(X_combined_test_selected_svm)
test_proba_xgb = xgb_model.predict_proba(X_combined_test_selected)
test_proba_nn = nn_model.predict(X_combined_test_selected)

# Combine predictions for meta-model testing
print("Preparing meta-features for test set...")
meta_features_test = np.hstack([test_proba_svm, test_proba_xgb, test_proba_nn])

# Train a meta-model (LogisticRegression) on the stacked predictions
print("Training meta-model for stacking...")
num_classes = len(np.unique(y_train))
meta_model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
meta_model.fit(meta_features_val, y_val_stack)

# Make predictions with the stacking ensemble
print("Making predictions with stacking ensemble...")
y_pred_stack = meta_model.predict(meta_features_test)
y_proba_stack = meta_model.predict_proba(meta_features_test)

# Evaluate stacking ensemble
stack_accuracy = accuracy_score(y_test, y_pred_stack)
stack_log_loss = multiclass_log_loss(y_test, y_proba_stack)

print(f"Stacking Ensemble Accuracy: {stack_accuracy:.4f}")
print(f"Stacking Ensemble Log Loss: {stack_log_loss:.4f}")
print("\nStacking Ensemble Classification Report:")
print(classification_report(y_test, y_pred_stack))

# Compare with individual models
print("\nModel Performance Comparison:")
print(f"{'Model':<20} {'Accuracy':<10} {'Log Loss':<10}")
print(f"{'-'*40}")
print(f"{'LinearSVC':<20} {svm_accuracy:.4f}     {svm_log_loss:.4f}")
print(f"{'XGBoost':<20} {xgb_accuracy:.4f}     {xgb_log_loss:.4f}")
print(f"{'Neural Network':<20} {nn_accuracy:.4f}     {nn_log_loss:.4f}")
print(f"{'Stacking Ensemble':<20} {stack_accuracy:.4f}     {stack_log_loss:.4f}")


Splitting training data for stacking...
Generating base model predictions for stacking...
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 137ms/step
Preparing meta-features for stacking...
Generating base model predictions on test set...
[1m1138/1138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 137ms/step
Preparing meta-features for test set...
Training meta-model for stacking...
Making predictions with stacking ensemble...
Stacking Ensemble Accuracy: 0.9431
Stacking Ensemble Log Loss: 0.2821

Stacking Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3033
           1       0.91      0.92      0.91      2372
           2       0.93      0.95      0.94      8652
           3       0.96      0.97      0.97      1073
           4       0.96      0.96      0.96      3016
           5       0.98      0.96      0.97      3564
           6       0.96      0.96      0.96    

## 9. Generate Predictions for Test Products


In [None]:
# Create features for test products
print("\nCreating features for test products...")

# 1. Create graph features for test products
print("\nExtracting graph features for test products...")
test_graph_features_final = extract_graph_features(G, test_products)
test_graph_features_final = test_graph_features_final.fillna(0)
test_graph_features_final_scaled = graph_scaler.transform(test_graph_features_final)
test_graph_sparse_final = csr_matrix(test_graph_features_final_scaled)
print(f"Graph features shape for test products: {test_graph_sparse_final.shape}")

# 2. Create Node2Vec embeddings for test products
test_node2vec_features_final = np.zeros((len(test_products), 128))
for i, node_id in enumerate(test_products):
    try:
        test_node2vec_features_final[i] = n2v_model.wv[str(node_id)]
    except KeyError:
        # If node not in embeddings, use zeros
        pass

# Scale the Node2Vec embeddings
test_node2vec_scaled_final = n2v_scaler.transform(test_node2vec_features_final)
test_node2vec_sparse_final = csr_matrix(test_node2vec_scaled_final)
print(f"Node2Vec features shape for test products: {test_node2vec_sparse_final.shape}")

# 3. Create price features for test products
test_price_features_final = create_price_features(test_products, price_df)
test_price_scaled_final = test_price_features_final.copy()
test_price_scaled_final[price_columns_to_scale] = price_scaler.transform(test_price_features_final[price_columns_to_scale])
test_price_features_array_final = test_price_scaled_final.values
test_price_sparse_final = csr_matrix(test_price_features_array_final)
print(f"Price features shape for test products: {test_price_sparse_final.shape}")

# COMBINE ALL FEATURES
print("\nCombining all features...")
X_combined_test_final = hstack([
    X_tfidf_test_comp, 
    test_graph_sparse_final,
    test_node2vec_sparse_final, 
    test_price_sparse_final
], format='csr')

print(f"Combined features shape for test products: {X_combined_test_final.shape}")

# Apply feature selection
X_combined_test_final_selected = feature_selector.transform((X_combined_test_final))
X_combined_test_final_selected_svm = feature_selector_svm.transform((X_combined_test_final))

# Make predictions using all base models
print("\nMaking predictions using all base models...")
test_pred_proba_svm = svm_model.predict_proba(X_combined_test_final_selected_svm)
test_pred_proba_xgb = xgb_model.predict_proba(X_combined_test_final_selected)
test_pred_proba_nn = nn_model.predict(X_combined_test_final_selected)

# Combine predictions for meta-model
print("Preparing meta-features for final predictions...")
meta_features_final = np.hstack([test_pred_proba_svm, test_pred_proba_xgb, test_pred_proba_nn])

# Make predictions with the stacking ensemble
print("Making final predictions with stacking ensemble...")
test_pred_proba_stack = meta_model.predict_proba(meta_features_final)

print(f"Predictions shape: {test_pred_proba_stack.shape}")

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame()
predictions_df['product'] = test_products

# Add probability for each class
for i in range(test_pred_proba_stack.shape[1]):
    predictions_df[f'class{i}'] = test_pred_proba_stack[:, i].round(4)

# Save predictions to CSV
predictions_df.to_csv('predictions_stacking.csv', index=False)
print(f"Predictions saved to predictions_stacking.csv")

# Display the first few rows of the predictions
print("\nSample of predictions:")
print(predictions_df.head())