# Graph

## Imports

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Import XGBoost
import xgboost as xgb

# Import TensorFlow for Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import node2vec
from karateclub import DeepWalk


## Load files

In [None]:
# Locations of the two raw, header-less input files.
edgelist_file = 'data_files/edgelist.txt'
labels_file = 'y_train.txt'

# Edge list: the file has no header row, so the two columns are named here.
edges_df = pd.read_csv(edgelist_file, header=None, names=['source', 'target'])

# Class labels: likewise header-less, one (product_id, label) pair per row.
labels_df = pd.read_csv(labels_file, header=None, names=['product_id', 'label'])

# Labelled train / test splits; these CSVs are read with their own header row.
train_df = pd.read_csv('split_dataset/train.csv')
test_df = pd.read_csv('split_dataset/test.csv')

# Report the four frames in a fixed order: first every shape, then the first
# rows of every frame.
loaded_frames = (
    ("Edge list", edges_df),
    ("Labels", labels_df),
    ("Train set", train_df),
    ("Test set", test_df),
)

for title, frame in loaded_frames:
    print(f"{title} shape: {frame.shape}")

for title, frame in loaded_frames:
    print(f"\n{title} sample:")
    print(frame.head())


## Create Graph

In [None]:
# Build the graph: every (source, target) row of edges_df becomes one edge.
G = nx.from_pandas_edgelist(edges_df, source='source', target='target')

# Headline statistics of the full graph.
total_nodes = G.number_of_nodes()
total_edges = G.number_of_edges()
print(f"Number of nodes: {total_nodes}")
print(f"Number of edges: {total_edges}")
print(f"Network Density: {nx.density(G):.6f}")

# Is every node reachable from every other node?
is_connected = nx.is_connected(G)
print(f"Is the graph connected? {is_connected}")

if not is_connected:
    # Summarise the connected component that contains the most nodes.
    largest_cc = max(nx.connected_components(G), key=len)
    largest_cc_subgraph = G.subgraph(largest_cc)
    cc_nodes = largest_cc_subgraph.number_of_nodes()
    cc_edges = largest_cc_subgraph.number_of_edges()

    print("\nLargest Connected Component:")
    print(f"  Nodes: {cc_nodes}")
    print(f"  Edges: {cc_edges}")
    print(f"  Percentage of total nodes: {cc_nodes / total_nodes * 100:.2f}%")


## Prepare train and test features

In [None]:
# Node ids to featurise, taken from the 'product_id' column of each split.
train_product_ids, test_product_ids = (
    split['product_id'].tolist() for split in (train_df, test_df)
)

# NOTE(review): `extract_graph_features` is not defined in the visible part of
# this notebook; it has to exist in the kernel already — confirm where it lives.
print("Extracting features for training set...")
train_features = extract_graph_features(G, train_product_ids)

print("\nExtracting features for testing set...")
test_features = extract_graph_features(G, test_product_ids)

# Targets for both splits, as plain arrays.
train_labels, test_labels = train_df['label'].values, test_df['label'].values

# Quick look at what came back.
print("\nTraining features shape:", train_features.shape)
print("Testing features shape:", test_features.shape)

# Total number of missing cells per feature table.
train_nan_total = train_features.isnull().sum().sum()
test_nan_total = test_features.isnull().sum().sum()
print("\nMissing values in training features:", train_nan_total)
print("Missing values in testing features:", test_nan_total)

# Any missing feature value is replaced by 0.
train_features, test_features = train_features.fillna(0), test_features.fillna(0)

# Standardise: statistics are learned on the training features only and then
# applied unchanged to the test features.
scaler = StandardScaler()
scaler.fit(train_features)
train_features_scaled = scaler.transform(train_features)
test_features_scaled = scaler.transform(test_features)


## Node2Vec Embeddings + XGBoost

In [None]:
# Generate Node2Vec embeddings and evaluate an XGBoost classifier on them.
import traceback

EMBEDDING_DIM = 64  # length of every Node2Vec vector produced in this cell


def lookup_node_embeddings(keyed_vectors, node_ids, dim):
    """Collect the embedding of each node id into one 2-D array.

    Args:
        keyed_vectors: Object indexed by the *string* form of a node id (here
            the ``wv`` attribute of the fitted model); it must raise
            ``KeyError`` for ids it does not know.
        node_ids: Sequence of node ids, in the row order wanted for the output.
        dim: Length of each embedding vector.

    Returns:
        ``(features, n_missing)``: ``features`` has shape
        ``(len(node_ids), dim)`` with an all-zero row for every id without an
        embedding, and ``n_missing`` is the number of such rows.
    """
    features = np.zeros((len(node_ids), dim))
    n_missing = 0
    for row, node_id in enumerate(node_ids):
        try:
            features[row] = keyed_vectors[str(node_id)]
        except KeyError:
            # Node never appeared in a walk: keep the zero row, but count it so
            # the fallback is visible instead of silent.
            n_missing += 1
    return features, n_missing


print("Generating Node2Vec embeddings...")

# Drop the results of any earlier run of this cell. Without this, a failure
# below would leave stale objects in the kernel and later cells would silently
# predict with an outdated model.
n2v_model = n2v_scaler = n2v_xgb_model = None

try:
    # Node2Vec looks nodes up by string below, so rebuild the graph with
    # string node ids.
    G_node2vec = nx.Graph()
    G_node2vec.add_edges_from((str(u), str(v)) for u, v in G.edges())

    # Initialize node2vec model
    node2vec_model = node2vec.Node2Vec(
        G_node2vec,
        dimensions=EMBEDDING_DIM,  # Embedding dimension
        walk_length=5,             # Length of each random walk
        num_walks=5,               # Number of random walks per node
        workers=1                  # Number of parallel workers
    )

    # Train the model
    # NOTE(review): window (10) is larger than walk_length (5) and batch_words
    # is very small — confirm these values are intentional.
    print("Training Node2Vec model...")
    n2v_model = node2vec_model.fit(
        window=10,       # Context size for optimization
        min_count=1,     # Minimum count of node occurrences
        batch_words=4    # Number of words per batch
    )

    # One embedding row per train / test product, zero rows for unknown nodes.
    train_node2vec_features, n_train_missing = lookup_node_embeddings(
        n2v_model.wv, train_product_ids, EMBEDDING_DIM
    )
    test_node2vec_features, n_test_missing = lookup_node_embeddings(
        n2v_model.wv, test_product_ids, EMBEDDING_DIM
    )

    print(f"Node2Vec embeddings shape - Train: {train_node2vec_features.shape}, Test: {test_node2vec_features.shape}")
    print(f"Nodes without an embedding (zero vectors) - Train: {n_train_missing}, Test: {n_test_missing}")

    # Scale the embeddings (statistics come from the training rows only)
    n2v_scaler = StandardScaler()
    train_node2vec_scaled = n2v_scaler.fit_transform(train_node2vec_features)
    test_node2vec_scaled = n2v_scaler.transform(test_node2vec_features)

    # Train a classifier on Node2Vec embeddings
    print("Training XGBoost on Node2Vec embeddings...")
    n2v_xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        n_jobs=-1
    )
    n2v_xgb_model.fit(train_node2vec_scaled, train_labels)

    # Make predictions
    n2v_xgb_test_pred = n2v_xgb_model.predict(test_node2vec_scaled)
    n2v_xgb_test_pred_proba = n2v_xgb_model.predict_proba(test_node2vec_scaled)

    # Calculate metrics
    n2v_xgb_accuracy = accuracy_score(test_labels, n2v_xgb_test_pred)
    n2v_xgb_loss = log_loss(test_labels, n2v_xgb_test_pred_proba)

    print(f"Node2Vec XGBoost Test Accuracy: {n2v_xgb_accuracy:.4f}")
    print(f"Node2Vec XGBoost Multi-class Log Loss: {n2v_xgb_loss:.4f}")
    print("\nNode2Vec XGBoost Classification Report:")
    print(classification_report(test_labels, n2v_xgb_test_pred))

except Exception as e:
    # Best effort: the rest of the notebook may still run, but show the full
    # traceback so the cause is not reduced to a one-line message.
    print(f"Error generating Node2Vec embeddings: {e}")
    print("Skipping Node2Vec embeddings...")
    traceback.print_exc()


## Predictions for the submission test set (`test.txt`)

In [None]:
# Score the products listed in test.txt with the Node2Vec + XGBoost model and
# write the class probabilities to a CSV file.

# This cell reuses objects fitted in the Node2Vec + XGBoost cell. Stop with a
# clear message if that cell failed or has not been run in this kernel.
_prerequisites = ("n2v_model", "n2v_scaler", "n2v_xgb_model")
_unavailable = [name for name in _prerequisites if globals().get(name) is None]
if _unavailable:
    raise RuntimeError(
        "Run the Node2Vec + XGBoost cell successfully first; missing: "
        + ", ".join(_unavailable)
    )

# Load product IDs from test.txt (the id is the first comma-separated field)
print("Loading product IDs from test.txt...")
test_txt_products = []
with open("test.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. an empty last line) instead of
            # crashing in int('').
            continue
        test_txt_products.append(int(line.split(',')[0]))

print(f"Loaded {len(test_txt_products)} product IDs from test.txt")
if not test_txt_products:
    raise ValueError("test.txt contains no product IDs")

# Generate Node2Vec embeddings for test.txt products
print("Generating Node2Vec embeddings for test.txt products...")
# Take the vector length from the fitted model so it cannot drift from the
# dimension used at training time.
embedding_dim = n2v_model.wv.vector_size
test_txt_node2vec_features = np.zeros((len(test_txt_products), embedding_dim))

# Extract embeddings for test.txt products
n_without_embedding = 0
for i, node_id in enumerate(test_txt_products):
    try:
        test_txt_node2vec_features[i] = n2v_model.wv[str(node_id)]
    except KeyError:
        # Node not in embeddings: keep the zero row, but count it.
        n_without_embedding += 1

print(f"Node2Vec embeddings shape for test.txt products: {test_txt_node2vec_features.shape}")
print(f"Products without an embedding (zero vectors): {n_without_embedding}")

# Scale the embeddings using the same scaler used for training
test_txt_node2vec_scaled = n2v_scaler.transform(test_txt_node2vec_features)

# Make predictions using the trained Node2Vec XGBoost model
print("Making predictions for test.txt products...")
test_txt_pred_proba = n2v_xgb_model.predict_proba(test_txt_node2vec_scaled)

# Create a DataFrame with the predictions: one 'class<i>' column per
# probability column, rounded to 4 decimals, with the product id first.
print("Creating CSV with predictions...")
class_columns = [f'class{i}' for i in range(test_txt_pred_proba.shape[1])]
predictions_df = pd.DataFrame(test_txt_pred_proba.round(4), columns=class_columns)
predictions_df.insert(0, 'product', test_txt_products)

# Save predictions to CSV
output_csv = 'node2vec_predictions.csv'
predictions_df.to_csv(output_csv, index=False)
print(f"Predictions saved to {output_csv}")

# Display the first few rows of the predictions
print("\nSample of predictions:")
print(predictions_df.head())


# Description - Text

In [None]:
from IPython.display import display, HTML
import pandas as pd

# Name of the text column as it appears in the split CSVs.
TEXT_COLUMN = 'text_clean'

# Reload the splits for the text pipeline.
# NOTE: this rebinds train_df / test_df, and rows are dropped below, so arrays
# derived from the earlier frames (e.g. train_product_ids, train_labels) are
# not guaranteed to line up with these frames any more.
train_df = pd.read_csv('split_dataset/train.csv')
test_df = pd.read_csv('split_dataset/test.csv')

# Rows whose text is missing in each split
missing_train = train_df[train_df[TEXT_COLUMN].isnull()]
missing_test = test_df[test_df[TEXT_COLUMN].isnull()]

# Print the rows with missing values
print("Rows with missing 'description' in train_df:")
print(missing_train)

print("\nRows with missing 'description' in test_df:")
print(missing_test)

# Drop the rows without text and renumber the index 0..n-1 in one chain
# (no in-place mutation, so re-running the cell gives the same frames).
train_df = train_df.dropna(subset=[TEXT_COLUMN]).reset_index(drop=True)
test_df = test_df.dropna(subset=[TEXT_COLUMN]).reset_index(drop=True)

# Verify if any rows with missing values remain
print("Missing values in cleaned train_df:", train_df.isnull().sum())
print("Missing values in cleaned test_df:", test_df.isnull().sum())

# Check the first few rows to confirm that the index is reset
print(train_df.head())
print(test_df.head())

# Rename the text column by name rather than by position: assigning a list to
# `.columns` would mislabel the data if the CSV column order differs, and would
# raise if the file carries any extra column.
train_df = train_df.rename(columns={TEXT_COLUMN: 'description'})
test_df = test_df.rename(columns={TEXT_COLUMN: 'description'})

# NOTE(review): escape=False renders any HTML markup contained in the text
# as-is; use the default escaping if the descriptions are not trusted.
display(HTML(train_df.tail(5).to_html(escape=False)))

In [None]:
# Inputs are the description text, targets are the class labels, for each split.
X_train, y_train = train_df['description'], train_df['label']
X_test, y_test = test_df['description'], test_df['label']

In [None]:
# Import necessary libraries
from scipy.sparse import hstack
from sentence_transformers import SentenceTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.svm import LinearSVC

# --- Lexical features: TF-IDF over unigrams and bigrams --------------------
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.95)

# The vocabulary is learned on the labeled train set only and reused as-is
# for the labeled test set.
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train)
X_tfidf_test = tfidf_vectorizer.transform(X_test)

# --- Semantic features: sentence embeddings --------------------------------
model = SentenceTransformer('all-mpnet-base-v2')

# Pass plain lists: encode() indexes its input by position, which on a pandas
# Series only works while the index happens to be exactly 0..n-1.
X_embed_train = model.encode(X_train.tolist(), convert_to_tensor=False)
X_embed_test = model.encode(X_test.tolist(), convert_to_tensor=False)

# Concatenate TF-IDF with semantic embeddings; CSR supports the row slicing
# done by the cross-validated calibration below.
X_comb_train = hstack([X_tfidf_train, X_embed_train], format='csr')
X_comb_test = hstack([X_tfidf_test, X_embed_test], format='csr')


#---------TRAIN BEST SVC-------------------------------------

# LinearSVC has no predict_proba, so it is wrapped in a calibrator.
base_svm2 = LinearSVC(max_iter=5000, random_state=42)  # seeded for repeatable runs
svm_model2 = CalibratedClassifierCV(base_svm2, cv=5)  # Platt scaling internally

svm_model2.fit(X_comb_train, y_train)

y_pred_svm2 = svm_model2.predict(X_comb_test)
y_proba_svm2 = svm_model2.predict_proba(X_comb_test)

print("\nSVM Final Test Set Performance:")
print(f"Accuracy (SVM): {accuracy_score(y_test, y_pred_svm2):.4f}")
print(classification_report(y_test, y_pred_svm2))

# Multi-class log loss. `multiclass_log_loss` is not defined anywhere in this
# notebook, so use scikit-learn's log_loss (as the Node2Vec cell does).
# `labels` pins the probability columns to the classifier's class order.
custom_loss_svm2 = log_loss(y_test, y_proba_svm2, labels=svm_model2.classes_)
print(f"Probabilistic Log Loss (SVM): {custom_loss_svm2:.4f}")