In [1]:
from tools import *

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[bool_cols] = X_train[bool_cols].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.fit_transform(X_train[col])


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Assume train_raw and test_raw are already loaded DataFrames.
# The features are named x1 to x13 and target column is 'y'

features = [f"x{i}" for i in range(1, 14)]

# Create copies for processing
train_df = train_raw.copy()
test_df = test_raw.copy()

# ---------------------------
# 1. Preprocessing
# ---------------------------

# Convert boolean columns to integers (if any)
bool_cols = train_df.select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    train_df[bool_cols] = train_df[bool_cols].astype(int)
    test_df[bool_cols] = test_df[bool_cols].astype(int)

# Separate the features we'll use
X_train = train_df[features]
X_test = test_df[features]
y_train = train_df['y']

# Identify categorical and numeric columns.
# Here we treat columns with object dtype as categorical.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
# The remaining numeric columns (including booleans now as ints)
numeric_cols = [col for col in features if col not in cat_cols]

# 1a. Process numeric columns: Scale them
scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(X_train[numeric_cols])
X_test_numeric = scaler.transform(X_test[numeric_cols])

# 1b. Process categorical columns: Label encode and record vocabulary sizes
cat_vocab_sizes = {}
X_train_cat = {}
X_test_cat = {}
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # Store number of unique values for the embedding layer:
    cat_vocab_sizes[col] = X_train[col].nunique()
    # Save the transformed columns as numpy arrays:
    X_train_cat[col] = X_train[col].values
    label_encoders[col] = le
    
    # Transform the test data using the same encoder
    X_test[col] = le.transform(X_test[col])
    X_test_cat[col] = X_test[col].values

# 1c. Encode target variable: y_train (assume classes: 'Anthropic', 'OpenAI', 'Mistral')
target_le = LabelEncoder()
y_train_enc = target_le.fit_transform(y_train)  # Now in 0,1,2

# ---------------------------
# 2. Prepare inputs for the model
# ---------------------------

# For the numeric input we'll use the scaled numeric features.
# For categorical features, we will create separate inputs that feed into embedding layers.

# Build dictionary of inputs for training and testing:
train_inputs = {}
test_inputs = {}

# Numeric input (if there are any numeric columns)
if numeric_cols:
    train_inputs['numeric'] = X_train_numeric
    test_inputs['numeric'] = X_test_numeric

# Categorical inputs: one input per categorical column.
for col in cat_cols:
    train_inputs[col] = X_train_cat[col]
    test_inputs[col] = X_test_cat[col]

# ---------------------------
# 3. Build the Neural Network Model
# ---------------------------
inputs = []
embedded_outputs = []

# 3a. Numeric input branch
if numeric_cols:
    numeric_input = Input(shape=(len(numeric_cols),), name='numeric')
    x_numeric = BatchNormalization()(numeric_input)
    inputs.append(numeric_input)
    embedded_outputs.append(x_numeric)

# 3b. Categorical input branches with Embeddings
for col in cat_cols:
    # Each categorical input is a single integer.
    input_cat = Input(shape=(1,), name=col)
    vocab_size = cat_vocab_sizes[col]
    # A rule-of-thumb for embedding dimension:
    embed_dim = min(50, (vocab_size + 1) // 2)
    embedding = Embedding(input_dim=vocab_size + 1, output_dim=embed_dim, name=f'{col}_embed')(input_cat)
    embedding = Flatten()(embedding)
    inputs.append(input_cat)
    embedded_outputs.append(embedding)

# 3c. Concatenate all embeddings/numeric representations
if len(embedded_outputs) > 1:
    x = Concatenate()(embedded_outputs)
else:
    x = embedded_outputs[0]

# 3d. Add hidden Dense layers with dropout and batch normalization
x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

# Output layer: 3 classes with softmax activation
output = Dense(3, activation='softmax')(x)

# Define and compile the model
model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ---------------------------
# 4. Train the Model with Early Stopping
# ---------------------------
# For validation, split off a small portion of the training data
X_tr, X_val, y_tr, y_val = train_test_split(train_inputs, y_train_enc, test_size=0.2, random_state=42, stratify=y_train_enc)

# When using multiple inputs, we need to pass a dictionary.
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

history = model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# ---------------------------
# 5. Predict on the Test Set and Save Submission
# ---------------------------
# Predict probabilities on test inputs, then choose the class with highest probability.
y_test_prob = model.predict(test_inputs)
y_test_pred = np.argmax(y_test_prob, axis=1)
# Convert numeric predictions back to original labels
y_test_pred_labels = target_le.inverse_transform(y_test_pred)

# Create submission DataFrame (assuming test_df index or an 'id' column is available)
submission = pd.DataFrame({"id": test_df.index, "y": y_test_pred_labels})
submission.to_csv("submission_nn.csv", index=False)

print("Neural network submission file saved as submission_nn.csv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.fit_transform(X_train[col])


ValueError: Found input variables with inconsistent numbers of samples: [2, 5000]