In [1]:
from tools import *

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[bool_cols] = X_train[bool_cols].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.fit_transform(X_train[col])


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score

# Create copies for processing
train_df = train_raw.copy()
test_df = test_raw.copy()

# Convert boolean columns to integers (if any)
bool_cols = train_df.select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    train_df[bool_cols] = train_df[bool_cols].astype(int)
    test_df[bool_cols] = test_df[bool_cols].astype(int)

# Separate features and target
X = train_df[features]
y = train_df['y']
X_test = test_df[features]

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = [col for col in features if col not in cat_cols]

# Scale numeric features
scaler = StandardScaler()
X_numeric = scaler.fit_transform(X[numeric_cols])
X_test_numeric = scaler.transform(X_test[numeric_cols])

# Label encode categorical features
cat_vocab_sizes = {}
X_cat = {}
X_test_cat = {}
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    cat_vocab_sizes[col] = X[col].nunique()
    X_cat[col] = X[col].values
    label_encoders[col] = le
    X_test[col] = le.transform(X_test[col])
    X_test_cat[col] = X_test[col].values

# Encode target variable
target_le = LabelEncoder()
y_enc = target_le.fit_transform(y)

# Prepare test inputs
test_inputs = {"numeric": X_test_numeric} if numeric_cols else {}
for col in cat_cols:
    test_inputs[col] = X_test_cat[col]

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []
fold_log_losses = []

def build_model():
    inputs = []
    embedded_outputs = []

    # Numeric input
    if numeric_cols:
        numeric_input = Input(shape=(len(numeric_cols),), name='numeric')
        x_numeric = BatchNormalization()(numeric_input)
        inputs.append(numeric_input)
        embedded_outputs.append(x_numeric)

    # Categorical inputs
    for col in cat_cols:
        input_cat = Input(shape=(1,), name=col)
        vocab_size = cat_vocab_sizes[col]
        embed_dim = min(50, (vocab_size + 1) // 2)
        embedding = Embedding(input_dim=vocab_size + 1, output_dim=embed_dim, name=f'{col}_embed')(input_cat)
        embedding = Flatten()(embedding)
        inputs.append(input_cat)
        embedded_outputs.append(embedding)

    # Concatenate all inputs
    x = Concatenate()(embedded_outputs) if len(embedded_outputs) > 1 else embedded_outputs[0]

    # Fully connected layers
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(32, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    output = Dense(3, activation='softmax')(x)
    
    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Perform k-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold+1}...")
    
    X_tr, X_val = {}, {}
    for key in numeric_cols:
        X_tr['numeric'], X_val['numeric'] = X_numeric[train_idx], X_numeric[val_idx]
    for key in cat_cols:
        X_tr[key], X_val[key] = X_cat[key][train_idx], X_cat[key][val_idx]
    
    y_tr, y_val = y_enc[train_idx], y_enc[val_idx]
    
    model = build_model()
    early_stop = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
    
    model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )
    
    y_val_pred_prob = model.predict(X_val)
    y_val_pred = np.argmax(y_val_pred_prob, axis=1)
    
    acc = accuracy_score(y_val, y_val_pred)
    logloss = log_loss(y_val, y_val_pred_prob)
    
    fold_accuracies.append(acc)
    fold_log_losses.append(logloss)
    
    print(f"Fold {fold+1} - Accuracy: {acc:.4f}, Log Loss: {logloss:.4f}")

# Print final results
print(f"Average Accuracy: {np.mean(fold_accuracies):.4f} (+/- {np.std(fold_accuracies):.4f})")
print(f"Average Log Loss: {np.mean(fold_log_losses):.4f} (+/- {np.std(fold_log_losses):.4f})")

# Train final model on full dataset and predict test set
final_model = build_model()
final_model.fit(X_tr, y_tr, epochs=50, batch_size=32, callbacks=[early_stop], verbose=1)
y_test_prob = final_model.predict(test_inputs)
y_test_pred = np.argmax(y_test_prob, axis=1)
y_test_pred_labels = target_le.inverse_transform(y_test_pred)

submission = pd.DataFrame({"id": test_df.index, "y": y_test_pred_labels})
submission.to_csv("submission_nn_kfold.csv", index=False)
print("Final model trained and predictions saved.")


Training fold 1...
Epoch 1/50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.4020 - loss: 1.4119 - val_accuracy: 0.5910 - val_loss: 0.9108
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5454 - loss: 1.0225 - val_accuracy: 0.6230 - val_loss: 0.8169
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5778 - loss: 0.9323 - val_accuracy: 0.6430 - val_loss: 0.7648
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5961 - loss: 0.8920 - val_accuracy: 0.6830 - val_loss: 0.7364
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6131 - loss: 0.8620 - val_accuracy: 0.6830 - val_loss: 0.7260
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6295 - loss: 0.8339 - val_accuracy: 0.6860 - val_loss: 0.7167
Epoch 7/50
[1m125/125[0m [32m━━━━━━━

  current = self.get_monitor_value(logs)


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5445 - loss: 0.9999
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5823 - loss: 0.9101
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5950 - loss: 0.8869
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6327 - loss: 0.8342
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6305 - loss: 0.8334
Epoch 7/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6341 - loss: 0.8160
Epoch 8/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6588 - loss: 0.7766
Epoch 9/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6401 - loss: 0.8047
Epoch 10/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss

# Use the preprocessed X_train, X_test, and y_train directly

# Identify categorical and numeric columns
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = [col for col in features if col not in cat_cols]

# Standardize numeric columns for neural network
scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(X_train[numeric_cols])
X_test_numeric = scaler.transform(X_test[numeric_cols])

# Store vocabulary sizes for categorical embeddings
cat_vocab_sizes = {col: X_train[col].nunique() for col in cat_cols}

# Convert categorical features to numpy arrays
X_train_cat = {col: X_train[col].values for col in cat_cols}
X_test_cat = {col: X_test[col].values for col in cat_cols}

# ---------------------------
# K-Fold Cross Validation
# ---------------------------
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
log_losses = []

for train_idx, val_idx in kf.split(X_train, y_train):
    # Prepare inputs for training/validation split
    train_inputs = {'numeric': X_train_numeric[train_idx]}
    val_inputs = {'numeric': X_train_numeric[val_idx]}
    
    for col in cat_cols:
        train_inputs[col] = X_train_cat[col][train_idx]
        val_inputs[col] = X_train_cat[col][val_idx]
    
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # Define the model
    inputs = []
    embedded_outputs = []
    
    # Numeric input
    numeric_input = Input(shape=(len(numeric_cols),), name='numeric')
    x_numeric = BatchNormalization()(numeric_input)
    inputs.append(numeric_input)
    embedded_outputs.append(x_numeric)
    
    # Categorical input with embedding layers
    for col in cat_cols:
        input_cat = Input(shape=(1,), name=col)
        embedding = Embedding(input_dim=cat_vocab_sizes[col] + 1, output_dim=min(50, (cat_vocab_sizes[col] + 1) // 2))(input_cat)
        embedding = Flatten()(embedding)
        inputs.append(input_cat)
        embedded_outputs.append(embedding)
    
    # Concatenate all inputs
    x = Concatenate()(embedded_outputs) if len(embedded_outputs) > 1 else embedded_outputs[0]
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(32, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    output = Dense(3, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Train model
    model.fit(train_inputs, y_tr, validation_data=(val_inputs, y_val), epochs=50, batch_size=32, verbose=1)
    
    # Evaluate model
    y_val_pred = model.predict(val_inputs)
    accuracies.append(accuracy_score(y_val, np.argmax(y_val_pred, axis=1)))
    log_losses.append(log_loss(y_val, y_val_pred))

# Print average accuracy and log loss
print(f'Average Accuracy: {np.mean(accuracies):.4f}')
print(f'Average Log Loss: {np.mean(log_losses):.4f}')

# Train final model on full data and make predictions
test_inputs = {'numeric': X_test_numeric}
for col in cat_cols:
    test_inputs[col] = X_test_cat[col]

model.fit(train_inputs, y_tr, epochs=50, batch_size=32, verbose=1)
y_test_prob = model.predict(test_inputs)
y_test_pred = np.argmax(y_test_prob, axis=1)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Save submission
submission = pd.DataFrame({"id": test_raw.index, "y": y_test_pred_labels})
submission.to_csv("submission_nn.csv", index=False)
print("Submission saved as submission_nn.csv")

Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4121 - loss: 1.4359 - val_accuracy: 0.5680 - val_loss: 0.9188
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5462 - loss: 1.0276 - val_accuracy: 0.6290 - val_loss: 0.8171
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5616 - loss: 0.9640 - val_accuracy: 0.6530 - val_loss: 0.7656
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5974 - loss: 0.8850 - val_accuracy: 0.6770 - val_loss: 0.7411
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6021 - loss: 0.8793 - val_accuracy: 0.6900 - val_loss: 0.7325
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6167 - loss: 0.8421 - val_accuracy: 0.6870 - val_loss: 0.7255
Epoch 7/50
[1m125/125[0m 

In [14]:
train_inputs['numeric'].shape
# y_train.shape
y_tr.shape

(4000,)

In [None]:
# train_inputs.keys()
# train_inputs['numeric'].shape
# train_inputs['x7'].shape
# y_train_enc.shape

(5000,)