In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print the paths one per line in the order os.walk discovered them
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/neural-net-nexus-2-0/sample_submission.csv
/kaggle/input/neural-net-nexus-2-0/train.csv
/kaggle/input/neural-net-nexus-2-0/test.csv


In [8]:
# Data handling libraries
import numpy as np
import pandas as pd

# Model saving
import joblib

# Estimator utilities (clone builds an unfitted copy of an estimator)
from sklearn.base import clone

# Machine learning models
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

# Data preprocessing
from sklearn.impute import SimpleImputer

# Model evaluation metrics
from sklearn.metrics import accuracy_score, precision_score

# Cross-validation
from sklearn.model_selection import StratifiedKFold

In [11]:
# Read the raw competition tables from the Kaggle input directory
train = pd.read_csv("/kaggle/input/neural-net-nexus-2-0/train.csv")
test = pd.read_csv("/kaggle/input/neural-net-nexus-2-0/test.csv")

# Target: the "Revenue" column cast to integers.
# Features: every remaining training column (`train` itself is left as loaded).
y = train["Revenue"].astype(int)
X = train.drop("Revenue", axis=1)

# Independent copy of the test table, used for the final predictions
X_test = test.copy()

# Report table sizes, then the class balance of the target
for label, frame in (
    ("Training data shape:", train),
    ("Test data shape:", test),
):
    print(label, frame.shape)
print("\nTarget value distribution:")
print(y.value_counts())

# Preview the first rows of the training features
X.head()

Training data shape: (9880, 18)
Test data shape: (2470, 18)

Target value distribution:
Revenue
0    8342
1    1538
Name: count, dtype: int64


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0.0,0.0,0.0,0.0,25.0,686.5,0.016,0.032,0.0,0.0,Dec,2.0,2.0,7.0,1.0,Returning_Visitor,True
1,2.0,179.0,1.0,0.0,2.0,163.0,0.0,0.04,0.0,0.0,May,1.0,1.0,3.0,2.0,New_Visitor,False
2,0.0,0.0,0.0,0.0,12.0,168.0,0.041667,0.063889,0.0,0.0,Nov,2.0,2.0,1.0,1.0,Returning_Visitor,False
3,8.0,144.5,4.0,181.722222,52.0,2826.790048,0.007258,0.030061,0.0,0.0,May,3.0,2.0,3.0,13.0,Returning_Visitor,True
4,0.0,0.0,0.0,0.0,11.0,210.0,0.009091,0.024242,0.0,0.0,May,2.0,2.0,8.0,2.0,Returning_Visitor,False


In [13]:
def engineer_features(
    df,
    peak_months=("Nov", "Dec", "May"),
    high_intent_exit_rate=0.02,
    cols_to_drop=("OperatingSystems", "Browser", "Region", "TrafficType"),
):
    """Return a new DataFrame with derived session features added.

    The input frame is never modified; all new columns are created on a copy.

    Added columns:
        PageValue_Log: ``log1p`` of ``PageValues`` (reduces skew).
        Total_Pages: ``Administrative + Informational + ProductRelated``.
        Product_Duration_Ratio: ``ProductRelated_Duration / (Total_Pages + 1)``;
            the ``+ 1`` keeps the denominator non-zero when ``Total_Pages`` is 0.
        Is_Peak_Month: 1 when ``Month`` is one of ``peak_months``, else 0.
        High_Intent: 1 when ``PageValues > 0`` and
            ``ExitRates < high_intent_exit_rate``, else 0. Missing values compare
            as False, so such rows get 0.

    Parameters:
        df: DataFrame containing the columns ``PageValues``, ``Administrative``,
            ``Informational``, ``ProductRelated``, ``ProductRelated_Duration``,
            ``Month`` and ``ExitRates``.
        peak_months: Month labels flagged by ``Is_Peak_Month``.
        high_intent_exit_rate: Exclusive upper bound on ``ExitRates`` for
            ``High_Intent``.
        cols_to_drop: Columns removed from the result; names that are not
            present are ignored.

    Returns:
        A new DataFrame with the derived columns appended and ``cols_to_drop``
        removed.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    total_pages = df["Administrative"] + df["Informational"] + df["ProductRelated"]

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    engineered = df.assign(
        PageValue_Log=np.log1p(df["PageValues"]),
        Total_Pages=total_pages,
        Product_Duration_Ratio=df["ProductRelated_Duration"] / (total_pages + 1),
        Is_Peak_Month=df["Month"].isin(list(peak_months)).astype(int),
        High_Intent=(
            (df["PageValues"] > 0) & (df["ExitRates"] < high_intent_exit_rate)
        ).astype(int),
    )

    # errors="ignore" skips any listed column that is not in the frame
    return engineered.drop(columns=list(cols_to_drop), errors="ignore")

# Run the same feature-engineering step over the training and test feature tables
X = X.pipe(engineer_features)
X_test = X_test.pipe(engineer_features)

print("Training features shape after engineering:", X.shape)
print("Test features shape after engineering:", X_test.shape)

# Preview the derived columns as a quick sanity check
engineered_columns = [
    "PageValue_Log",
    "Total_Pages",
    "Product_Duration_Ratio",
    "Is_Peak_Month",
    "High_Intent",
]
X[engineered_columns].head()

Training features shape after engineering: (9880, 18)
Test features shape after engineering: (2470, 19)


Unnamed: 0,PageValue_Log,Total_Pages,Product_Duration_Ratio,Is_Peak_Month,High_Intent
0,0.0,25.0,26.403846,1,0
1,0.0,5.0,27.166667,1,0
2,0.0,12.0,12.923077,1,0
3,0.0,64.0,43.489078,1,0
4,0.0,11.0,17.5,1,0


In [15]:
# One-hot encode each table on its own
train_encoded = pd.get_dummies(X)
test_encoded = pd.get_dummies(X_test)

# Left-join on the column axis: the test table is reindexed to the training columns.
# Columns it lacks are created and filled with 0; columns only it has are discarded.
X, X_test = train_encoded.align(test_encoded, join="left", axis=1, fill_value=0)

n_train_cols, n_test_cols = X.shape[1], X_test.shape[1]
print("Training feature matrix shape after encoding:", X.shape)
print("Test feature matrix shape after encoding:", X_test.shape)
print("Number of columns difference:", n_train_cols - n_test_cols)

Training feature matrix shape after encoding: (9880, 30)
Test feature matrix shape after encoding: (2470, 30)
Number of columns difference: 0


In [16]:
# Learn the per-column medians from the training matrix only
imputer = SimpleImputer(strategy="median").fit(X)

# Fill missing values in both matrices with those training medians
X_final = imputer.transform(X)
X_test_final = imputer.transform(X_test)

# Report the resulting shapes, then confirm that no NaN values remain
for message, matrix in (
    ("Shape of training data after imputation:", X_final),
    ("Shape of test data after imputation:", X_test_final),
):
    print(message, matrix.shape)
for message, matrix in (
    ("Missing values in training data:", X_final),
    ("Missing values in test data:", X_test_final),
):
    print(message, np.isnan(matrix).sum())

Shape of training data after imputation: (9880, 30)
Shape of test data after imputation: (2470, 30)
Missing values in training data: 0
Missing values in test data: 0


In [17]:
# Five shuffled, class-stratified folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accumulators filled by the cross-validation loop:
# one accuracy per fold, and one running probability per test row
val_scores = list()
test_probs = np.zeros(X_test_final.shape[0])

print("Number of folds:", skf.get_n_splits())
print("Number of test samples:", len(test_probs))

Number of folds: 5
Number of test samples: 2470


In [18]:
def build_ensemble(seed=42):
    """Return a fresh, unfitted soft-voting ensemble of three boosted-tree models.

    Parameters:
        seed: Random seed passed to each of the three underlying classifiers.

    Returns:
        A VotingClassifier (``voting="soft"``) combining an XGBClassifier, an
        LGBMClassifier and a HistGradientBoostingClassifier.
    """
    xgb_model = XGBClassifier(
        n_estimators=1000,
        max_depth=5,
        learning_rate=0.01,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=seed
    )

    lgb_model = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.01,
        num_leaves=31,
        verbose=-1,
        random_state=seed
    )

    # Seeded like the other two learners so every estimator is configured consistently
    hist_model = HistGradientBoostingClassifier(
        max_iter=500,
        learning_rate=0.01,
        max_depth=10,
        random_state=seed
    )

    return VotingClassifier(
        estimators=[
            ("xgb", xgb_model),
            ("lgb", lgb_model),
            ("hist", hist_model)
        ],
        voting="soft"
    )


# Probability cut-off used to turn validation probabilities into class labels.
# NOTE(review): the submission cell thresholds at 0.44 with ">=", so the accuracy
# reported here is measured at a slightly different operating point; confirm intent.
val_threshold = 0.45

# Reset the accumulators in this cell so that re-running it cannot double-count the
# averaged test probabilities or append duplicate fold scores.
val_scores = []
test_probs = np.zeros(len(X_test_final))

# NOTE(review): X_final was imputed with medians fitted on the full training matrix,
# so each validation fold has contributed to its own imputation values.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_final, y)):

    # Split data into training and validation sets
    X_tr, X_val = X_final[train_idx], X_final[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train a fresh ensemble on this fold's training part
    model = build_ensemble()
    model.fit(X_tr, y_tr)

    # Validation predictions (positive-class probability, then thresholded)
    val_probs = model.predict_proba(X_val)[:, 1]
    val_preds = (val_probs > val_threshold).astype(int)

    # Calculate accuracy
    fold_acc = accuracy_score(y_val, val_preds)
    val_scores.append(fold_acc)

    # Each fold contributes 1/n_splits of its test probability, giving a fold average
    test_probs += model.predict_proba(X_test_final)[:, 1] / skf.n_splits

    print(f"Fold {fold + 1} Accuracy: {fold_acc:.4f}")

print("\nMean Cross-Validation Accuracy:", np.mean(val_scores))



Fold 1 Accuracy: 0.9018




Fold 2 Accuracy: 0.9054




Fold 3 Accuracy: 0.8963




Fold 4 Accuracy: 0.8968




Fold 5 Accuracy: 0.9033

Mean Cross-Validation Accuracy: 0.9007085020242915


In [19]:
# Probability cut-off for labelling a test row as the positive class
final_threshold = 0.44

# A row is predicted 1 when its fold-averaged probability reaches the cut-off, else 0
final_preds = np.greater_equal(test_probs, final_threshold).astype(int)

# Tally how many test rows fall into each predicted class
unique, counts = np.unique(final_preds, return_counts=True)
print("Final prediction distribution:")
{label: n_rows for label, n_rows in zip(unique, counts)}

Final prediction distribution:


{np.int64(0): np.int64(2132), np.int64(1): np.int64(338)}

In [20]:
# Use the test file's own "ID" column when it has one; otherwise fall back to the
# positional row index.
# NOTE(review): after feature engineering the test table has one more column than the
# training features (19 vs 18). If that extra column is the row identifier, it is the
# value that belongs in the submission; confirm against sample_submission.csv.
submission_ids = test["ID"].to_numpy() if "ID" in test.columns else test.index

submission = pd.DataFrame({
    "ID": submission_ids,
    "Revenue": final_preds
})

# Save submission file
submission.to_csv("submission for neural net.csv", index=False)

submission.head()

Unnamed: 0,ID,Revenue
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0


In [21]:
team_name = "CodeKnights"

# `model` is the ensemble left over from the last cross-validation fold, so it was
# fitted on only four of the five folds. Refit an unfitted copy with the same
# hyper-parameters on the full training matrix so that the saved artifact has seen
# every labelled row.
final_model = clone(model).fit(X_final, y)

# Save the model
joblib.dump(final_model, f"{team_name}.joblib")

['CodeKnights.joblib']