# ML Programming Challenge
Author: Nicolò Gandini

Mail: ngandini@kth.se

In [26]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the input directory,
# then echo the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml-challenge-datasets/EvaluateOnMe.csv
/kaggle/input/ml-challenge-datasets/TrainOnMe_orig.csv


# Imports and boring stuff

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Dataset locations: both CSV files live under the same input directory.
base_path = '/kaggle/input/ml-challenge-datasets/'
output_path = '/kaggle/working/'
dataset_tr = f'{base_path}TrainOnMe_orig.csv'
dataset_ev = f'{base_path}EvaluateOnMe.csv'

print(f'Train data: {dataset_tr}\nEvaluation data: {dataset_ev}')

Train data: /kaggle/input/ml-challenge-datasets/TrainOnMe_orig.csv
Evaluation data: /kaggle/input/ml-challenge-datasets/EvaluateOnMe.csv


# Load the data

In [28]:
# Read both CSV files into DataFrames; the two reads are independent of
# each other. Only the training file is read with an explicit encoding.
print("Loading datasets...")
eval_df = pd.read_csv(dataset_ev)
train_df = pd.read_csv(dataset_tr, encoding="utf-8")
print("Dataset loaded, Lesssgoooooooo!")

Loading datasets...
Dataset loaded, Lesssgoooooooo!


# Data inspection

In [29]:
label_col = "y"

# Basic label cleaning: keep the exact characters but remove stray whitespace.
# Missing labels are deliberately left as NaN. A blanket `.astype(str)` would
# turn them into the literal string "nan", which makes the missing-label check
# below always report 0 and lets "nan" show up as a label of its own.
raw_labels = train_df[label_col]
train_df[label_col] = raw_labels.where(
    raw_labels.isna(),                      # keep NaN cells untouched
    raw_labels.astype(str).str.strip(),     # strip whitespace everywhere else
)

# Count each label once and reuse the result for the summaries below.
# (value_counts ignores NaN by default.)
label_counts = train_df[label_col].value_counts()

# quick checks on labels
print("QUICK DATASET INSPECTION (just for fun)\n")
print(f"Training set shape: {train_df.shape}")
print(f"Evaluation set shape: {eval_df.shape}")
print(f"\nLabel distribution:\n{label_counts}")

print("Number of unique labels:", train_df[label_col].nunique())
print("Missing label cells (NaN or blank):", train_df[label_col].isna().sum(), train_df[label_col].eq("").sum())
print("\nLabel relative frequencies (proportions):")
print(train_df[label_col].value_counts(normalize=True))
print("\nMinimum examples in any class:", label_counts.min())

QUICK DATASET INSPECTION (just for fun)

Training set shape: (1000, 14)
Evaluation set shape: (10000, 13)

Label distribution:
y
Andjorg     409
Andsuto     334
Jorgsuto    257
Name: count, dtype: int64
Number of unique labels: 3
Missing label cells (NaN or blank): 0 0

Label relative frequencies (proportions):
y
Andjorg     0.409
Andsuto     0.334
Jorgsuto    0.257
Name: proportion, dtype: float64

Minimum examples in any class: 257


# Data pre-processing

In [30]:
# Store exact labels (used later to validate the written predictions)
original_labels = train_df[label_col].unique()
print(f"Original labels: {original_labels}")

# Check for constant columns: a feature with a single distinct value in the
# training data cannot help separate the classes, so it is dropped.
constant_cols = []
for col in train_df.columns:
    if col != label_col and train_df[col].nunique() == 1:
        constant_cols.append(col)
        print(f"  Dropping constant column '{col}' (value: {train_df[col].iloc[0]})")

if not constant_cols:
    print("  No constant columns found")

# Separate features and target; the evaluation frame gets the same columns removed
X = train_df.drop(columns=[label_col] + constant_cols)
y = train_df[label_col]
X_eval = eval_df.drop(columns=constant_cols) if constant_cols else eval_df.copy()

print(f"\nFeatures after cleaning: {X.shape[1]}")
print(f"Feature names: {list(X.columns)}")

# Encode labels as integers (classes_ holds the sorted original labels)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Label encoding:")
for idx, label in enumerate(label_encoder.classes_):
    print(f"  {label} -> {idx}")

# Identify feature types.
# 'number' selects every numeric dtype rather than only int64/float64, so a
# column stored as e.g. int32 or float32 is not silently left out of the
# ColumnTransformer below (columns not listed in a transformer are dropped).
categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numeric_features = X.select_dtypes(include='number').columns.tolist()

print(f"\nCategorical features ({len(categorical_features)}): {categorical_features}")
print(f"Numeric features ({len(numeric_features)}): {len(numeric_features)} features")

# Create preprocessor: scale numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' keeps transform() from raising when the evaluation
# data (or a validation fold) contains a category that was not seen during
# fit; such a value is encoded as all zeros instead.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_features)
    ])

Original labels: ['Jorgsuto' 'Andjorg' 'Andsuto']
  Dropping constant column 'x12' (value: True)

Features after cleaning: 12
Feature names: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x13']
Label encoding:
  Andjorg -> 0
  Andsuto -> 1
  Jorgsuto -> 2

Categorical features (1): ['x7']
Numeric features (11): 11 features


# Baseline Random Forest Evaluation
Note: I tried several models (decision trees and SVMs among them), but I only report here the best one I found, which is the Random Forest.

In [31]:
# Baseline: the shared preprocessor followed by a 200-tree random forest
# whose remaining hyperparameters are left at their defaults.
baseline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
])

# Stratified, shuffled k-fold split with a fixed seed
k_splits = 5
cv = StratifiedKFold(n_splits=k_splits, shuffle=True, random_state=42)
baseline_scores = cross_val_score(
    estimator=baseline_rf,
    X=X,
    y=y_encoded,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

# Summarise the per-fold accuracies
baseline_summary = [
    "Baseline Random Forest (n_estimators=200, default params):",
    f"  CV scores: {baseline_scores}",
    f"  Mean accuracy: {baseline_scores.mean():.4f} (+/- {baseline_scores.std():.4f})",
    f"  Range: [{baseline_scores.min():.4f}, {baseline_scores.max():.4f}]",
]
print("\n".join(baseline_summary))

Baseline Random Forest (n_estimators=200, default params):
  CV scores: [0.835 0.845 0.85  0.845 0.82 ]
  Mean accuracy: 0.8390 (+/- 0.0107)
  Range: [0.8200, 0.8500]


# Hyperparameter tuning
Let's see if a tuned Random Forest does a better job than the baseline.

In [32]:
# Pipeline to tune: same preprocessing, forest hyperparameters left open
tuning_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Candidate values for each hyperparameter; the 'classifier__' prefix routes
# every entry to the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [200, 300, 400, 500],
    'classifier__max_depth': [10, 15, 20, 25, 30, None],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 6],
    'classifier__max_features': ['sqrt', 'log2', 0.5, 0.7],
    'classifier__bootstrap': [True],
    'classifier__class_weight': [None, 'balanced']
}

# Echo the search space (bootstrap has a single value and is not listed)
print("\n".join([
    "Parameter space:",
    f"  n_estimators: {param_grid['classifier__n_estimators']}",
    f"  max_depth: {param_grid['classifier__max_depth']}",
    f"  min_samples_split: {param_grid['classifier__min_samples_split']}",
    f"  min_samples_leaf: {param_grid['classifier__min_samples_leaf']}",
    f"  max_features: {param_grid['classifier__max_features']}",
    f"  class_weight: {param_grid['classifier__class_weight']}",
]))

# Randomized search: sample n_iteration candidates and score each with `cv`
n_iteration = 30
print(f"\nStarting RandomizedSearchCV ({n_iteration} iterations, {k_splits}-fold CV)...")

random_search = RandomizedSearchCV(
    estimator=tuning_rf,
    param_distributions=param_grid,
    n_iter=n_iteration,
    cv=cv,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X, y_encoded)

print("Tuning complete!, Lessssgooooooooo")
print("\nBest parameters found:")
for param, value in random_search.best_params_.items():
    print(f"  {param.replace('classifier__', '')}: {value}")

# Difference between the best tuned CV accuracy and the baseline mean accuracy
accuracy_gain = random_search.best_score_ - baseline_scores.mean()
print(f"\nBest CV accuracy: {random_search.best_score_:.4f}")
print(f"Improvement over baseline: {accuracy_gain:+.4f} ({accuracy_gain*100:+.2f}%)")


Parameter space:
  n_estimators: [200, 300, 400, 500]
  max_depth: [10, 15, 20, 25, 30, None]
  min_samples_split: [2, 5, 10, 15]
  min_samples_leaf: [1, 2, 4, 6]
  max_features: ['sqrt', 'log2', 0.5, 0.7]
  class_weight: [None, 'balanced']

Starting RandomizedSearchCV (30 iterations, 5-fold CV)...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Tuning complete!, Lessssgooooooooo

Best parameters found:
  n_estimators: 400
  min_samples_split: 2
  min_samples_leaf: 2
  max_features: 0.7
  max_depth: 15
  class_weight: None
  bootstrap: True

Best CV accuracy: 0.8530
Improvement over baseline: +0.0140 (+1.40%)


# Error analysis

In [33]:
# Pipeline with the best hyperparameters found by the search
best_rf = random_search.best_estimator_

# Out-of-fold predictions: each sample is predicted by a model that did not
# see it during fitting.
print("Generating cross-validated predictions for error analysis...")
y_pred_cv = cross_val_predict(best_rf, X, y_encoded, cv=cv, n_jobs=-1)

# Confusion matrix labelled with the original class names
class_names = label_encoder.classes_
cm = confusion_matrix(y_encoded, y_pred_cv)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

print("\nConfusion Matrix:")
print(cm_df)
print("\n(Rows=True labels, Columns=Predicted labels)")

# Precision / recall / F1 for every class
print("\nPer-Class Performance:")
report = classification_report(
    y_encoded,
    y_pred_cv,
    target_names=class_names,
    digits=4,
)
print(report)

# Per-class accuracy read off the confusion matrix: the diagonal holds the
# correctly predicted samples and each row sum is the number of true samples.
correct_per_class = np.diag(cm)
samples_per_class = cm.sum(axis=1)

print("Per-Class Accuracy:")
for label, class_acc, class_samples in zip(
    class_names, correct_per_class / samples_per_class, samples_per_class
):
    print(f"  {label}: {class_acc:.4f} ({class_samples} samples)")

Generating cross-validated predictions for error analysis...

Confusion Matrix:
          Andjorg  Andsuto  Jorgsuto
Andjorg       383        3        23
Andsuto        11      307        16
Jorgsuto       60       34       163

(Rows=True labels, Columns=Predicted labels)

Per-Class Performance:
              precision    recall  f1-score   support

     Andjorg     0.8436    0.9364    0.8876       409
     Andsuto     0.8924    0.9192    0.9056       334
    Jorgsuto     0.8069    0.6342    0.7102       257

    accuracy                         0.8530      1000
   macro avg     0.8477    0.8299    0.8345      1000
weighted avg     0.8505    0.8530    0.8480      1000

Per-Class Accuracy:
  Andjorg: 0.9364 (409 samples)
  Andsuto: 0.9192 (334 samples)
  Jorgsuto: 0.6342 (257 samples)


# Train the final model on all the data

In [34]:
# Fit the selected pipeline on the complete training set
print("Training final model with best parameters on all training data...")
best_rf.fit(X, y_encoded)
print("Training complete, lesssgoooo again")

# Report the ten largest feature importances, if the classifier exposes them
final_classifier = best_rf.named_steps['classifier']
if hasattr(final_classifier, 'feature_importances_'):
    # Rebuild the post-preprocessing column names: numeric columns first,
    # then the one-hot encoded names (same order as the ColumnTransformer).
    feature_names = list(numeric_features)
    if categorical_features:
        cat_encoder = best_rf.named_steps['preprocessor'].named_transformers_['cat']
        if hasattr(cat_encoder, 'get_feature_names_out'):
            feature_names += list(cat_encoder.get_feature_names_out(categorical_features))

    importances = final_classifier.feature_importances_

    # Feature positions ordered from most to least important
    indices = np.argsort(importances)[::-1]

    print("\nTop 10 Most Important Features:")
    for rank, idx in enumerate(indices[:10], start=1):
        # Fall back to a positional name if the name list is too short
        if idx < len(feature_names):
            feat_name = feature_names[idx]
        else:
            feat_name = f"Feature_{idx}"
        print(f"  {rank}. {feat_name}: {importances[idx]:.4f}")

Training final model with best parameters on all training data...
Training complete, lesssgoooo again

Top 10 Most Important Features:
  1. x4: 0.4337
  2. x11: 0.2367
  3. x10: 0.1197
  4. x9: 0.0553
  5. x8: 0.0429
  6. x6: 0.0255
  7. x3: 0.0222
  8. x2: 0.0208
  9. x13: 0.0136
  10. x5: 0.0116


# Get predictions

In [35]:
def _print_label_shares(labels, counts, total):
    """Print one ``label: count (percent%)`` line for each label/count pair."""
    for label, count in zip(labels, counts):
        percentage = count / total * 100
        print(f"  {label}: {count} ({percentage:.1f}%)")


# Predict on the evaluation data, then map the integer codes back to the
# original label strings.
print("Generating predictions...")
y_eval_encoded = best_rf.predict(X_eval)
y_eval_pred = label_encoder.inverse_transform(y_eval_encoded)

print(f"Generated {len(y_eval_pred)} predictions")

# Share of each predicted label
print("\nPrediction distribution:")
unique, counts = np.unique(y_eval_pred, return_counts=True)
_print_label_shares(unique, counts, len(y_eval_pred))

# Share of each label in the training data, for a side-by-side comparison
print("\nTraining distribution (just for comparison):")
train_unique, train_counts = np.unique(y, return_counts=True)
_print_label_shares(train_unique, train_counts, len(y))

Generating predictions...
Generated 10000 predictions

Prediction distribution:
  Andjorg: 4361 (43.6%)
  Andsuto: 3347 (33.5%)
  Jorgsuto: 2292 (22.9%)

Training distribution (just for comparison):
  Andjorg: 409 (40.9%)
  Andsuto: 334 (33.4%)
  Jorgsuto: 257 (25.7%)


# Output

In [36]:
# Keep the predictions in a DataFrame as well
submission_df = pd.DataFrame({'y': y_eval_pred})
output_file = 'predictions.txt'

# Plain text output: one label per line, no header row and no index column
with open(output_file, 'w') as f:
    f.writelines(f"{label}\n" for label in y_eval_pred)

print("Predictions saved")

Predictions saved


## Double check to verify the format of the output

In [37]:
# Verify the output format before submitting.
# The file is read once inside a `with` block so the handle is always closed,
# and every written line (not just the first five) is checked against the
# labels seen in the training data.
print("\nVerifying output format:")
with open(output_file, 'r') as f:
    written_lines = [line.rstrip("\n") for line in f]

first_lines = written_lines[:5]
print("First 5 lines of output:")
for i, line in enumerate(first_lines, 1):
    print(f"  Line {i}: {line}")

# Check format
valid_labels = set(original_labels)
# A header row would not be one of the known labels; an empty file fails too.
no_header_check = bool(written_lines) and written_lines[0] in valid_labels
labels_match_check = bool(written_lines) and all(line in valid_labels for line in written_lines)
line_count = len(written_lines)
correct_length = line_count == len(y_eval_pred)

print("\nFormat verification:")
print(f"  ✓ No header: {'✓ PASS' if no_header_check else '✗ FAIL - CRITICAL ERROR'}")
print(f"  ✓ Labels match exactly: {'✓ PASS' if labels_match_check else '✗ FAIL - CRITICAL ERROR'}")
print(f"  ✓ Correct number of predictions ({len(y_eval_pred)}): {'✓ PASS' if correct_length else '✗ FAIL'}")

if not (no_header_check and labels_match_check and correct_length):
    print("\n⚠️  WARNING: Format check failed! Review the output before submission.")
else:
    print("\n✓ All format checks passed!")


Verifying output format:
First 5 lines of output:
  Line 1: Andsuto
  Line 2: Andjorg
  Line 3: Jorgsuto
  Line 4: Andsuto
  Line 5: Andsuto

Format verification:
  ✓ No header: ✓ PASS
  ✓ Labels match exactly: ✓ PASS
  ✓ Correct number of predictions (10000): ✓ PASS

✓ All format checks passed!
