In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [2]:
# Load the competition datasets.
# The directory is named once so that a different data location needs a single edit.
DATA_DIR = '/kaggle/input/playground-series-s5e8'

try:
    train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
    sample_submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    print(f"Please make sure train.csv, test.csv, and sample_submission.csv are in {DATA_DIR}.")
    # Re-raise instead of calling exit(): the notebook should stop here with a
    # visible traceback rather than ask the interpreter/kernel to terminate.
    raise

In [3]:
# --- Preprocessing ---

# Function to preprocess the data
def preprocess(df):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. Columns whose dtype is ``object`` are treated as
        categorical; every other column is kept as it is.

    Returns
    -------
    pandas.DataFrame
        ``df`` with each object-dtype column replaced by indicator columns
        named ``<column>_<level>``.

    Notes
    -----
    ``drop_first=True`` removes the first level of each encoded column, and
    that level is determined by the values present in ``df``. Two frames that
    contain different sets of levels will therefore produce different columns.
    """
    # Only object-dtype columns are selected for encoding.
    categorical_features = df.select_dtypes(include=['object']).columns

    # An empty selection makes get_dummies hand the frame back unencoded.
    return pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Store test IDs for submission
test_ids = test_df['id']

# Separate the target from the features. 'id' is a row identifier, not a feature.
# New frames are built instead of overwriting train_df / test_df, so this cell
# can be re-run without a KeyError from dropping 'id' a second time.
y = train_df['y']
X = train_df.drop(columns=['id', 'y'])
test_features = test_df.drop(columns=['id'])

# One-hot encode train and test TOGETHER, then split them apart again.
# Encoding them separately with drop_first=True lets each frame drop a
# different baseline level when their category sets differ; zero-filling the
# "missing" columns afterwards would then silently mis-encode those rows.
# Encoding the stacked frame guarantees identical columns in identical order.
# NOTE(review): assumes train and test share the same raw feature columns;
# a raw column present in only one of them would be NaN-filled by concat.
n_train = len(X)
combined_processed = preprocess(pd.concat([X, test_features], axis=0))
X_processed = combined_processed.iloc[:n_train]
test_processed = combined_processed.iloc[n_train:]

# Sanity checks: both frames expose the same features, and no rows were lost.
assert list(X_processed.columns) == list(test_processed.columns)
assert len(X_processed) == len(y)
assert len(test_processed) == len(test_ids)

In [4]:
# --- Model Training ---

# Random forest hyperparameters, gathered in one place so they are easy to tune.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_leaf': 5,
    'random_state': 42,  # fixed seed so repeated runs build the same forest
    'n_jobs': -1,        # use every available core
}
model = RandomForestClassifier(**rf_params)

print("Training the model...")
model.fit(X_processed, y)
print("Model training complete.")

Training the model...
Model training complete.


In [5]:
# --- Prediction ---

print("Making predictions on the test set...")
# predict_proba returns one probability column per class; column index 1 is kept.
# NOTE(review): this presumes y is binary and that index 1 is the positive class.
class_probabilities = model.predict_proba(test_processed)
predictions = class_probabilities[:, 1]
print("Prediction complete.")

Making predictions on the test set...
Prediction complete.


In [6]:
# --- Submission File Generation ---

# Pair each test id with its predicted probability and write the result as CSV.
submission_columns = {'id': test_ids, 'y': predictions}
submission_df = pd.DataFrame(submission_columns)

output_path = 'submission.csv'
submission_df.to_csv(output_path, index=False)

# Confirm the write and preview the first rows.
preview = submission_df.head()
print("\nSubmission file 'submission.csv' has been created successfully!")
print("First 5 rows of the submission file:")
print(preview)


Submission file 'submission.csv' has been created successfully!
First 5 rows of the submission file:
       id         y
0  750000  0.034769
1  750001  0.128053
2  750002  0.017558
3  750003  0.005773
4  750004  0.089566
