In [2]:
import warnings

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

warnings.filterwarnings('ignore')

In [3]:
# Locations of the competition CSV files. Both are absolute paths under
# /kaggle/input; edit these two constants when running somewhere else.
TRAIN_CSV = '/kaggle/input/neural-net-nexus/train.csv'
TEST_CSV = '/kaggle/input/neural-net-nexus/test.csv'

# Raw frames; later cells rebind and modify `train_df` / `test_df`.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [4]:
# First look at the raw training data.
#
# Jupyter renders only the value of a cell's *final* expression, so a bare
# `train_df.head()` in the middle of the cell is evaluated and thrown away.
# Passing it to display() makes the preview actually appear.
display(train_df.head())

# info() prints its own report (dtypes and non-null counts per column).
train_df.info()

# Distribution of the target labels; as the last expression it is rendered
# automatically.
train_df['Class Identification'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      526 non-null    float64
 1   HB (gm/dl)               529 non-null    float64
 2   ESR(mm)                  529 non-null    float64
 3   WBC(TC) (/cumm)          528 non-null    float64
 4   Neutrophils (%)          529 non-null    int64  
 5   lymphocytes (%)          529 non-null    int64  
 6   Monocytes (%)            527 non-null    float64
 7   Eosinophils (%)          527 non-null    float64
 8   Cir Eosinophils (/cumm)  526 non-null    float64
 9   RBC (m/ul)               527 non-null    float64
 10  HTC/PCV (%)              529 non-null    float64
 11  MCV (fl)                 526 non-null    float64
 12  MCH (pg)                 528 non-null    float64
 13  MCHC (g/dl)              529 non-null    float64
 14  RDW  (%)                 5

Class Identification
Negative    362
Positive    138
Normal       16
Abnormal     13
Name: count, dtype: int64

In [5]:
# Collect every training column whose header contains the text 'Unnamed'
# (presumably header-less columns picked up while reading the CSV -- confirm).
unnamed_cols = [header for header in train_df.columns if 'Unnamed' in header]

# Remove them from both frames. The list is derived from the training frame
# only, so errors='ignore' lets the test frame lack some of these columns.
train_df = train_df.drop(unnamed_cols, axis='columns')
test_df = test_df.drop(unnamed_cols, axis='columns', errors='ignore')

In [7]:
# Numeric feature columns that will be imputed. The target column is filtered
# out defensively so it can never be treated as a feature, whatever its dtype
# happens to be when this cell runs.
numerical_cols = [
    column
    for column in train_df.select_dtypes(include=np.number).columns
    if column != 'Class Identification'
]

# Median imputation: the medians are learned from the training frame and the
# same values are then used to fill the gaps in the test frame.
# NOTE(review): the imputer is fitted before the train/validation split made
# further down, so validation rows contribute to the medians -- consider moving
# imputation into a Pipeline that is fitted inside cross-validation.
imputer = SimpleImputer(strategy='median')
imputer.fit(train_df[numerical_cols])
train_df[numerical_cols] = imputer.transform(train_df[numerical_cols])
test_df[numerical_cols] = imputer.transform(test_df[numerical_cols])

In [8]:
# Encode the target as 0/1.
#
# Series.map() replaces every value that is missing from the mapping with NaN.
# Two safeguards follow from that:
#   * labels other than 'Negative'/'Positive' are reported here instead of
#     vanishing silently (those rows end up with a NaN target);
#   * once the column is numeric the mapping is skipped, so re-running this
#     cell does not turn the already encoded 0/1 values into NaN.
target_mapping = {'Negative': 0, 'Positive': 1}
target_raw = train_df['Class Identification']

if not pd.api.types.is_numeric_dtype(target_raw):
    is_unmapped = target_raw.notna() & ~target_raw.isin(list(target_mapping))
    if is_unmapped.any():
        # NOTE(review): if some of these are alternate spellings of the two
        # classes, add them to `target_mapping` -- confirm with the data owner.
        print("Labels without a 0/1 encoding (set to NaN):",
              target_raw[is_unmapped].value_counts().to_dict())
    train_df['Class Identification'] = target_raw.map(target_mapping)

In [9]:
def _add_ratio_features(df, eps=1e-6):
    """Return a copy of ``df`` with three ratio columns appended.

    Added columns, in this order:

    * ``Neutrophil_Lymphocyte_Ratio`` = 'Neutrophils (%)' / 'lymphocytes (%)'
    * ``WBC_RBC_Ratio`` = 'WBC(TC) (/cumm)' / 'RBC (m/ul)'
    * ``Platelet_Lymphocytes_Ratio`` = 'Platelete(PC)(/cumm)' / 'lymphocytes (%)'

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five source columns named above.
    eps : float, default 1e-6
        Added to every denominator so that a zero denominator does not cause
        a division by zero.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If one of the source columns is missing from ``df``.
    """
    return df.assign(
        Neutrophil_Lymphocyte_Ratio=df['Neutrophils (%)'] / (df['lymphocytes (%)'] + eps),
        WBC_RBC_Ratio=df['WBC(TC) (/cumm)'] / (df['RBC (m/ul)'] + eps),
        Platelet_Lymphocytes_Ratio=df['Platelete(PC)(/cumm)'] / (df['lymphocytes (%)'] + eps),
    )


# Apply the same definition to both frames so train and test cannot drift
# apart through copy-paste edits.
train_df = _add_ratio_features(train_df)
test_df = _add_ratio_features(test_df)

# Full feature list used for scaling and modelling: the imputed numeric
# columns followed by the three engineered ratios.
engineered_numerical_cols = numerical_cols + [
    'Neutrophil_Lymphocyte_Ratio',
    'WBC_RBC_Ratio',
    'Platelet_Lymphocytes_Ratio'
]

In [10]:
# Standardise every feature column. The scaler learns its statistics from the
# training frame only and is then applied unchanged to both frames.
# NOTE(review): fitting happens before the train/validation split made further
# down, so validation rows influence the scaling statistics -- consider a
# Pipeline fitted inside cross-validation instead.
scaler = StandardScaler()
scaler.fit(train_df[engineered_numerical_cols])

for frame in (train_df, test_df):
    frame[engineered_numerical_cols] = scaler.transform(frame[engineered_numerical_cols])

In [11]:
# Step 6: Prepare Data for Modeling

# Rows whose target is NaN cannot be used for a stratified split, so count
# them first and drop them if there are any.
nan_in_target = train_df['Class Identification'].isna().sum()
print(f"Number of NaN values in 'Class Identification' BEFORE split: {nan_in_target}")

if nan_in_target == 0:
    print("No NaN values found in 'Class Identification' before split.")
else:
    print("NaN values found in 'Class Identification'. Removing rows with NaN target values...")
    train_df = train_df.dropna(subset=['Class Identification'])
    print(f"Shape of train_df after removing NaN target rows: {train_df.shape}")
    # Confirm the removal actually left no NaN targets behind.
    nan_in_target_after_dropna = train_df['Class Identification'].isna().sum()
    print(f"Number of NaN values in 'Class Identification' AFTER removal: {nan_in_target_after_dropna}")

# Feature matrix and target vector.
X = train_df[engineered_numerical_cols]
y = train_df['Class Identification']

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)

Number of NaN values in 'Class Identification' BEFORE split: 29
NaN values found in 'Class Identification'. Removing rows with NaN target values...
Shape of train_df after removing NaN target rows: (500, 23)
Number of NaN values in 'Class Identification' AFTER removal: 0
Shape of X_train: (400, 22)
Shape of X_val: (100, 22)
Shape of y_train: (400,)
Shape of y_val: (100,)


In [12]:
# Candidate classifiers, keyed by display name. Every estimator is seeded with
# random_state=42. Iteration order of this dict is the order in which the
# models are tuned and reported.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases --
# confirm the installed version still expects it.
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
    ),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
}

# Hyper-parameter grids, looked up by the same display names as `models`.
# The three boosting models share one grid definition; the comprehension gives
# each of them its own independent dict.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10],
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    **{
        booster: {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5],
        }
        for booster in ('Gradient Boosting', 'XGBoost', 'LightGBM')
    },
}

In [13]:
# One shared, seeded 5-fold stratified splitter so every model is tuned on the
# same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Display name -> best estimator found by the grid search for that model.
best_models = {}

for name, model in models.items():
    print(f"Training and tuning {name}...")

    # A model without an entry in `param_grids` falls back to an empty grid.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids.get(name, {}),
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    best_models[name] = grid_search.best_estimator_
    print(f"{name} Best Parameters: {grid_search.best_params_}")
    # best_score_ is the mean cross-validated ROC AUC on X_train, not a score
    # on the separate X_val hold-out set.
    print(f"{name} Best AUC ROC (Validation): {grid_search.best_score_:.4f}\n")

Training and tuning Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Logistic Regression Best Parameters: {'C': 10}
Logistic Regression Best AUC ROC (Validation): 0.8801

Training and tuning Random Forest...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Random Forest Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Random Forest Best AUC ROC (Validation): 0.9508

Training and tuning Gradient Boosting...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Gradient Boosting Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Gradient Boosting Best AUC ROC (Validation): 0.9459

Training and tuning XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
XGBoost Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
XGBoost Best AUC ROC (Validation): 0.9470

Training and tuning LightGBM...
Fitting 5 folds for each of

In [14]:
# Score each tuned model on the hold-out split that was kept out of the grid
# search.
print("Validation Performance of Best Models:")
for name, model in best_models.items():
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    y_pred_proba_val = model.predict_proba(X_val)[:, 1]
    auc_roc_val = roc_auc_score(y_true=y_val, y_score=y_pred_proba_val)
    print(f"{name} AUC ROC (Validation): {auc_roc_val:.4f}")

Validation Performance of Best Models:
Logistic Regression AUC ROC (Validation): 0.8442
Random Forest AUC ROC (Validation): 0.9301
Gradient Boosting AUC ROC (Validation): 0.9152
XGBoost AUC ROC (Validation): 0.9420
LightGBM AUC ROC (Validation): 0.9345


In [15]:
# Model used for the submission; the name is chosen by hand.
# NOTE(review): in the outputs recorded with this notebook Random Forest has
# the highest cross-validation AUC, while XGBoost scores higher on the
# hold-out split -- confirm which criterion should decide.
best_model_name = 'Random Forest'
best_model = best_models[best_model_name]

# Probability of the class encoded as 1 for every test row.
X_test = test_df[engineered_numerical_cols]
y_pred_proba_test = best_model.predict_proba(X_test)[:, 1]

# Turn probabilities into hard 0/1 labels: at or above the threshold -> 1.
threshold = 0.5
y_pred_binary_test = (y_pred_proba_test >= threshold).astype(int)

# IDs are generated as 1..N in test-row order.
# NOTE(review): assumes the expected IDs are sequential from 1 -- verify
# against the competition's sample submission.
submission_df = pd.DataFrame(
    {
        'ID': range(1, len(test_df) + 1),
        'Prediction': y_pred_binary_test,
    }
)
submission_df.to_csv('submission_eng3_rndm.csv', index=False)

print("Submission file created")

Submission file created


In [16]:
# Persist the model that produced the submission.
#
# `best_model_name` and `best_model` are reused from the submission cell
# instead of being hard-coded a second time, so the saved artefact cannot
# drift away from the model that generated the submission file.
# `joblib` is imported together with the other dependencies in the import
# cell at the top of the notebook.

# Save the best model in .joblib format, named after the team
model_filename = 'Googol.joblib'
joblib.dump(best_model, model_filename)

print(f"Best model '{best_model_name}' saved as '{model_filename}' in .joblib format.")

Best model 'Random Forest' saved as 'Googol.joblib' in .joblib format.
