In [None]:
# Core data / plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The alias must be `sns`: every seaborn call in this notebook (sns.set, sns.boxplot)
# uses that name, so any other alias raises NameError on a fresh kernel.
import seaborn as sns

# Preprocessing, splitting and evaluation utilities.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss

# Candidate classifiers (one is picked in the model-selection cell).
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Apply seaborn's default plot styling to all matplotlib figures below.
sns.set()


In [None]:
# Load Dataset
# Column-name settings come first so they sit next to the paths a reader may need to edit.
TARGET_COL = 'Status'   # label column; change if the target name changes
ID_COL = 'id'           # row-identifier column; change if the ID column name changes

# Kaggle-style input paths; update if the data lives elsewhere.
train_df = pd.read_csv('/kaggle/input/mock-test-2-mse-2/train.csv')
test_df = pd.read_csv('/kaggle/input/mock-test-2-mse-2/test.csv')


In [None]:
# Data Cleaning & Missing Value Handling
# Drop the ID column from the features but keep the test IDs for the submission.
# errors='ignore' makes this a no-op when the column is absent (e.g. on a re-run).
train_df = train_df.drop(columns=[ID_COL], errors='ignore')

if ID_COL in test_df.columns:
    test_ids = test_df[ID_COL].copy()
    test_df = test_df.drop(columns=[ID_COL])
elif 'test_ids' not in globals():
    # No ID column and nothing saved by an earlier run of this cell: fall back to
    # positional IDs. The guard stops a re-run (ID column already dropped) from
    # silently replacing the real IDs with 0..n-1.
    test_ids = pd.Series(range(len(test_df)))

# Handle missing values (NO target leakage): statistics are computed on the
# training features only and then reused for the test set.
for col in train_df.columns:
    if col == TARGET_COL:
        continue

    observed = train_df[col].dropna()
    if observed.empty:
        # Column is entirely missing in train: the median would be NaN and
        # mode()[0] would raise, so there is nothing to impute with. Leave it as is.
        continue

    if pd.api.types.is_numeric_dtype(train_df[col]):
        fill_val = observed.median()
    else:
        # Most frequent value; mode() is non-empty because `observed` is non-empty.
        fill_val = observed.mode().iloc[0]

    train_df[col] = train_df[col].fillna(fill_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill_val)



In [None]:
# Data Visualization & Outlier Analysis (EDA)
# Select numeric columns. 'number' matches every integer/float width, whereas an
# explicit ['int64', 'float64'] list silently skips e.g. int32 or float32 columns.
num_cols = train_df.select_dtypes(include='number').columns

if len(num_cols) == 0:
    # DataFrame.hist raises when there is nothing numeric to plot.
    print("No numeric columns to plot.")
else:
    # Histograms: one panel per numeric column.
    train_df[num_cols].hist(figsize=(12, 10), bins=30)
    plt.suptitle("Feature Distributions")
    plt.show()

    # Boxplots for outlier detection, all numeric columns on one shared axis.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=train_df[num_cols], ax=ax)
    ax.tick_params(axis='x', rotation=90)   # column names are long; rotate to avoid overlap
    ax.set_title("Outlier Analysis (Boxplots)")
    plt.show()


In [None]:
# Feature–Target Split
# Labels are the target column; features are every remaining column.
y = train_df[TARGET_COL]
X = train_df.drop(columns=[TARGET_COL])


In [None]:
# Target Encoding
# Encode whenever the target is not numeric. Testing "not numeric" instead of
# dtype == 'object' also catches category and dedicated string dtypes, which would
# otherwise be passed on as raw labels with label_encoder left as None.
if pd.api.types.is_numeric_dtype(y):
    # Numeric targets are used as they are; downstream code checks for None.
    label_encoder = None
    y_encoded = y.values
else:
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)


In [None]:
# One-Hot Encoding
# Train: drop the first level of every categorical column (it becomes the baseline).
X = pd.get_dummies(X, drop_first=True)

# Test: keep every level. Dropping the first level here independently could remove a
# different level than train did (when train's baseline level is absent from test);
# after alignment those rows would then be encoded as the train baseline.
test_df = pd.get_dummies(test_df, drop_first=False)

# Align train & test columns on the train layout: levels train dropped or never saw
# are discarded from test, and train columns missing from test are filled with 0.
X, test_df = X.align(test_df, join='left', axis=1, fill_value=0)


In [None]:
# Feature Scaling
# Fit the scaler on the training features, then apply that same fitted transform to
# both the training features and the test features.
# NOTE(review): the scaler is fit on all of X, including the rows that the next cell
# holds out for validation, so the validation score may be slightly optimistic.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_scaled = scaler.transform(test_df)


In [None]:
# Train–Validation Split
# Hold out 20% of the rows, stratified on the encoded label, with a fixed seed so the
# split is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, **split_options)


In [None]:
# Model Selection (CHOOSE ONE)
# `multi_class` is deliberately not passed: the parameter is deprecated in recent
# scikit-learn releases and scheduled for removal, and with the lbfgs solver the
# default already fits a multinomial model when there are three or more classes.
model = LogisticRegression(
    max_iter=1000,
    solver='lbfgs'
)

# Alternatives: uncomment exactly one line to replace the model above.
# model = RandomForestClassifier()
# model = GaussianNB()
# model = KNeighborsClassifier()
# model = DecisionTreeClassifier()
# model = SVC(probability=True)  # probability=True is required for predict_proba / log loss


In [None]:
# Model Training & Log Loss Evaluation
model.fit(X_train, y_train)

# Predict class probabilities; columns follow the order of model.classes_.
y_prob = model.predict_proba(X_val)

# Log Loss (LOWER is BETTER).
# Passing labels explicitly keeps the probability columns matched to their classes
# even when a rare class has no rows in the validation fold; without it log_loss
# infers the label set from y_val and raises on the column-count mismatch.
ll = log_loss(y_val, y_prob, labels=model.classes_)
print("Validation Log Loss:", ll)


In [None]:
# Train on Full Data & Predict Test
# Refit on every labelled row now that validation is done.
model.fit(X_scaled, y_encoded)

test_prob = model.predict_proba(test_scaled)

# argmax yields a column position, not a label. Map it through model.classes_ so the
# result is correct when the target was left unencoded and is not coded 0..n-1.
# (With a label encoder the classes are 0..n-1, so this is the same as the raw argmax.)
test_pred_encoded = model.classes_[np.argmax(test_prob, axis=1)]

# Decode back to the original label values if the target was encoded.
if label_encoder is not None:
    test_pred = label_encoder.inverse_transform(test_pred_encoded)
else:
    test_pred = test_pred_encoded


In [None]:
# Predict probabilities on test set (one column per class, ordered as model.classes_).
test_prob = model.predict_proba(test_scaled)

# Recover the original class labels for the column headers. label_encoder is None
# when the target was numeric; in that case model.classes_ already holds the labels.
if label_encoder is not None:
    class_labels = label_encoder.inverse_transform(model.classes_)
else:
    class_labels = model.classes_

# Create submission dataframe; header names derive from the configured column names
# so they stay in step with TARGET_COL / ID_COL.
submission = pd.DataFrame(
    test_prob,
    columns=[f"{TARGET_COL}_{cls}" for cls in class_labels]
)

# Insert IDs by position (plain array) so a non-default index on test_ids cannot
# misalign them against the submission's fresh 0..n-1 index.
submission.insert(0, ID_COL, np.asarray(test_ids))

# Save submission
submission.to_csv("submission.csv", index=False)

submission.head()
