In [9]:
import pandas as pd
import numpy as np

# ---------------------------------------------
# 0. User settings: adjust these
# ---------------------------------------------
# Locations of the Kaggle CSV files. Raw strings keep the Windows
# backslashes from being interpreted as escape sequences.
train_path = r"D:\ninic\train.csv"
test_path = r"D:\ninic\test.csv"
# Row-identifier column and the label column to predict
id_col, target_col = 'sl_no', 'status'

# ---------------------------------------------
# 1. Load datasets
# ---------------------------------------------
print("Loading datasets...")
# Train is read first, then test; both become plain DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
# Echo the column names so a wrong id/target setting is easy to spot.
print(f"Train columns: {train_df.columns.tolist()}")
print(f"Test columns: {test_df.columns.tolist()}")

# ---------------------------------------------
# 2. Quick sanity checks
# ---------------------------------------------
# Validate the configured column names BEFORE they are used. Previously the
# target check ran after the target column had already been indexed, so a
# wrong name failed with a bare KeyError and this message was never shown.
if target_col not in train_df.columns:
    raise KeyError(f"Target column '{target_col}' missing. Available: {train_df.columns.tolist()}")
# The ID column is dropped from the features and read from the test frame
# for the submission, so it has to exist in both frames.
for frame_name, frame in (("train", train_df), ("test", test_df)):
    if id_col not in frame.columns:
        raise KeyError(f"ID column '{id_col}' missing from {frame_name} data. Available: {frame.columns.tolist()}")

# ---------------------------------------------
# 3. Encode target variable
# ---------------------------------------------
from sklearn.preprocessing import LabelEncoder
# le_target is kept so the integer codes can be mapped back to the original
# labels later with le_target.inverse_transform(...).
le_target = LabelEncoder()
# Codes follow the sorted order of the raw labels,
# e.g. 'Not Placed' -> 0, 'Placed' -> 1 if those are the raw values.
train_df[target_col] = le_target.fit_transform(train_df[target_col])

# ---------------------------------------------
# 4. EDA (basic)
# ---------------------------------------------
# For each frame: the first five rows, then the per-column count of missing
# values. Train is reported first, test second.
train_preview, train_missing = train_df.head(), train_df.isnull().sum()
print("\n--- Train head ---")
print(train_preview)
print("Missing in train:\n", train_missing)

test_preview, test_missing = test_df.head(), test_df.isnull().sum()
print("\n--- Test head ---")
print(test_preview)
print("Missing in test:\n", test_missing)

# ---------------------------------------------
# 5. Combine for consistent encoding
# ---------------------------------------------
# Keep the test identifiers aside; they become the ID column of the
# submission file.
test_ids = test_df[id_col]
# Stack the train features (label removed) on top of the test rows so that
# imputation and encoding see one frame. Original row labels are kept, so
# train and test labels overlap; later code separates them by position.
# NOTE(review): the recorded output lists only 'sl_no', 'gender' and 'salary'
# for the test file, so every other column is NaN for the test rows after
# this concat and will be imputed — confirm this is the intended test file.
train_features = train_df.drop(columns=[target_col])
all_df = pd.concat([train_features, test_df], axis=0)

# ---------------------------------------------
# 6. Fill missing
# ---------------------------------------------
def fill_na(df):
    """Impute missing values in ``df`` in place, one column at a time.

    Columns with a NumPy integer or float dtype (any width) are filled with
    their median; every other column is filled with its most frequent value.
    Columns that have no missing values, or no observed value to impute
    from, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Its columns are reassigned on this same object.

    Returns
    -------
    None
    """
    for c in df.columns:
        col = df[c]
        if not col.isna().any():
            continue  # nothing to fill

        # Match every NumPy int/uint/float width, not only float64/int64.
        # Extension dtypes keep going through the mode branch as before.
        if isinstance(col.dtype, np.dtype) and col.dtype.kind in "iuf":
            fill_value = col.median()
        else:
            modes = col.mode()
            if modes.empty:
                # Column is entirely missing: there is no value to impute.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back to the frame. Calling fillna(inplace=True)
        # on df[c] acts on a temporary object, which pandas warns about and
        # which leaves df untouched when Copy-on-Write is enabled.
        df[c] = col.fillna(fill_value)
fill_na(all_df)

# ---------------------------------------------
# 7. Encode features
# ---------------------------------------------
from sklearn.preprocessing import LabelEncoder

# Every object/category column is treated as categorical.
cat_cols = all_df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {cat_cols}")
fe_le = LabelEncoder()

# Columns with more than two distinct values are one-hot encoded, dropping
# the first level of each.
onehot = []
for c in cat_cols:
    if all_df[c].nunique() > 2:
        onehot.append(c)
all_df = pd.get_dummies(all_df, columns=onehot, drop_first=True)

# The remaining categorical columns (at most two distinct values) become
# integer codes. The encoder is refit per column, so only the mapping of the
# last encoded column remains stored on fe_le.
for c in cat_cols:
    if c in onehot:
        continue
    all_df[c] = fe_le.fit_transform(all_df[c])

# Split back
# The first len(train_df) rows are the train rows, the rest are test rows;
# the ID column is removed from both feature matrices.
glen = len(train_df)
train_part, test_part = all_df.iloc[:glen], all_df.iloc[glen:]
X = train_part.drop(columns=[id_col])
y = train_df[target_col]
X_test = test_part.drop(columns=[id_col])

# ---------------------------------------------
# 8. Train/validation split
# ---------------------------------------------
from sklearn.model_selection import train_test_split
# 70% of the labelled rows for fitting, 30% held out for validation; the
# fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# ---------------------------------------------
# 9. Scale for select models
# ---------------------------------------------
from sklearn.preprocessing import StandardScaler
# Scaling statistics come from the training part only; the same fitted
# scaler is then applied to the train, validation and test matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc, X_val_sc, X_test_sc = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# ---------------------------------------------
# 10. Define and tune models
# ---------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Seed for every estimator that draws random numbers (tree, forest, and the
# SVC probability calibration). Without it, the tuned parameters and the
# validation metrics change from run to run.
RANDOM_STATE = 42

# Candidate models keyed by display name. The 'Forest' and 'SVM' entries are
# placeholders that are replaced by the tuned estimators below; defining
# them here fixes their position in the dict's iteration order.
models = {
    'Logistic': LogisticRegression(max_iter=200),
    'Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# RandomForest tuning
# Searched on the unscaled training features with 3-fold cross-validation.
print("\nTuning RandomForest...")
rf_params = {'n_estimators': [50, 100], 'max_depth': [None, 5]}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=RANDOM_STATE), rf_params, cv=3)
grid_rf.fit(X_train, y_train)
models['Forest'] = grid_rf.best_estimator_
print("RF best params:", grid_rf.best_params_)

# SVM tuning on scaled data
# probability=True is needed because the evaluation step calls predict_proba.
print("Tuning SVM...")
svm_params = {'C': [0.1, 1], 'kernel': ['linear', 'rbf']}
grid_svm = GridSearchCV(SVC(probability=True, random_state=RANDOM_STATE), svm_params, cv=3)
grid_svm.fit(X_train_sc, y_train)
models['SVM'] = grid_svm.best_estimator_
print("SVM best params:", grid_svm.best_params_)

# ---------------------------------------------
# 11. Evaluation
# ---------------------------------------------
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

def evaluate(name, model, scaled=False):
    """Fit ``model`` on the training split and score it on the validation split.

    Parameters
    ----------
    name : str
        Display name of the model. Accepted for the caller's convenience;
        it is not used inside the function.
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``. It is
        refit here, so the caller's object is modified.
    scaled : bool, default False
        When true, the standardized matrices (``X_train_sc`` / ``X_val_sc``)
        are used instead of the raw ones (``X_train`` / ``X_val``).

    Returns
    -------
    dict
        Keys ``'acc'``, ``'f1'`` and ``'roc'``: accuracy, F1 and ROC AUC on
        the validation split.
    """
    if scaled:
        fit_inputs, val_inputs = X_train_sc, X_val_sc
    else:
        fit_inputs, val_inputs = X_train, X_val

    model.fit(fit_inputs, y_train)
    hard_preds = model.predict(val_inputs)
    # Second probability column = score of the class encoded as 1.
    positive_scores = model.predict_proba(val_inputs)[:, 1]

    metrics = {}
    metrics['acc'] = accuracy_score(y_val, hard_preds)
    metrics['f1'] = f1_score(y_val, hard_preds)
    metrics['roc'] = roc_auc_score(y_val, positive_scores)
    return metrics

# Models that are fit and scored on the standardized matrices.
scaled_names = ['Logistic', 'SVM', 'KNN']

# Score every candidate; results maps model name -> metric dict.
results = {
    n: evaluate(n, m, scaled=(n in scaled_names))
    for n, m in models.items()
}
print("\nValidation metrics:")
for model_name, scores in results.items():
    print(model_name, scores)

# Pick the model with the highest validation F1 (first one wins on ties).
best, _best_scores = max(results.items(), key=lambda item: item[1]['f1'])
print("Best model by F1:", best)
bm = models[best]
# Predict with the same feature representation the model was fit on.
best_val_inputs = X_val_sc if best in scaled_names else X_val
cm = confusion_matrix(y_val, bm.predict(best_val_inputs))
print("Confusion matrix:\n", cm)

# ---------------------------------------------
# 12. Voting ensemble
# ---------------------------------------------
# Soft-voting ensemble over three of the candidate models, taken in the
# order they appear in `models`. All members are fit on the standardized
# training matrix.
ensemble_members = ['Logistic', 'Tree', 'Forest']
vote_estimators = [(n, m) for n, m in models.items() if n in ensemble_members]
vote = VotingClassifier(estimators=vote_estimators, voting='soft')
vote.fit(X_train_sc, y_train)
vp = vote.predict(X_val_sc)
print("Voting F1:", f1_score(y_val, vp))

# ---------------------------------------------
# 13. Final submission
# ---------------------------------------------
# Refit the scaler on ALL labelled rows and rebuild the scaled test matrix
# with that same refit scaler. Previously the ensemble was trained on data
# scaled with full-data statistics but predicted on X_test_sc, which had
# been scaled with the earlier train-split statistics.
X_full_sc = scaler.fit_transform(X)
X_test_sc = scaler.transform(X_test)

# Refit the best single model on all labelled rows, using the same feature
# representation (scaled or raw) it was validated on.
bm.fit(X_full_sc if best in ['Logistic', 'SVM', 'KNN'] else X, y)

# Refit the ensemble on all labelled rows and predict the test set.
vote.fit(X_full_sc, y)
final = vote.predict(X_test_sc)

# NOTE(review): `final` holds the integer codes produced by le_target. If the
# submission must contain the original label values, write
# le_target.inverse_transform(final) instead — confirm the expected format.
sub = pd.DataFrame({id_col: test_ids, target_col: final})
sub.to_csv('submission.csv', index=False)
print("Submission written to submission.csv")


Loading datasets...
Train columns: ['sl_no', 'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p', 'status', 'salary']
Test columns: ['sl_no', 'gender', 'salary']

--- Train head ---
   sl_no  gender  ssc_p    ssc_b  hsc_p    hsc_b     hsc_s  degree_p  \
0      1       0  67.00   Others  91.00   Others  Commerce     58.00   
1      2       0  79.33  Central  78.33   Others   Science     77.48   
2      3       0  65.00  Central  68.00  Central      Arts     64.00   
3      4       0  56.00  Central  52.00  Central   Science     52.00   
4      5       0  85.80  Central  73.60  Central  Commerce     73.30   

    degree_t workex  etest_p specialisation  mba_p  status    salary  
0   Sci&Tech     No     55.0         Mkt&HR  58.80       1  270000.0  
1   Sci&Tech    Yes     86.5        Mkt&Fin  66.28       1  200000.0  
2  Comm&Mgmt     No     75.0        Mkt&Fin  57.80       1  250000.0  
3   Sci&Tech     No     66.