# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
# Column names are configured up front so nothing below hard-codes them.
TARGET_COL = 'Status'   # label column; edit here if the dataset changes
ID_COL = 'id'           # row-identifier column; edit here if the dataset changes

# Apply seaborn's default theme to the plots produced by this script.
sns.set()

# Read the raw train/test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# The identifier is not a feature: remove it from the training frame when
# it is present (errors='ignore' makes this a no-op otherwise).
train_df = train_df.drop(columns=[ID_COL], errors='ignore')

# Remove it from the test frame too, but keep the values for the submission
# file. When the test file has no identifier column, fall back to the row
# positions 0..n-1.
if ID_COL not in test_df.columns:
    test_ids = pd.Series(range(len(test_df)))
else:
    test_ids = test_df.pop(ID_COL)

# Handle missing values.
#
# Fill values are computed from the training data only and then reused for
# the test data, so both frames are imputed with the same value per column:
#   * numeric columns     -> training median
#   * every other column  -> most frequent training value (mode)
#
# NOTE: TARGET_COL is still a column of train_df at this point, so missing
# labels are imputed by the same rule as any other column.
for col in train_df.columns:
    train_col = train_df[col]

    # Dtype-family check: covers every integer/float width, not only
    # int64/float64. Booleans are handled as categorical (mode).
    is_numeric = (
        pd.api.types.is_numeric_dtype(train_col)
        and not pd.api.types.is_bool_dtype(train_col)
    )
    if is_numeric:
        val = train_col.median()
    else:
        modes = train_col.mode()
        # mode() is empty when the column has no non-missing values.
        val = modes.iloc[0] if not modes.empty else np.nan

    # An entirely-missing training column offers nothing to impute with.
    if pd.isna(val):
        continue

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that chained form is deprecated and no longer
    # updates the parent frame under pandas Copy-on-Write.
    train_df[col] = train_col.fillna(val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(val)

# --- Exploratory plots over the int64/float64 columns of the training data ---
num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
numeric_frame = train_df[num_cols]

# One histogram per numeric column to inspect the shape of each distribution.
numeric_frame.hist(bins=30, figsize=(12, 10))
plt.suptitle("Numeric Feature Distributions")
plt.show()

# Side-by-side boxplots of the same columns to make outliers visible.
plt.figure(figsize=(12, 6))
box_ax = sns.boxplot(data=numeric_frame)
box_ax.set_title("Outlier Analysis using Boxplots")
plt.xticks(rotation=90)  # column names are long; rotate so they do not overlap
plt.show()

# Separate the label column from the feature columns.
y = train_df[TARGET_COL]
X = train_df.drop(columns=[TARGET_COL])

# Object-dtype labels are mapped to integer codes with a LabelEncoder, which
# is kept so predictions can be converted back to the original labels.
# Labels of any other dtype are used as-is and label_encoder stays None.
label_encoder = LabelEncoder() if y.dtype == 'object' else None
y_encoded = y.values if label_encoder is None else label_encoder.fit_transform(y)

# One-hot encode feature columns.
#
# Train and test must be encoded against the SAME category list. Encoding
# them independently with drop_first=True lets each frame drop its own first
# category: when the test data lacks a level that training has, a different
# level is dropped, and after alignment those test rows become
# indistinguishable from the training baseline level.
raw_train = X
X = pd.get_dummies(raw_train, drop_first=True)

# Columns that get_dummies expanded are the ones no longer present by name.
encoded_cols = [col for col in raw_train.columns if col not in X.columns]

# Pin each of those test columns to the training categories (same levels,
# same order) so drop_first removes the same baseline level in both frames.
# Test values never seen in training become missing and encode as all zeros.
test_df = test_df.copy()
for col in encoded_cols:
    if col in test_df.columns:
        train_levels = pd.Categorical(raw_train[col]).categories
        test_df[col] = pd.Categorical(test_df[col], categories=train_levels)
test_df = pd.get_dummies(test_df, drop_first=True)

# Align train & test: keep exactly the training columns in training order;
# columns absent from the test frame are created and filled with 0, and
# test-only columns are discarded.
X, test_df = X.align(test_df, join='left', axis=1, fill_value=0)

# Hold out a stratified 20% validation split BEFORE scaling, so the scaler
# used for evaluation never sees validation rows. Fitting the scaler on all
# rows first would leak validation statistics into preprocessing.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,      # fixed seed keeps the split reproducible
    stratify=y_encoded    # preserve class proportions in both parts
)

# Scaler for evaluation only: statistics come from the training part alone.
val_scaler = StandardScaler()
X_train = val_scaler.fit_transform(X_train_raw)
X_val = val_scaler.transform(X_val_raw)

# Scaler for the final model: fitted on every training row and applied to
# the test features with those same statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_df)

# Classifier under evaluation. To try another one, replace the constructor
# below with one of these alternatives:
#   RandomForestClassifier()
#   KNeighborsClassifier()
#   SVC(probability=True)
#   DecisionTreeClassifier()
#   GaussianNB()
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out validation rows.
y_pred = model.predict(X_val)

# Overall fraction of validation rows predicted correctly.
acc = accuracy_score(y_val, y_pred)
print("Accuracy:", acc)

# Per-class precision / recall / F1.
print(classification_report(y_val, y_pred))

# Confusion-matrix counts drawn as an annotated heatmap.
cm = confusion_matrix(y_val, y_pred)
heat_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heat_ax.set_title("Confusion Matrix")
plt.show()

# Refit the same model on the full scaled training set, then predict the
# scaled test features.
test_pred_encoded = model.fit(X_scaled, y_encoded).predict(test_scaled)

# Map integer codes back to the original labels when a label encoder was
# used; otherwise the predictions are already in the original label space.
test_pred = (
    test_pred_encoded
    if label_encoder is None
    else label_encoder.inverse_transform(test_pred_encoded)
)

# Write the submission file: one row per test record, identifier plus
# predicted label, without the DataFrame index.
submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: test_pred})
submission.to_csv('submission.csv', index=False)
print("Submission file saved.")

