In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib

In [2]:
df = pd.read_csv(r"C:\Users\nibas\Downloads\Titanic.csv")

In [3]:
df = df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

In [4]:
le_sex = LabelEncoder()
le_embarked = LabelEncoder()

df["Sex"] = le_sex.fit_transform(df["Sex"].astype(str))
df["Embarked"] = le_embarked.fit_transform(df["Embarked"].astype(str))

In [5]:
num_cols = df.columns 
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

print(df.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [6]:
df = df.fillna(df.mean(numeric_only=True))    
df = df.fillna(df.mode().iloc[0])

In [7]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

In [8]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [10]:
k_values = [1, 2, 4, 6, 10, 20]
accuracy_scores = {}

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy_scores[k] = accuracy_score(y_test, pred)

print("Accuracy for each K:")
for k, acc in accuracy_scores.items():
    print(f"K={k}: {acc:.4f}")

Accuracy for each K:
K=1: 0.7933
K=2: 0.8101
K=4: 0.7933
K=6: 0.8045
K=10: 0.7989
K=20: 0.7989


In [11]:
best_k = max(accuracy_scores, key=accuracy_scores.get)
best_model = KNeighborsClassifier(n_neighbors=best_k)
best_model.fit(X_train, y_train)

print(f"\nBest K: {best_k}")
print("Best Model Accuracy:", accuracy_scores[best_k])


Best K: 2
Best Model Accuracy: 0.8100558659217877


In [12]:
joblib.dump(best_model, "knn_titanic_model.pkl")
joblib.dump(le_sex, "encoder_sex.pkl")
joblib.dump(le_embarked, "encoder_embarked.pkl")
joblib.dump(scaler, "minmax_scaler.pkl")

df.to_csv("Titanic_encoded_scaled.csv", index=False)

print("\nFiles Saved:")
print("✔ knn_titanic_model.pkl")
print("✔ encoder_sex.pkl")
print("✔ encoder_embarked.pkl")
print("✔ minmax_scaler.pkl")
print("✔ Titanic_encoded_scaled.csv (encoded + scaled data)")


Files Saved:
✔ knn_titanic_model.pkl
✔ encoder_sex.pkl
✔ encoder_embarked.pkl
✔ minmax_scaler.pkl
✔ Titanic_encoded_scaled.csv (encoded + scaled data)
