# Summary

## Cross Val = 5

In [6]:
# Import Library/Module
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# Read the data and drop the Name, Age, Ticket and Cabin columns
df = pd.read_csv("data/titanic.csv").drop(columns=["Name", "Age", "Ticket", "Cabin"])

# Separate the target ("Survived") from the feature columns
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessor
# Numeric features: fill missing values with the column mean, then rescale
# with MinMaxScaler so no single feature dominates the KNN distance.
numeric_data = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising at transform time (this can happen when a
# rare level is absent from a CV training fold or appears only in new data).
categoric_data = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group to its sub-pipeline. Columns that are not listed
# here are dropped (ColumnTransformer's default is remainder="drop").
preprocessor = ColumnTransformer([
    ("numeric", numeric_data, ["Fare", "SibSp", "Parch"]),
    ("categoric", categoric_data, ["Pclass", "Sex", "Embarked"])
])

# Main Pipeline
# Preprocessing and the classifier are chained so that, during cross-validation,
# the imputers/scaler/encoder are fitted on the training folds only.
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())
])

# Parameter Tuning
# Search space for the KNN step ("algo") of the pipeline: 25 x 2 x 2 = 100 candidates.
knn_grid = {
    "algo__n_neighbors": range(1, 51, 2),  # odd k from 1 to 49
    "algo__weights": ["uniform", "distance"],
    "algo__p": [1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Exhaustive grid search with 5-fold cross-validation, using all CPU cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=knn_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)


# Evaluation
# Show the winning hyperparameters, then the score of the search's selected
# model on the training split and on the held-out test split.
print(model.best_params_)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, test_score)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'algo__n_neighbors': 15, 'algo__p': 1, 'algo__weights': 'uniform'}
0.827247191011236 0.7877094972067039


## Cross Val = 3

In [7]:
# Import Library/Module
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# Read the data and drop the Name, Age, Ticket and Cabin columns
df = pd.read_csv("data/titanic.csv").drop(columns=["Name", "Age", "Ticket", "Cabin"])

# Separate the target ("Survived") from the feature columns
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessor
# Numeric features: fill missing values with the column mean, then rescale
# with MinMaxScaler so no single feature dominates the KNN distance.
numeric_data = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# an all-zero row instead of raising at transform time (this can happen when a
# rare level is absent from a CV training fold or appears only in new data).
categoric_data = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group to its sub-pipeline. Columns that are not listed
# here are dropped (ColumnTransformer's default is remainder="drop").
preprocessor = ColumnTransformer([
    ("numeric", numeric_data, ["Fare", "SibSp", "Parch"]),
    ("categoric", categoric_data, ["Pclass", "Sex", "Embarked"])
])

# Main Pipeline
# Preprocessing and the classifier are chained so that, during cross-validation,
# the imputers/scaler/encoder are fitted on the training folds only.
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())
])

# Parameter Tuning
# Search space for the KNN step ("algo") of the pipeline: 25 x 2 x 2 = 100 candidates.
knn_grid = {
    "algo__n_neighbors": range(1, 51, 2),  # odd k from 1 to 49
    "algo__weights": ["uniform", "distance"],
    "algo__p": [1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Exhaustive grid search with 3-fold cross-validation, using all CPU cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=knn_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)


# Evaluation
# Show the winning hyperparameters, then the score of the search's selected
# model on the training split and on the held-out test split.
print(model.best_params_)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, test_score)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'algo__n_neighbors': 19, 'algo__p': 1, 'algo__weights': 'uniform'}
0.8188202247191011 0.7877094972067039
