In [3]:
import pandas as pd

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier

In [4]:
train = pd.read_csv("titanic-train.csv", index_col = "PassengerId")

In [5]:
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
X = train[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]   #def X

In [7]:
kolumny = X.columns   #kolumny

In [8]:
X["Sex"] = X["Sex"].astype("category")
X["Sex"] = X["Sex"].cat.codes      #kategorie

X["Embarked"] = X["Embarked"].astype("category")
X["Embarked"] = X["Embarked"].cat.codes

In [9]:
imputer = Imputer(strategy = "median")
X = pd.DataFrame(imputer.fit_transform(X), columns = X.columns, index = X.index)  #NAs na Mediany

In [10]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X) #skalowanie

In [11]:
X = pd.DataFrame(X, columns = kolumny)      #X

In [12]:
y = train["Survived"]     #y

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,      #podział
                                                    test_size=0.33, 
                                                    random_state=42)

In [14]:
results = []
best_result = 0
parameters = {}  #parametry

In [15]:
for neighbours in range(1,101):   #looop
    for weight in ["uniform", "distance"]:
        for p in [1,2]:
            classifier = KNeighborsClassifier(n_neighbors = neighbours, weights = weight, 
                                              p = p)
            scores = cross_val_score(classifier, X_train, y_train, cv = 5)
            results.append(scores.mean())
            #print("Accuracy: {} (std: {})".format(scores.mean(), scores.std()))
            if scores.mean() > best_result:
                parameters["neighbours"] = neighbours
                parameters["weight"] = weight
                parameters["p"] = p
                print("New best neighbours: {}, weight: {}, p: {}".format(neighbours, 
                                                                          weight, p))
                best_result = scores.mean()

New best neighbours: 1, weight: uniform, p: 1
New best neighbours: 2, weight: uniform, p: 1
New best neighbours: 4, weight: uniform, p: 1
New best neighbours: 5, weight: uniform, p: 1
New best neighbours: 6, weight: uniform, p: 1
New best neighbours: 8, weight: uniform, p: 1
New best neighbours: 8, weight: uniform, p: 2
New best neighbours: 10, weight: uniform, p: 2
New best neighbours: 12, weight: uniform, p: 1
New best neighbours: 12, weight: uniform, p: 2
New best neighbours: 16, weight: uniform, p: 1
New best neighbours: 19, weight: uniform, p: 1
New best neighbours: 20, weight: uniform, p: 1
New best neighbours: 21, weight: uniform, p: 1
New best neighbours: 27, weight: uniform, p: 1


In [16]:
classifier = KNeighborsClassifier(n_neighbors = parameters["neighbours"], 
                                  weights = parameters["weight"],   #klasyfikator
                                  p = parameters["p"])

In [17]:
classifier.fit(X_train, y_train)  #fit

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=27, p=1,
           weights='uniform')

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
accuracy_score(y_true=y_test, y_pred=classifier.predict(X_test)) #acc

0.8