In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [3]:
def res(cv):
    """Print the best hyper-parameters and score of a fitted parameter search.

    Parameters
    ----------
    cv : object
        Fitted search exposing ``best_params_`` (a dict with the keys
        ``clf__n_neighbors``, ``clf__weights`` and ``clf__p``) and
        ``best_score_`` (a fraction, reported here as a percentage).

    Returns
    -------
    None
        The summary is written to stdout.
    """
    par, s = cv.best_params_, cv.best_score_
    print("Number of neighbors:", par['clf__n_neighbors'])
    print("Weights:", par['clf__weights'])
    # Name the two common exponents; any other value gets a generic label so
    # the function never fails on an unassigned name when the grid is widened.
    metric_names = {1: "Manhattan", 2: "Euclidean"}
    p = metric_names.get(par['clf__p'], "Minkowski (p=" + str(par['clf__p']) + ")")
    print(p, "metric is used")
    print("Best score is:", str(round(100*s, 1)) + "%")

In [4]:
# Load the tab-separated expression table; the first column is used as the row index.
df = pd.read_csv("BRCA_pam50.tsv", sep="\t", index_col=0)
# Features are every column except the label. Dropping the label by name
# (instead of slicing off the last column by position) keeps X and y
# consistent even if the column order of the file changes.
X = df.drop(columns="Subtype").to_numpy()
# Alternative feature set restricted to four marker genes:
#X = df.loc[:,["ERBB2", "ESR1", "MKI67", "PGR"]].to_numpy()
y = df["Subtype"].to_numpy()

In [5]:
# Two-dimensional embeddings of the feature matrix, used further down as
# alternative inputs to the k-NN parameter search.
# random_state is fixed because t-SNE is stochastic: without a seed each
# kernel restart produces a different embedding and different scores.
# NOTE(review): both embeddings are fit on all samples before
# cross-validation, so held-out folds influence the coordinates. PCA could be
# moved into the pipeline; t-SNE cannot, as it has no transform() for unseen
# samples.
X_tsne = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X)
X_pca = PCA(n_components=2, random_state=42).fit_transform(X)

In [6]:
# Standardize the features, then classify with k-nearest neighbours.
# The step names ("scaler", "clf") are the prefixes used by the grid keys.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier(n_jobs=-1)),
])

# Search grid: odd neighbour counts 1..9, both weighting schemes, and
# p = 1 / p = 2 for the distance exponent.
params = dict(
    clf__n_neighbors=list(range(1, 10, 2)),
    clf__weights=["uniform", "distance"],
    clf__p=[1, 2],
)

### Without embedding (50 dimensions)

In [7]:
# Exhaustive search over `params`, scored by accuracy with repeated
# stratified 5-fold cross-validation (20 repeats).
# random_state pins the fold assignment: the scores are reproducible after a
# kernel restart, and every later cv.fit(...) call is evaluated on the same
# splits, so results for different feature sets are directly comparable.
cv = GridSearchCV(
    model, params, n_jobs=-1,
    scoring=make_scorer(accuracy_score),
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=42)
)

In [8]:
# Search on the original feature matrix; fit() returns the fitted search
# object, which is passed straight to the report helper.
res(cv.fit(X, y))

Number of neighbors: 5
Weights: uniform
Manhattan metric is used
Best score is: 89.9%


In [9]:
# t-SNE: 2 dimensions
# Re-run the same search on the t-SNE coordinates and report the best result.
res(cv.fit(X_tsne, y))

Number of neighbors: 5
Weights: uniform
Euclidean metric is used
Best score is: 89.2%


In [10]:
# PCA: 2 dimensions
# Re-run the same search on the PCA coordinates and report the best result.
res(cv.fit(X_pca, y))

Number of neighbors: 9
Weights: uniform
Euclidean metric is used
Best score is: 85.6%
