In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.base import BaseEstimator, ClassifierMixin

data_train = pd.read_csv("/Users/mariadeka/Downloads/train.csv")
X_test = pd.read_csv("/Users/mariadeka/Downloads/test.csv")
X_train = data_train.iloc[:, :-1]
y_train = data_train.iloc[:, -1]

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
poly2 = PolynomialFeatures(degree=2)
poly3 = PolynomialFeatures(degree=3)

X_train_poly2 = poly2.fit_transform(X_train)
X_train_poly3 = poly3.fit_transform(X_train)
X_test_poly2 = poly2.fit_transform(X_test)

scaler_original = StandardScaler()
X_train_scaled = scaler_original.fit_transform(X_train)
scaler_poly2 = StandardScaler()
X_train_poly2_scaled = scaler_poly2.fit_transform(X_train_poly2)
scaler_poly3 = StandardScaler()
X_train_poly3_scaled = scaler_poly3.fit_transform(X_train_poly3)


models = {
    "LogReg (original)": LogisticRegression(max_iter=1000),
    "LogReg (poly=2)": LogisticRegression(max_iter=1000),
    "LogReg (poly=3)": LogisticRegression(max_iter=1000),
    "LDA (original)": LinearDiscriminantAnalysis(),
    "LDA (poly=2)": LinearDiscriminantAnalysis(),
    "LDA (poly=3)": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
}


In [None]:
class DensityClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, bandwidth=0.5, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.kde_models = {}
        self.classes_ = None
        self.log_priors_ = {}

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        for c in self.classes_:
            X_c = X[y == c]
            kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
            kde.fit(X_c)
            self.kde_models[c] = kde
            self.log_priors_[c] = np.log(len(X_c) / len(X))

    def predict(self, X):
        scores = np.zeros((X.shape[0], len(self.classes_)))
        for i, c in enumerate(self.classes_):
            log_density = self.kde_models[c].score_samples(X)
            scores[:, i] = log_density + self.log_priors_[c]
        predictions = self.classes_[np.argmax(scores, axis=1)]
        return predictions

    def get_params(self, deep=True):
        return {"bandwidth": self.bandwidth, "kernel": self.kernel}

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self


In [None]:
models["KDE"] = DensityClassifier(bandwidth=0.5)

In [83]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456) #42563478
scores = {}

for name, model in models.items():
    if "poly=2" in name:
        X_train_used = X_train_poly2_scaled
    elif "poly=3" in name:
        X_train_used = X_train_poly3_scaled
    else:
        X_train_used = X_train_scaled

    score = cross_val_score(model, X_train_used, y_train, cv=cv, scoring="accuracy").mean()
    scores[name] = score
    print(f"{name}: {score:.4f}")

LogReg (original): 0.8133
LogReg (poly=2): 0.9387
LogReg (poly=3): 0.9293
LDA (original): 0.8107
LDA (poly=2): 0.9187
LDA (poly=3): 0.8840
QDA: 0.9253


Zatem najlepszym modelem jest regresja logistyczna z rozszerzoną przestrzenią cech, z dokładnością na poziomie 0.9387.

In [84]:
best_model_name = max(scores, key=scores.get)

In [85]:
best_model = models[best_model_name]

In [87]:
best_model.fit(X_train_poly2_scaled, y_train)

In [92]:
X_test_scaled = scaler_poly2.transform(X_test_poly2)
y_predicted = best_model.predict(X_test_scaled)
np.savetxt('/Users/mariadeka/Downloads/y_predicted_Maria_Deka.txt', y_predicted, fmt='%d')