In [12]:
################# zadanie na 3 ##################

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report


DATA_PATH = "emvic.data"
df = pd.read_csv(DATA_PATH, sep="\t", header=None)

y = df.iloc[:, 0]         # etykiety (aXX)
X = df.iloc[:, 1:]        # cechy

# zamiana etykiet na wartosci liczbowe + podział na train/test ---
le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# dwa klasyfikatory
# Logistic Regression
clf_lr = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
    LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)
)

# Random Forest
clf_rf = make_pipeline(
    SimpleImputer(strategy="median"),
    RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
)

# Trenowanie
clf_lr.fit(X_train, y_train)
clf_rf.fit(X_train, y_train)

# Predykcje i metryki
pred_lr = clf_lr.predict(X_test)
pred_rf = clf_rf.predict(X_test)

def report(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro")
    print(f"{name} -> Accuracy: {acc:.4f} | Macro-F1: {f1m:.4f}")
    print(classification_report(y_true, y_pred, zero_division=0))

print("=== Porównanie dwóch klasyfikatorów ===")
report("LogisticRegression", y_test, pred_lr)
print("-"*60)
report("RandomForest",      y_test, pred_rf)

# zwycięzca na podstawie Macro-F1
f1_lr = f1_score(y_test, pred_lr, average="macro")
f1_rf = f1_score(y_test, pred_rf, average="macro")
winner = "LogisticRegression" if f1_lr >= f1_rf else "RandomForest"
print(f"\n>>> Lepszy wg Macro-F1: {winner}")

=== Porównanie dwóch klasyfikatorów ===
LogisticRegression -> Accuracy: 0.9286 | Macro-F1: 0.9246
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        21
           1       1.00      0.71      0.83         7
           2       1.00      1.00      1.00         7
           3       0.90      0.90      0.90        10
           4       1.00      0.85      0.92        13
           5       0.91      1.00      0.95        10
           6       1.00      1.00      1.00         8
           7       0.73      1.00      0.84         8

    accuracy                           0.93        84
   macro avg       0.94      0.93      0.92        84
weighted avg       0.94      0.93      0.93        84

------------------------------------------------------------
RandomForest -> Accuracy: 0.7976 | Macro-F1: 0.7755
              precision    recall  f1-score   support

           0       0.80      0.95      0.87        21
           1       0.60      

In [11]:
############### zadanie na 4.5 ##################

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.metrics import accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

RNG = 42

# Wczytanie i rozcięcie na bloki 2048
df = pd.read_csv("emvic.data", sep="\t", header=None)
sid = df.iloc[:, 0].astype(str)

def blk(i):  # i=0..5 -> sx,sy,lx,rx,ly,ry
    start = 1 + i*2048
    return slice(start, start+2048)

sx = df.iloc[:, blk(0)].to_numpy(dtype=float)
sy = df.iloc[:, blk(1)].to_numpy(dtype=float)
lx = df.iloc[:, blk(2)].to_numpy(dtype=float)
rx = df.iloc[:, blk(3)].to_numpy(dtype=float)
ly = df.iloc[:, blk(4)].to_numpy(dtype=float)
ry = df.iloc[:, blk(5)].to_numpy(dtype=float)

# PRZYGOTOWANIE DANYCH
# cechy różnicowe (blad patrzenia - to jak oczy podazaja za bodzcem)
dxL, dyL = lx - sx, ly - sy
dxR, dyR = rx - sx, ry - sy

# Odrzucamy surowe sx, sy (po wykorzystaniu do różnic niepotrzebne do klasyfikacji)
# sklejenie cech: 4 * 2048
X = np.hstack([dxL, dyL, dxR, dyR])
X = pd.DataFrame(X)

# usuniecie kolumn stalych i silnie skorelowanych (redukcja wymiaru bez straty informacji)
X = pd.DataFrame(VarianceThreshold(0.0).fit_transform(X))  # stałe out
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] > 0.98)]
if to_drop:
    X = X.drop(columns=to_drop)

le = LabelEncoder()
y = le.fit_transform(sid)

# trenowanie
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RNG
)

# modele
CLFS = {
    "LogReg":      (LogisticRegression(max_iter=1000, n_jobs=-1, random_state=RNG), True),
    "SVC_RBF":     (SVC(kernel="rbf", random_state=RNG), True),
    "LinearSVC":   (LinearSVC(max_iter=10000, tol=1e-3, random_state=RNG), True),
    "KNN":         (KNeighborsClassifier(n_neighbors=7), True),
    "RandomForest":(RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RNG), False),
    "ExtraTrees":  (ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=RNG), False),
    "GradBoost":   (GradientBoostingClassifier(random_state=RNG), False)
}

def make_pipe(estimator, need_scale, with_fs=False, k=None):
    steps = [("impute", SimpleImputer(strategy="median"))]
    if need_scale:
        steps += [("scale", StandardScaler())]
    if with_fs:
        steps += [("select", SelectKBest(score_func=f_classif, k=k))]
    steps += [("clf", estimator)]
    return Pipeline(steps)

# 7 klasyfikatorow, ranking po Macro-F1
rows = []
for name, (est, sc) in CLFS.items():
    pipe = make_pipe(est, sc, with_fs=False)
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    rows.append((name,
                 accuracy_score(y_test, pred),
                 f1_score(y_test, pred, average="macro")))
res = pd.DataFrame(rows, columns=["model","accuracy","macro_f1"]).sort_values(
    by=["macro_f1","accuracy"], ascending=False
)
print("=== Wyniki holdout (test) ===")
print(res.to_string(index=False))

# CV dla top3
top3 = res.head(3)["model"].tolist()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RNG)
print("\n=== 5-fold CV dla TOP-3 (bez selekcji) ===")
for name in top3:
    est, sc = CLFS[name]
    pipe = make_pipe(est, sc, with_fs=False)
    scores = cross_validate(pipe, X, y, scoring={"acc":"accuracy","f1m":"f1_macro"},
                            cv=cv, n_jobs=-1, return_train_score=False)
    print(f"{name:12s}  ACC {scores['test_acc'].mean():.4f}±{scores['test_acc'].std():.4f} | "
          f"F1m {scores['test_f1m'].mean():.4f}±{scores['test_f1m'].std():.4f}")

# selekcja cech i ponownie CV dla top3
k_sel = min(300, X.shape[1])
print(f"\n=== 5-fold CV dla TOP-3 z selekcją cech (k={k_sel}) ===")
for name in top3:
    est, sc = CLFS[name]
    pipe = make_pipe(est, sc, with_fs=True, k=k_sel)
    scores = cross_validate(pipe, X, y, scoring={"acc":"accuracy","f1m":"f1_macro"},
                            cv=cv, n_jobs=-1, return_train_score=False)
    print(f"{name:12s}  ACC {scores['test_acc'].mean():.4f}±{scores['test_acc'].std():.4f} | "
          f"F1m {scores['test_f1m'].mean():.4f}±{scores['test_f1m'].std():.4f}")


=== Wyniki holdout (test) ===
       model  accuracy  macro_f1
      LogReg  0.714286  0.689563
   LinearSVC  0.702381  0.679523
  ExtraTrees  0.666667  0.664504
RandomForest  0.642857  0.623252
   GradBoost  0.607143  0.530732
     SVC_RBF  0.571429  0.483228
         KNN  0.404762  0.337260

=== 5-fold CV dla TOP-3 (bez selekcji) ===
LogReg        ACC 0.7113±0.0508 | F1m 0.6894±0.0630
LinearSVC     ACC 0.6970±0.0663 | F1m 0.6753±0.0771
ExtraTrees    ACC 0.6754±0.0386 | F1m 0.6420±0.0539

=== 5-fold CV dla TOP-3 z selekcją cech (k=300) ===
LogReg        ACC 0.7764±0.0166 | F1m 0.7573±0.0190
LinearSVC     ACC 0.7477±0.0231 | F1m 0.7226±0.0226
ExtraTrees    ACC 0.6996±0.0257 | F1m 0.6561±0.0097
