## Εισαγωγικά

In [1]:
#IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn 
import scipy.stats

In [2]:
# Reproducibility: fix the seed of every random number generator used in this notebook.
import os
import random

from sklearn.utils import check_random_state

SEED = 56

# Seed Python's `random`, NumPy's legacy global generator and PyTorch with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Written into this process's environment.
# NOTE(review): presumably this only affects interpreters started afterwards, not the
# hashing of the already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = f"{SEED}"

# The RandomState returned here is not stored anywhere; as the last expression of the
# cell it is only echoed as the cell output.
# NOTE(review): this likely does not seed scikit-learn globally — pass
# random_state=SEED to each estimator instead; confirm.
check_random_state(SEED)

RandomState(MT19937) at 0x29D7FAB2440

In [3]:
# Pick the compute device: the GPU when PyTorch reports CUDA as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the choice (header line first, then the device on its own line).
print("Device available for running: ", device, sep="\n")

Device available for running: 
cuda


## Β. Προεπεξεργασία και Εξερεύνηση Δεδομένων

### 1. Εξερευνητική Ανάλυση Δεδομένων (EDA)

In [4]:
# Load the BreastMNIST (224x224) archive.
#
# The location can be overridden with the BREASTMNIST_NPZ environment variable so the
# notebook also runs on machines other than the author's; the default is the original
# location. (The original literal was a raw string with doubled backslashes, which put
# two literal backslashes between every path component.)
DATA_PATH = os.environ.get(
    "BREASTMNIST_NPZ",
    r'C:\Users\vasgk\Desktop\Μηχανική μάθηση\Data\breastmnist_224.npz',
)

# SECURITY NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code when the file is untrusted. Kept as in the original; the arrays
# listed below look like plain numeric arrays, so allow_pickle=False is probably
# sufficient — confirm before changing.
# `data` is intentionally left open: later cells read arrays from it by key.
data = np.load(DATA_PATH, allow_pickle=True)

# Inspect the keys in the dataset
print("Keys in the dataset:", data.files)
print("Shape of file contents: (no of contents, dimensions (224x224))")

# Print the shape of each array stored in the npz file
for key in data.files:
    print(f"{key}: {data[key].shape}")

Keys in the dataset: ['train_images', 'train_labels', 'val_images', 'val_labels', 'test_images', 'test_labels']
Shape of file contents: (no of contents, dimensions (224x224))
train_images: (546, 224, 224)
train_labels: (546, 1)
val_images: (78, 224, 224)
val_labels: (78, 1)
test_images: (156, 224, 224)
test_labels: (156, 1)


In [None]:
# Turn every image split into a flat feature matrix and a labelled DataFrame.
def _flatten_images(images):
    """Reshape `images` to one row per image, merging all remaining axes."""
    return images.reshape(len(images), -1)


def _labeled_frame(features, labels):
    """Build a DataFrame from `features` and attach `labels` as the 'Label' column."""
    frame = pd.DataFrame(features)
    frame['Label'] = labels
    return frame


# Original training split
train_images_flat = _flatten_images(data['train_images'])
df_train = _labeled_frame(train_images_flat, data['train_labels'])

# Original validation split
val_images_flat = _flatten_images(data['val_images'])
df_val = _labeled_frame(val_images_flat, data['val_labels'])

# Training and validation rows merged into one larger training set
combined_images_flat = np.vstack((train_images_flat, val_images_flat))
combined_labels = np.concatenate((data['train_labels'], data['val_labels']))
df_train_combined = _labeled_frame(combined_images_flat, combined_labels)

# Held-out test split
test_images_flat = _flatten_images(data['test_images'])
df_test = _labeled_frame(test_images_flat, data['test_labels'])

# Feature matrices and label vectors under the names used by the group's shared code.
# Note: x_train / y_train hold the merged train + validation rows, not the original
# training split alone.
x_train = df_train_combined.drop(columns=['Label']).values
y_train = df_train_combined['Label'].values.ravel()
x_test = df_test.drop(columns=['Label']).values
y_test = df_test['Label'].values.ravel()

### 2. Προεπεξεργασία Δεδομένων

#### α. Χειρισμός ελλειπουσών και ακραίων τιμών: δεν υπάρχουν ελλείπουσες τιμές

#### β. Κανονικοποίηση/Τυποποίηση χαρακτηριστικών: 
Για την κανονικοποίηση/τυποποίηση των χαρακτηριστικών, εφαρμόστηκε η μέθοδος Standard Scaling (Z-score normalization), ώστε κάθε χαρακτηριστικό να έχει μέση τιμή 0 και τυπική απόκλιση 1. Η επιλογή αυτή έγινε επειδή οι αλγόριθμοι PCA, LDA και Logistic Regression είναι ευαίσθητοι στην κλίμακα των χαρακτηριστικών. Η κατανομή των τιμών πριν και μετά την τυποποίηση απεικονίστηκε με ιστογράμματα, ενώ παρατηρήθηκε ότι η τυποποίηση βελτιώνει τη διακριτική ικανότητα και τη σύγκλιση των αλγορίθμων.

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

# Standardize the pixel features. The scaler is fitted on the original training split
# only (`train_images_flat`) and then applied unchanged to every split, so validation
# and test data do not influence the fitted statistics.
# NOTE(review): this is the training split without the validation rows, i.e. not the
# merged `x_train` — keep the matching labels in mind downstream.
scaler = StandardScaler()
scaler.fit(train_images_flat)
x_train_std, x_val_std, x_test_std = (
    scaler.transform(split)
    for split in (train_images_flat, val_images_flat, test_images_flat)
)

### γ. Επιλογή και μετασχηματισμός χαρακτηριστικών:

#### 1. Αφαίρεση χαρακτηριστικών με χαμηλή διακύμανση (Remove low-variance features)
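We use `VarianceThreshold` from scikit-learn to remove features (pixels) with very low variance, which are unlikely to be useful for classification. Because the selector is applied to the already standardized data, every non-constant pixel has variance of about 1, so the threshold can only remove constant pixels; this is why no features are dropped in the output below.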

In [18]:
from sklearn.feature_selection import VarianceThreshold

# Drop every feature whose variance on the training split is below 0.01. The selector
# is fitted on the training split only and the same column mask is applied to all splits.
# NOTE(review): the input here is the already standardized data; the recorded output of
# this cell shows that no column is removed.
selector = VarianceThreshold(threshold=0.01)
selector.fit(x_train_std)
x_train_var, x_val_var, x_test_var = map(
    selector.transform, (x_train_std, x_val_std, x_test_std)
)

# Report the feature count before and after the selection.
print("Original shape:", x_train_std.shape)
print("After variance thresholding:", x_train_var.shape)

Original shape: (546, 50176)
After variance thresholding: (546, 50176)


#### 2. Δημιουργία νέων χαρακτηριστικών (Feature engineering)
For images, common new features include mean, standard deviation, or other statistics per image. Example:

#### Extracting Features

#### Μέση τιμή και τυπική απόκλιση ανά εικόνα

In [None]:
# Append each image's mean and standard deviation of pixel values as two extra columns.
#
# The statistics are computed from `train_images_flat` / `test_images_flat`, i.e. the
# same rows that `x_train_var` / `x_test_var` were derived from. (The original code used
# `x_train`, which also contains the validation rows, so the row counts did not match
# and `np.hstack` could not stack the arrays.)
# NOTE(review): the two new columns are raw (unstandardized) pixel statistics, while the
# remaining columns come from the StandardScaler output — consider scaling them too.
train_means = train_images_flat.mean(axis=1, keepdims=True)
train_stds = train_images_flat.std(axis=1, keepdims=True)
assert train_means.shape[0] == x_train_var.shape[0], "train rows out of sync"
x_train_fe = np.hstack([x_train_var, train_means, train_stds])

test_means = test_images_flat.mean(axis=1, keepdims=True)
test_stds = test_images_flat.std(axis=1, keepdims=True)
assert test_means.shape[0] == x_test_var.shape[0], "test rows out of sync"
x_test_fe = np.hstack([x_test_var, test_means, test_stds])

In [None]:
# Compare logistic regression on the held-out test split with and without the two
# per-image statistics (mean / std) added as features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Labels of the rows the feature matrices were built from. `x_train_var` / `x_train_fe`
# are derived from the original training split, so the merged train + validation
# `y_train` has a different length and cannot be used for fitting here.
y_train_fit = data['train_labels'].ravel()
if len(y_train_fit) != x_train_var.shape[0]:
    raise ValueError(
        f"Label/feature row mismatch: {len(y_train_fit)} labels vs "
        f"{x_train_var.shape[0]} training rows"
    )

# Model with only variance-thresholded features.
# Evaluated on the test features: the original predicted on the validation features
# but scored the predictions against the test labels.
clf1 = LogisticRegression(max_iter=1000, random_state=SEED)
clf1.fit(x_train_var, y_train_fit)
y_pred1 = clf1.predict(x_test_var)
acc1 = accuracy_score(y_test, y_pred1)

# Model with mean and std features added
clf2 = LogisticRegression(max_iter=1000, random_state=SEED)
clf2.fit(x_train_fe, y_train_fit)
y_pred2 = clf2.predict(x_test_fe)
acc2 = accuracy_score(y_test, y_pred2)

# TODO(review): the original left a placeholder here for a third model
# ("model with all stats") that was never implemented.

print(f"Accuracy without mean/std: {acc1:.4f}")
print(f"Accuracy with mean/std:    {acc2:.4f}")

Accuracy without mean/std: 0.8718
Accuracy with mean/std:    0.8590
