## CNN

In [70]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


In [71]:
img_size = (64, 64)  # Размер изображения
data_dir = "train"
csv_path = "train.csv"
df = pd.read_csv(csv_path)

In [72]:
# Преобразование меток классов в числовой формат
label_encoder = LabelEncoder()
df['Class'] = label_encoder.fit_transform(df['Class'])
num_classes = len(df['Class'].unique())

In [73]:
def load_and_preprocess_image(image_path):
    try:
        img = Image.open(image_path).convert('RGB')
        img = img.resize(img_size)
        img = np.array(img) / 255.0  # Нормализация
        return img
    except Exception as e:
        print(f"Ошибка при загрузке {image_path}: {e}")
        return None

In [74]:
X = []
y = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    image_id, label = row['ID'], row['Class']
    image_path = os.path.join(data_dir, image_id)
    img = load_and_preprocess_image(image_path)
    if img is not None:
        X.append(img)
        y.append(label)

X = np.array(X)
y = np.array(y)

  0%|          | 0/19906 [00:00<?, ?it/s]

100%|██████████| 19906/19906 [00:13<00:00, 1470.96it/s]


In [75]:
y = to_categorical(y, num_classes=num_classes)

In [76]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Форма X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Форма X_valid: {X_valid.shape}, y_valid: {y_valid.shape}")
print(f"Форма X_test: {X_test.shape}, y_test: {y_test.shape}")


Форма X_train: (15924, 64, 64, 3), y_train: (15924, 3)
Форма X_valid: (1991, 64, 64, 3), y_valid: (1991, 3)
Форма X_test: (1991, 64, 64, 3), y_test: (1991, 3)


In [77]:
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [81]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train,
                    epochs=10,  
                    batch_size=32,
                    validation_data=(X_valid, y_valid))

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Точность на тестовой выборке: {test_accuracy:.2f}")

Epoch 1/10
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.5552 - loss: 0.9274 - val_accuracy: 0.6675 - val_loss: 0.7665
Epoch 2/10
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.6746 - loss: 0.7627 - val_accuracy: 0.6881 - val_loss: 0.7018
Epoch 3/10
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.7038 - loss: 0.6990 - val_accuracy: 0.6966 - val_loss: 0.6711
Epoch 4/10
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.7195 - loss: 0.6581 - val_accuracy: 0.6881 - val_loss: 0.7206
Epoch 5/10
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.7358 - loss: 0.6240 - val_accuracy: 0.7318 - val_loss: 0.6158
Epoch 6/10
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.7573 - loss: 0.5727 - val_accuracy: 0.7454 - val_loss: 0.6080
Epoch 7/10
[1m498

In [82]:
model.save("cnn_model.h5")



In [83]:
sample_idx = 0
sample_image = X_test[sample_idx]
sample_label = np.argmax(y_test[sample_idx])
prediction = np.argmax(model.predict(np.expand_dims(sample_image, axis=0)))

print(f"Реальный класс: {sample_label}, Предсказанный класс: {prediction}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Реальный класс: 0, Предсказанный класс: 0


In [86]:
y_train_pred = np.argmax(model.predict(X_train), axis=1)
y_valid_pred = np.argmax(model.predict(X_valid), axis=1)
y_test_pred = np.argmax(model.predict(X_test), axis=1)


y_train_true = np.argmax(y_train, axis=1)
y_valid_true = np.argmax(y_valid, axis=1)
y_test_true = np.argmax(y_test, axis=1)


train_mseCNN = mean_squared_error(y_train_true, y_train_pred)
train_r2CNN = r2_score(y_train_true, y_train_pred)

valid_mse = mean_squared_error(y_valid_true, y_valid_pred)
valid_r2 = r2_score(y_valid_true, y_valid_pred)

test_mseCNN = mean_squared_error(y_test_true, y_test_pred)
test_r2CNN = r2_score(y_test_true, y_test_pred)

accuracyCNN = accuracy_score(y_train_true, y_train_pred)

print(f"Точность моделиCNN: {accuracyCNN:.2f}")
print(f"Test MSECNN: {test_mseCNN:.4f}, Test R^2CNN: {test_r2CNN:.4f}")

[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Точность моделиCNN: 0.86
Test MSECNN: 0.7107, Test R^2CNN: 0.1637


## _____________________________________________________________________________________________________________

## Random Forest

In [87]:
csv_path = "train.csv"
data_dir = "train"
df = pd.read_csv(csv_path)

In [88]:
label_encoder = LabelEncoder()
df['Class'] = label_encoder.fit_transform(df['Class'])

In [89]:
def extract_features(image_path):
    try:
        image = Image.open(image_path).convert('RGB')
        image = image.resize((64, 64))
        features = np.array(image).flatten()
        return features
    except Exception as e:
        print(f"Ошибка при обработке {image_path}: {e}")
        return None

In [90]:
X = []
y = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    image_id, label = row['ID'], row['Class']
    image_path = os.path.join(data_dir, image_id)
    features = extract_features(image_path)
    if features is not None and len(features) == 12288:
        X.append(features)
        y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Общее количество данных: {len(X)}")
print(f"Форма массива признаков: {X.shape}")
print(f"Форма меток: {y.shape}")

100%|██████████| 19906/19906 [00:10<00:00, 1813.46it/s]


Общее количество данных: 19906
Форма массива признаков: (19906, 12288)
Форма меток: (19906,)


In [91]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.5, random_state=42, shuffle=True
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True
)

print(f"Форма обучающей выборки: {X_train.shape}")
print(f"Форма валидационной выборки: {X_valid.shape}")
print(f"Форма тестовой выборки: {X_test.shape}")


Форма обучающей выборки: (9953, 12288)
Форма валидационной выборки: (4976, 12288)
Форма тестовой выборки: (4977, 12288)


In [92]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [93]:
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)
y_test_pred = model.predict(X_test)

In [95]:
train_mseRF = mean_squared_error(y_train, y_train_pred)
train_r2RF = r2_score(y_train, y_train_pred)

valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

test_mseRF = mean_squared_error(y_test, y_test_pred)
test_r2RF = r2_score(y_test, y_test_pred)

accuracyRF = accuracy_score(y_train, y_train_pred)
print(f"Точность моделиRF: {accuracyRF:.2f}")
print(f"Test MSERF: {test_mseRF:.4f}, Test R^2RF: {test_r2RF:.4f}")

Точность моделиRF: 1.00
Train MSERF: 0.0001, Train R^2RF: 0.9999


## _______________________________________________________________________________________________________________________________

## Рандом форест обновленный

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score


model = RandomForestClassifier(
    n_estimators=100,        # Количество деревьев
    max_depth=10,            # Максимальная глубина дерева
    min_samples_split=5,     # Минимальное число выборок для разбиения
    min_samples_leaf=2,      # Минимальное число выборок в листе
    max_features='sqrt',     # Количество признаков для разбиения
    random_state=42
)

In [97]:
model.fit(X_train, y_train)


In [98]:
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)
y_test_pred = model.predict(X_test)



In [99]:
train_mseRF = mean_squared_error(y_train, y_train_pred)
train_r2RF = r2_score(y_train, y_train_pred)

valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

test_mseRFUP = mean_squared_error(y_test, y_test_pred)
test_rRFUP = r2_score(y_test, y_test_pred)

accuracyRFUP = accuracy_score(y_train, y_train_pred)
print(f"Точность моделиRF: {accuracyRFUP:.2f}")
print(f"Test MSERF: {test_mseRFUP:.4f}, Test R^2RF: {test_rRFUP:.4f}")

Точность моделиRF: 0.84
Test MSERF: 0.3125, Test R^2RF: 0.6953


## _______________________________________________________________________________________________________________________________

## Линейная регрессия (LR)

In [100]:
image_size = (64, 64)  

csv_path = "train.csv"
data = pd.read_csv(csv_path)

In [101]:
def load_images_and_labels(image_folder, data, image_size):
    images = []
    labels = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0], desc="Загрузка изображений"):
        img_path = os.path.join(image_folder, row["ID"])
        label = row["Class"]
        try:
            img = Image.open(img_path).convert("L") 
            img = img.resize(image_size)
            images.append(np.array(img).flatten())
            labels.append(label)
        except Exception as e:
            print(f"Ошибка при обработке файла {img_path}: {e}")
    return np.array(images), np.array(labels)

In [102]:
image_folder = "train"

In [103]:
X, y = load_images_and_labels(image_folder, data, image_size)

Загрузка изображений: 100%|██████████| 19906/19906 [00:09<00:00, 2051.80it/s]


In [104]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [105]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [106]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [107]:
model = LinearRegression()
model.fit(X_train, y_train)

In [108]:
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)
y_test_pred = model.predict(X_test)

In [113]:
train_mseLR = mean_squared_error(y_train, y_train_pred)
train_r2LR = r2_score(y_train, y_train_pred)
accuracyLR = accuracy_score(y_train, np.round(y_train_pred))

valid_mseLR = mean_squared_error(y_valid, y_valid_pred)
valid_r2LR = r2_score(y_valid, y_valid_pred)

test_mseLR = mean_squared_error(y_test, y_test_pred)
test_r2LR = r2_score(y_test, y_test_pred)

print(f"Точность модели LR (test): {accuracyLR:.2f}")
print(f"Test MSE LR: {test_mseLR:.4f}, Test R^2 LR: {test_r2LR:.4f}")


Точность модели LR (test): 0.44
Test MSE LR: 0.5230, Test R^2 LR: 0.3755


## _____________________________________________________________________________________________________________

## KNN

In [114]:
csv_path = "train.csv"
data_dir = "train"

In [115]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
df = pd.read_csv(csv_path)
label_encoder = LabelEncoder()
df['Class'] = label_encoder.fit_transform(df['Class'])

In [116]:
def extract_features(image_path):
    try:
        image = Image.open(image_path).convert('RGB')
        image = image.resize((64, 64))
        features = np.array(image).flatten()
        return features
    except Exception as e:
        print(f"Ошибка при обработке {image_path}: {e}")
        return None

In [117]:
X = []
y = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    image_id, label = row['ID'], row['Class']
    image_path = os.path.join(data_dir, image_id)
    features = extract_features(image_path)
    if features is not None and len(features) == 64 * 64 * 3:  # Проверка размера изображения
        X.append(features)
        y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Общее количество данных: {len(X)}")
print(f"Форма массива признаков: {X.shape}")
print(f"Форма меток: {y.shape}")

100%|██████████| 19906/19906 [00:11<00:00, 1801.55it/s]


Общее количество данных: 19906
Форма массива признаков: (19906, 12288)
Форма меток: (19906,)


In [118]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Форма обучающей выборки: {X_train.shape}")
print(f"Форма валидационной выборки: {X_valid.shape}")
print(f"Форма тестовой выборки: {X_test.shape}")

Форма обучающей выборки: (15924, 12288)
Форма валидационной выборки: (1991, 12288)
Форма тестовой выборки: (1991, 12288)


In [119]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [120]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train, y_train)

In [121]:
# Предсказания
y_train_pred = knn.predict(X_train)
y_valid_pred = knn.predict(X_valid)
y_test_pred = knn.predict(X_test)

In [123]:
# Оценка модели
train_mseKNN = mean_squared_error(y_train, y_train_pred)
train_r2KNN = r2_score(y_train, y_train_pred)

valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

test_mseKNN = mean_squared_error(y_test, y_test_pred)
test_r2KNN = r2_score(y_test, y_test_pred)

accuracyKNN = accuracy_score(y_train, y_train_pred)

# Вывод результатов
print(f"Точность модели KNN: {accuracyKNN:.2f}")
print(f"Test MSE: {test_mseKNN:.4f}, Train R^2: {test_r2KNN:.4f}")

Точность модели KNN: 1.00
Test MSE: 0.0003, Train R^2: 0.9996


## _________________________________________________________________________________________________________

## Вывод

In [125]:
data = {
    "Модель": ["Нейронная сеть (СNN)", "Случайный лес (RF)", "Случайный лес (RF) Upd", "Линейная регрессия (LR)", "KNN"],
    "Точность": [accuracyCNN, accuracyRF,accuracyRFUP, accuracyLR, accuracyKNN],
    "MSE": [test_mseCNN, test_mseRF, test_mseRFUP, test_mseLR, test_mseKNN],
    "R²": [test_r2CNN, test_r2RF, test_rRFUP, test_r2LR, test_r2KNN]
    }


df = pd.DataFrame(data)

print("\nСравнение моделей:")
print(df)


Сравнение моделей:
                    Модель  Точность       MSE        R²
0     Нейронная сеть (СNN)  0.859897  0.372771  0.554920
1       Случайный лес (RF)  0.999900  0.000100  0.999880
2   Случайный лес (RF) Upd  0.840000  0.373254  0.554034
3  Линейная регрессия (LR)  0.438646  0.523001  0.375548
4                      KNN  0.999874  0.000314  0.999625
