---
title: "Baseline Model: Stat 362 Final Project"
format:
  html:
    toc: true
    toc-title: Contents
    toc-depth: 4
    code-fold: show
    self-contained: true
jupyter: python3
---

In [2]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf


In [3]:
# Read the training table and the test table from the data/ folder,
# training file first.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow import keras


def _resolve_test_path(image_id, subdir='test_data_v2', root='data'):
    """Return the on-disk path for one entry of the test CSV's 'id' column.

    Parameters
    ----------
    image_id : str
        Value taken from the 'id' column.
    subdir : str
        Folder (below ``root``) expected to hold the test images.
    root : str
        Top-level data folder.

    Returns
    -------
    str
        ``root/subdir/<file>``. ``subdir`` is prepended only when ``image_id``
        does not already start with it, so an id such as
        'test_data_v2/abc.jpg' is not expanded to
        'data/test_data_v2/test_data_v2/abc.jpg'.
    """
    # Accept both '/' (as usually written in CSV files) and the native separator.
    already_prefixed = image_id.startswith((subdir + '/', subdir + os.sep))
    relative = image_id if already_prefixed else os.path.join(subdir, image_id)
    return os.path.join(root, relative)


# Re-read both tables so the path columns built below always start from the
# raw CSV values, even when this cell is executed more than once.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Training file names are stored relative to the data/ folder.
train['file_name'] = train['file_name'].apply(lambda x: os.path.join('data', x))
if 'file_name' in test.columns:
    test['file_name'] = test['file_name'].apply(lambda x: os.path.join('data', x))
else:
    # test CSV has column 'id' instead.
    # NOTE(review): the recorded run found 0 test images, which is what a
    # doubled 'test_data_v2/' prefix would produce; the helper adds the folder
    # only when the id lacks it. Confirm against the actual contents of test.csv.
    test['file_name'] = test['id'].apply(_resolve_test_path)


# Hold out 20% of the labelled rows for validation. The split is stratified on
# the label column and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=train['label'])
train_df, val_df = train_test_split(train, **split_options)

def create_generators(train_df, val_df, test_df,
                      target_size=(224, 224), batch_size=32, seed=None):
    """Build Keras image iterators for the training, validation and test frames.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame
        Frames with a 'file_name' column (image path) and a 'label' column.
    test_df : pandas.DataFrame
        Frame with a 'file_name' column; no labels are read from it.
    target_size : tuple of int, optional
        (height, width) every image is resized to. Defaults to (224, 224).
    batch_size : int, optional
        Number of images per batch for all three iterators. Defaults to 32.
    seed : int or None, optional
        Forwarded to the training iterator's ``flow_from_dataframe`` call so
        the shuffle order can be made repeatable. ``None`` keeps it unseeded.

    Returns
    -------
    tuple
        ``(train_generator, val_generator, test_generator)``.
    """
    # Augmentation is applied to the training images only.
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    # Validation and test images are only rescaled to [0, 1].
    test_val_datagen = ImageDataGenerator(rescale=1./255)

    # Settings shared by all three iterators, kept in one place so they cannot
    # drift apart.
    shared = dict(x_col='file_name', target_size=target_size, batch_size=batch_size)

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        y_col='label',
        class_mode='raw',
        shuffle=True,
        seed=seed,
        **shared
    )
    # shuffle=False keeps batches in the same row order as the input frame.
    val_generator = test_val_datagen.flow_from_dataframe(
        dataframe=val_df,
        y_col='label',
        class_mode='raw',
        shuffle=False,
        **shared
    )
    # No labels for the test frame: the iterator yields image batches only.
    test_generator = test_val_datagen.flow_from_dataframe(
        dataframe=test_df,
        y_col=None,
        class_mode=None,
        shuffle=False,
        **shared
    )

    return train_generator, val_generator, test_generator

# Build the three iterators: the stratified train/validation frames plus the
# full test frame.
train_gen, val_gen, test_gen = create_generators(train_df, val_df, test)

def create_model():
    """Assemble and compile the baseline convolutional classifier.

    The network takes 224x224 RGB input and applies three Conv2D + MaxPooling2D
    stages with 32, 64 and 128 filters. The result is flattened, passed through
    a 128-unit ReLU layer, and ends in a single sigmoid unit.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        an accuracy metric.
    """
    conv_filters = (32, 64, 128)

    stack = [keras.layers.Input(shape=(224, 224, 3))]
    for n_filters in conv_filters:
        # One stage: 3x3 same-padded convolution, then 2x2 max pooling.
        stack.append(
            keras.layers.Conv2D(n_filters, (3,3), activation="relu", padding="same")
        )
        stack.append(keras.layers.MaxPooling2D(2,2))
    stack.extend([
        keras.layers.Flatten(),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid"),   # binary output
    ])

    network = keras.models.Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return network

# Instantiate the baseline network and print its layer summary.
model = create_model()
model.summary()
# NOTE(review): count_params() is presumably the model's total weight count;
# it matches the "Trainable" label only while no layer is frozen — confirm.
print("Trainable params:", model.count_params())

# Train for a fixed number of epochs, with the validation iterator passed as
# validation data.
EPOCHS = 20
fit_options = dict(epochs=EPOCHS, verbose=1, validation_data=val_gen)
history = model.fit(train_gen, **fit_options)


Found 63960 validated image filenames.
Found 15990 validated image filenames.
Found 0 validated image filenames.




Trainable params: 12938561
Epoch 1/20
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 160ms/step - accuracy: 0.7998 - loss: 0.5040 - val_accuracy: 0.7702 - val_loss: 0.6575
Epoch 2/20
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 158ms/step - accuracy: 0.7238 - loss: 0.7249 - val_accuracy: 0.7673 - val_loss: 0.6147
Epoch 3/20
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 158ms/step - accuracy: 0.7225 - loss: 0.6343 - val_accuracy: 0.7829 - val_loss: 0.5767
Epoch 4/20
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 158ms/step - accuracy: 0.7382 - loss: 0.5987 - val_accuracy: 0.7729 - val_loss: 0.6144
Epoch 5/20
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 160ms/step - accuracy: 0.7485 - loss: 0.5710 - val_accuracy: 0.7896 - val_loss: 0.5870
Epoch 6/20
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 164ms/step - accuracy: 0.7571 - loss: 0.5596 - val_a

In [12]:
# Score the current model on the validation iterator and report both metrics.
val_loss, val_acc = model.evaluate(val_gen, verbose=0)
for label, value in (
    ('Initial validation loss', val_loss),
    ('Initial validation accuracy', val_acc),
):
    print(label, value)



Initial validation loss 0.5149950385093689
Initial validation accuracy 0.8222013711929321
