<a href="https://colab.research.google.com/github/net39/ML-anomaly-detection/blob/main/P2_02_SVMPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load libraries and set up access to GCS

In [1]:
!pip install google-cloud-storage --quiet
!pip install opencv-python-headless --quiet
!pip install joblib --quiet


In [2]:
import os
import cv2
import numpy as np
import pandas as pd
from io import StringIO
from sklearn.preprocessing import LabelEncoder
import joblib
from tqdm import tqdm

In [3]:
# Colab-only: ask the Colab runtime to authenticate the current user.
# NOTE(review): the following cell also sets GOOGLE_APPLICATION_CREDENTIALS to
# an uploaded key file — confirm whether both mechanisms are really needed.
from google.colab import auth
auth.authenticate_user()

In [4]:
# Mount Google Drive; a later cell copies the notebook file from it.
from google.colab import drive
drive.mount('/content/drive')

# Key upload: opens Colab's upload dialog. The key JSON is expected to be
# saved into the working directory so that KEY_PATH below resolves.
from google.colab import files
files.upload()

from google.cloud import storage
import os

# Path to the uploaded key
KEY_PATH = '/content/p2-anomaly-c4545180e308.json'
if not os.path.isfile(KEY_PATH):
    # Fail here with an actionable message instead of letting storage.Client()
    # raise an opaque credentials error when the upload was cancelled or the
    # file was saved under a different name.
    raise FileNotFoundError(
        f"Key file not found at {KEY_PATH}. "
        "Re-run this cell and upload the key file."
    )
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = KEY_PATH

# Client and bucket handle used by the rest of the notebook
storage_client = storage.Client()
bucket_name = 'p2-anomaly'
bucket = storage_client.bucket(bucket_name)

Mounted at /content/drive


Saving p2-anomaly-c4545180e308.json to p2-anomaly-c4545180e308.json


# Resize and flatten - Train & Test images

In [5]:
# Target image geometry (kept identical for the SVM features and the later CNN):
# height, width and number of colour channels.
IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS = 224, 224, 3

In [6]:
def _read_gcs_csv(blob_path):
    """Download the CSV object at `blob_path` from `bucket` and parse it into a DataFrame."""
    csv_text = bucket.blob(blob_path).download_as_text()
    return pd.read_csv(StringIO(csv_text))


# Training and testing manifests (Training_set.csv / Testing_set.csv)
train_df = _read_gcs_csv('raw/images/Training_set.csv')
test_df = _read_gcs_csv('raw/images/Testing_set.csv')

In [7]:
# Fit the label encoder on the training labels and store the integer codes
# next to the original labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(train_df['label'])
train_df['encoded_label'] = encoded_labels

# Persist the fitted encoder locally, then mirror the file to GCS.
ENCODER_FILE = 'label_encoder_svm.joblib'
joblib.dump(label_encoder, ENCODER_FILE)
bucket.blob('processed/labels/label_encoder_svm.joblib').upload_from_filename(ENCODER_FILE)

In [8]:
# Image preprocess definition
def preprocess_image_gcs(blob_path):
    """Download one image from GCS and convert it to a flat feature vector.

    Args:
        blob_path: Object path of the image inside `bucket`.

    Returns:
        The image decoded with cv2.IMREAD_COLOR, resized to
        (IMG_WIDTH, IMG_HEIGHT) and flattened to 1-D, or None when the image
        cannot be downloaded, decoded or resized. Failures are printed, not
        raised, so a single bad file does not abort a long preprocessing run.
    """
    try:
        img_bytes = bucket.blob(blob_path).download_as_bytes()
        img_arr = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
        if img_arr is None:
            # cv2.imdecode reports undecodable data by returning None rather
            # than raising; handle it explicitly instead of relying on
            # cv2.resize to blow up with an unrelated-looking error.
            print(f"Failed to process {blob_path}: image data could not be decoded")
            return None
        img_resized = cv2.resize(img_arr, (IMG_WIDTH, IMG_HEIGHT))
        return img_resized.flatten()
    except Exception as e:
        # Deliberate best effort: report the failure and let the caller skip this image.
        print(f"Failed to process {blob_path}: {e}")
        return None


In [9]:
# Preprocess Train images: build feature vectors and labels in lockstep,
# skipping any image that could not be processed.
X_train, y_train = [], []

train_pairs = zip(train_df['filename'], train_df['encoded_label'])
for filename, label in tqdm(train_pairs, total=len(train_df)):
    features = preprocess_image_gcs(f'raw/images/train/{filename}')
    if features is None:
        continue
    X_train.append(features)
    y_train.append(label)

# Convert to numpy arrays
X_train, y_train = np.array(X_train), np.array(y_train)

# Save locally
for array_file, array in (('X_train_svm.npy', X_train), ('y_train_svm.npy', y_train)):
    np.save(array_file, array)

100%|██████████| 12600/12600 [13:30<00:00, 15.54it/s]


In [10]:
# Upload the locally saved training arrays to GCS: (local file, destination object)
train_uploads = (
    ('X_train_svm.npy', 'processed/resized_images/X_train_svm.npy'),
    ('y_train_svm.npy', 'processed/resized_images/y_train_svm.npy'),
)
for local_file, gcs_object in train_uploads:
    bucket.blob(gcs_object).upload_from_filename(local_file)

In [11]:
# Preprocess Test images: pair every filename with its feature vector
# (None when processing failed), then keep only the successful ones.
processed_test = [
    (filename, preprocess_image_gcs(f'raw/images/test/{filename}'))
    for filename in tqdm(test_df['filename'], total=len(test_df))
]
kept_test = [(filename, features) for filename, features in processed_test if features is not None]

filenames_test = [filename for filename, _ in kept_test]

# Convert to numpy arrays
X_test = np.array([features for _, features in kept_test])

# Save locally: the features plus the filenames they belong to, in the same order
np.save('X_test_svm.npy', X_test)
test_filenames_df = pd.DataFrame({'filename': filenames_test})
test_filenames_df.to_csv('test_filenames.csv', index=False)


100%|██████████| 5400/5400 [05:43<00:00, 15.70it/s]


In [12]:
# Upload the test features and their filename index to GCS
test_uploads = {
    'processed/resized_images/X_test_svm.npy': 'X_test_svm.npy',
    'processed/resized_images/test_filenames.csv': 'test_filenames.csv',
}
for gcs_object, local_file in test_uploads.items():
    bucket.blob(gcs_object).upload_from_filename(local_file)

# Upload Notebook to GCS

In [14]:
# Copy the notebook file out of the mounted Drive into /content so it can be
# uploaded from the local filesystem.
!cp "/content/drive/MyDrive/Colab Notebooks/P2-02_SVMPreprocess.ipynb" "/content/P2-02_SVMPreprocess.ipynb"

# NOTE: this creates a second client (with an explicit project) and rebinds
# `bucket` to a handle obtained from it; the following cell uploads through
# this rebound `bucket`.
client = storage.Client(project='p2-anomaly')
bucket = client.bucket('p2-anomaly')
# Store the notebook under the notebooks/ prefix of the bucket
notebook_blob = bucket.blob('notebooks/P2-02_SVMPreprocess.ipynb')
notebook_blob.upload_from_filename('/content/P2-02_SVMPreprocess.ipynb')
print("Notebook pushed to GCS.")

Notebook pushed to GCS.


In [16]:
!jupyter nbconvert --to script "/content/P2-02_SVMPreprocess.ipynb" --output "/content/P2-02_SVMPreprocess"
!mv /content/P2-02_SVMPreprocess.txt /content/P2-02_SVMPreprocess.py
local_script_path = '/content/P2-02_SVMPreprocess.py'
gcs_script_path = '/content/P2-02_SVMPreprocess.py'

# Upload to GCS
blob = bucket.blob(gcs_script_path)
blob.upload_from_filename(local_script_path)

print("Script uploaded to GCS.")

[NbConvertApp] Converting notebook /content/P2-02_SVMPreprocess.ipynb to script
[NbConvertApp] Writing 4385 bytes to /content/P2-02_SVMPreprocess.txt
Script uploaded to GCS.
